Merge remote-tracking branch 'origin' into integration-2

Yatsishin Ilya 2021-03-09 10:33:15 +03:00
commit 7bfa4bb64b
125 changed files with 3990 additions and 585 deletions

View File

@ -1,9 +1,9 @@
# This strings autochanged from release_lib.sh:
SET(VERSION_REVISION 54448)
SET(VERSION_REVISION 54449)
SET(VERSION_MAJOR 21)
SET(VERSION_MINOR 3)
SET(VERSION_MINOR 4)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH ef72ba7349f230321750c13ee63b49a11a7c0adc)
SET(VERSION_DESCRIBE v21.3.1.1-prestable)
SET(VERSION_STRING 21.3.1.1)
SET(VERSION_GITHASH af2135ef9dc72f16fa4f229b731262c3f0a8bbdc)
SET(VERSION_DESCRIBE v21.4.1.1-prestable)
SET(VERSION_STRING 21.4.1.1)
# end of autochange

debian/changelog vendored
View File

@ -1,5 +1,5 @@
clickhouse (21.3.1.1) unstable; urgency=low
clickhouse (21.4.1.1) unstable; urgency=low
* Modified source code
-- clickhouse-release <clickhouse-release@yandex-team.ru> Mon, 01 Feb 2021 12:50:53 +0300
-- clickhouse-release <clickhouse-release@yandex-team.ru> Sat, 06 Mar 2021 14:43:27 +0300

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update \
&& apt-get install --yes --no-install-recommends \

View File

@ -1,7 +1,7 @@
FROM ubuntu:20.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
ARG gosu_ver=1.10
# user/group precreated explicitly with fixed uid/gid on purpose.

View File

@ -1,7 +1,7 @@
FROM ubuntu:18.04
ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
ARG version=21.3.1.*
ARG version=21.4.1.*
RUN apt-get update && \
apt-get install -y apt-transport-https dirmngr && \

View File

@ -4,4 +4,7 @@
<!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
<users_config>users.xml</users_config>
</yandex>
<!-- Path to directory where users created by SQL commands are stored. -->
<access_control_path>access/</access_control_path>
</yandex>

View File

@ -31,6 +31,12 @@ Also it accept the following settings:
- `fsync_directories` - do the `fsync` for directories. Guarantees that the OS refreshed directory metadata after operations related to asynchronous inserts on Distributed table (after insert, after sending the data to shard, etc).
- `bytes_to_throw_insert` - if more than this number of compressed bytes is pending for asynchronous INSERT, an exception is thrown. 0 - do not throw. Default 0.
- `bytes_to_delay_insert` - if more than this number of compressed bytes is pending for asynchronous INSERT, the query is delayed. 0 - do not delay. Default 0.
- `max_delay_to_insert` - maximum delay, in seconds, of inserting data into a Distributed table when there are many pending bytes for asynchronous sending. Default 60.
!!! note "Note"
**Durability settings** (`fsync_...`):
@ -39,6 +45,12 @@ Also it accept the following settings:
- May significantly decrease the inserts' performance
- Affect writing the data stored inside Distributed table folder into the **node which accepted your insert**. If you need to have guarantees of writing data to underlying MergeTree tables - see durability settings (`...fsync...`) in `system.merge_tree_settings`
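Those MergeTree-side durability settings can be inspected directly; a minimal sketch:
``` sql
SELECT name, value
FROM system.merge_tree_settings
WHERE name LIKE '%fsync%';
```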
For **Insert limit settings** (`..._insert`) see also:
- [insert_distributed_sync](../../../operations/settings/settings.md#insert_distributed_sync) setting
- [prefer_localhost_replica](../../../operations/settings/settings.md#settings-prefer-localhost-replica) setting
- `bytes_to_throw_insert` is handled before `bytes_to_delay_insert`, so you should not set it to a value less than `bytes_to_delay_insert`
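A minimal sketch of how these limits might be attached to a Distributed table (the cluster, database, and table names here are hypothetical):
``` sql
CREATE TABLE dist_hits AS hits
ENGINE = Distributed(my_cluster, default, hits, rand())
SETTINGS
    bytes_to_throw_insert = 10000000000,
    bytes_to_delay_insert = 1000000000,
    max_delay_to_insert = 60;
```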
Example:
``` sql

View File

@ -7,9 +7,9 @@ toc_title: Distinctive Features
## True Column-Oriented Database Management System {#true-column-oriented-dbms}
In a true column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. As an example, a billion UInt8-type values should consume around 1 GB uncompressed, or this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
In a real column-oriented DBMS, no extra data is stored with the values. Among other things, this means that constant-length values must be supported, to avoid storing their length “number” next to the values. For example, a billion UInt8-type values should consume around 1 GB uncompressed, otherwise this strongly affects the CPU use. It is essential to store data compactly (without any “garbage”) even when uncompressed, since the speed of decompression (CPU usage) depends mainly on the volume of uncompressed data.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. In these systems, you would get throughput around a hundred thousand rows per second, but not hundreds of millions of rows per second.
It is worth noting because there are systems that can store values of different columns separately, but that can't effectively process analytical queries due to their optimization for other scenarios. Examples are HBase, BigTable, Cassandra, and HyperTable. You would get throughput around a hundred thousand rows per second in these systems, but not hundreds of millions of rows per second.
It's also worth noting that ClickHouse is a database management system, not a single database. ClickHouse allows creating tables and databases in runtime, loading data, and running queries without reconfiguring and restarting the server.

View File

@ -5,74 +5,40 @@ toc_title: OpenTelemetry Support
# [experimental] OpenTelemetry Support
[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting
traces and metrics from distributed application. ClickHouse has some support
for OpenTelemetry.
[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from a distributed application. ClickHouse has some support for OpenTelemetry.
!!! warning "Warning"
This is an experimental feature that will change in backwards-incompatible ways in the future releases.
This is an experimental feature that will change in backwards-incompatible ways in future releases.
## Supplying Trace Context to ClickHouse
ClickHouse accepts trace context HTTP headers, as described by
the [W3C recommendation](https://www.w3.org/TR/trace-context/).
It also accepts trace context over native protocol that is used for
communication between ClickHouse servers or between the client and server.
For manual testing, trace context headers conforming to the Trace Context
recommendation can be supplied to `clickhouse-client` using
`--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no parent trace context is supplied, ClickHouse can start a new trace, with
probability controlled by the `opentelemetry_start_trace_probability` setting.
ClickHouse accepts trace context HTTP headers, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts trace context over a native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, trace context headers conforming to the Trace Context recommendation can be supplied to `clickhouse-client` using `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no parent trace context is supplied, ClickHouse can start a new trace, with probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting.
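For a quick test, a hedged sketch of forcing tracing for the current session via that setting (assuming it can be changed at the session level):
``` sql
SET opentelemetry_start_trace_probability = 1;
SELECT 1; -- this query should now produce spans in system.opentelemetry_span_log
```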
## Propagating the Trace Context
The trace context is propagated to downstream services in the following cases:
* Queries to remote ClickHouse servers, such as when using `Distributed` table
engine.
* `URL` table function. Trace context information is sent in HTTP headers.
* Queries to remote ClickHouse servers, such as when using [Distributed](../engines/table-engines/special/distributed.md) table engine.
* [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers.
## Tracing the ClickHouse Itself
ClickHouse creates _trace spans_ for each query and some of the query execution
stages, such as query planning or distributed queries.
ClickHouse creates `trace spans` for each query and some of the query execution stages, such as query planning or distributed queries.
To be useful, the tracing information has to be exported to a monitoring system
that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids
a dependency on a particular monitoring system, instead only providing the
tracing data through a system table. OpenTelemetry trace span information
[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span)
is stored in the `system.opentelemetry_span_log` table.
To be useful, the tracing information has to be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse avoids a dependency on a particular monitoring system, instead only providing the tracing data through a system table. OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table.
The table must be enabled in the server configuration, see the `opentelemetry_span_log`
element in the default config file `config.xml`. It is enabled by default.
The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default.
The table has the following columns:
- `trace_id`
- `span_id`
- `parent_span_id`
- `operation_name`
- `start_time`
- `finish_time`
- `finish_date`
- `attribute.name`
- `attribute.values`
The tags or attributes are saved as two parallel arrays, containing the keys
and values. Use `ARRAY JOIN` to work with them.
The tags or attributes are saved as two parallel arrays, containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them.
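A hedged sketch of that pattern, using the column names listed above (exact names may differ between versions):
``` sql
SELECT
    operation_name,
    attr_key,
    attr_value
FROM system.opentelemetry_span_log
ARRAY JOIN
    `attribute.name` AS attr_key,
    `attribute.values` AS attr_value
LIMIT 10;
```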
## Integration with monitoring systems
At the moment, there is no ready tool that can export the tracing data from
ClickHouse to a monitoring system.
At the moment, there is no ready tool that can export the tracing data from ClickHouse to a monitoring system.
For testing, it is possible to setup the export using a materialized view with the URL engine over the `system.opentelemetry_span_log` table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format:
For testing, it is possible to set up the export using a materialized view with the [URL](../engines/table-engines/special/url.md) engine over the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) table, which would push the arriving log data to an HTTP endpoint of a trace collector. For example, to push the minimal span data to a Zipkin instance running at `http://localhost:9411`, in Zipkin v2 JSON format:
```sql
CREATE MATERIALIZED VIEW default.zipkin_spans
@ -94,3 +60,5 @@ FROM system.opentelemetry_span_log
```
In case of any errors, the part of the log data for which the error has occurred will be silently lost. Check the server log for error messages if the data does not arrive.
[Original article](https://clickhouse.tech/docs/en/operations/opentelemetry/) <!--hide-->

View File

@ -17,7 +17,6 @@ The `system.part_log` table contains the following columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.

View File

@ -19,15 +19,17 @@ Resolution: 1 second.
## Usage Remarks {#usage-remarks}
The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. Additionally, the `DateTime` type can store time zone that is the same for the entire column, that affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (2020-01-01 05:00:01). The time zone is not stored in the rows of the table (or in resultset), but is stored in the column metadata.
A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones).
The `tzdata` package, containing [IANA Time Zone Database](https://www.iana.org/time-zones), should be installed in the system. Use the `timedatectl list-timezones` command to list timezones known by a local system.
The point in time is saved as a [Unix timestamp](https://en.wikipedia.org/wiki/Unix_time), regardless of the time zone or daylight saving time. The time zone affects how the values of the `DateTime` type values are displayed in text format and how the values specified as strings are parsed (2020-01-01 05:00:01).
You can explicitly set a time zone for `DateTime`-type columns when creating a table. If the time zone isn't set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start.
A timezone-agnostic Unix timestamp is stored in tables, and the time zone is used to transform it to text format and back during data import/export, or to make calendar calculations on the values (for example, the `toDate` and `toHour` functions). The time zone is not stored in the rows of the table (or in the result set), but in the column metadata.
A list of supported time zones can be found in the [IANA Time Zone Database](https://www.iana.org/time-zones) and also can be queried by `SELECT * FROM system.time_zones`.
You can explicitly set a time zone for `DateTime`-type columns when creating a table. Example: `DateTime('UTC')`. If the time zone isn't set, ClickHouse uses the value of the [timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) parameter in the server settings or the operating system settings at the moment of the ClickHouse server start.
The [clickhouse-client](../../interfaces/cli.md) applies the server time zone by default if a time zone isn't explicitly set when initializing the data type. To use the client time zone, run `clickhouse-client` with the `--use_client_time_zone` parameter.
ClickHouse outputs values depending on the value of the [date\_time\_output\_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
ClickHouse outputs values depending on the value of the [date_time_output_format](../../operations/settings/settings.md#settings-date_time_output_format) setting. `YYYY-MM-DD hh:mm:ss` text format by default. Additionally, you can change the output with the [formatDateTime](../../sql-reference/functions/date-time-functions.md#formatdatetime) function.
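A small sketch of overriding the default output format with `formatDateTime` (the format string here is only an illustration):
``` sql
SELECT
    now() AS server_time,
    formatDateTime(now(), '%d.%m.%Y %H:%M:%S') AS formatted;
```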
When inserting data into ClickHouse, you can use different formats of date and time strings, depending on the value of the [date_time_input_format](../../operations/settings/settings.md#settings-date_time_input_format) setting.
@ -114,6 +116,24 @@ FROM dt
└─────────────────────┴─────────────────────┘
```
As timezone conversion only changes the metadata, the operation has no computation cost.
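As a quick illustration of that point (a sketch; the timestamp is arbitrary), converting the time zone does not change the stored Unix timestamp:
``` sql
SELECT
    toDateTime('2021-03-09 10:33:15', 'UTC') AS time_utc,
    toTimeZone(time_utc, 'Europe/Moscow') AS time_msk,
    toUnixTimestamp(time_utc) = toUnixTimestamp(time_msk) AS same_timestamp;
```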
## Limitations on timezones support
Some timezones may not be supported completely. There are a few cases:
If the offset from UTC is not a multiple of 15 minutes, the calculation of hours and minutes can be incorrect. For example, the time zone in Monrovia, Liberia had an offset of UTC -0:44:30 before 7 Jan 1972. If you do calculations on historical times in the Monrovia time zone, the time processing functions may give incorrect results. The results after 7 Jan 1972 will nevertheless be correct.
If a time transition (due to daylight saving time or for other reasons) was performed at a point in time that is not a multiple of 15 minutes, you can also get incorrect results on that specific day.
Non-monotonic calendar dates. For example, in Happy Valley - Goose Bay, the time was moved one hour backwards at 00:01:00 on 7 Nov 2010 (one minute after midnight). So after 6 Nov ended, people observed a whole minute of 7 Nov, then the time was changed back to 23:01 on 6 Nov, and after another 59 minutes 7 Nov started again. ClickHouse does not (yet) support this kind of fun. During these days the results of time processing functions may be slightly incorrect.
A similar issue exists for the Casey Antarctic station in 2010. They moved the time three hours back at 02:00 on 5 Mar. If you are working at an Antarctic station, don't be afraid to use ClickHouse. Just make sure you set the time zone to UTC or be aware of the inaccuracies.
Time shifts of multiple days. Some Pacific islands changed their time zone offset from UTC+14 to UTC-12. That's alright, but some inaccuracies may be present if you do calculations with their time zone for historical points in time around the days of the conversion.
## See Also {#see-also}
- [Type conversion functions](../../sql-reference/functions/type-conversion-functions.md)

View File

@ -538,7 +538,7 @@ Result:
## date\_sub {#date_sub}
Subtracts a time/date interval from the provided date.
Subtracts the time interval or date interval from the provided date or date with time.
**Syntax**
@ -546,19 +546,30 @@ Subtracts a time/date interval from the provided date.
date_sub(unit, value, date)
```
Aliases: `dateSub`, `DATE_SUB`.
Aliases: `dateSub`, `DATE_SUB`.
**Arguments**
- `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md).
Possible values:
Supported values: second, minute, hour, day, week, month, quarter, year.
- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md)
- `date` — [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md) to subtract value from.
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — Value of interval to subtract. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — The date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
Returns Date or DateTime with `value` expressed in `unit` subtracted from `date`.
Returns the date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`.
Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Example**

View File

@ -121,24 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address has an invalid format, it returns a string of null bytes.
If the input string contains a valid IPv4 address, returns its IPv6 equivalent.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
**Syntax**
``` sql
SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0);
IPv6StringToNum(string)
```
``` text
┌─cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0)─┐
│ ::ffff:127.0.0.1 │
└─────────────────────────────────────────────┘
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- IPv6 address in binary format.
Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Example**
Query:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Result:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**See also**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
Takes a `UInt32` number. Interprets it as an IPv4 address in [big endian](https://en.wikipedia.org/wiki/Endianness). Returns a `FixedString(16)` value containing the IPv6 address in binary format. Examples:
@ -232,43 +261,56 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
An alias to `IPv6StringToNum()` that takes a string form of IPv6 address and returns value of [IPv6](../../sql-reference/data-types/domains/ipv6.md) type, which is binary equal to value returned by `IPv6StringToNum()`.
If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned.
Converts a string form of IPv6 address to [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value.
Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned.
**Syntax**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md)
**Returned value**
- IP address.
Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Examples**
Query:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Result:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Query:
``` sql
SELECT toIPv6('127.0.0.1')
SELECT toIPv6('127.0.0.1');
```
Result:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │

View File

@ -728,7 +728,7 @@ The result of the function depends on the affected data blocks and the order of
It can reach the neighbor rows only inside the currently processed data block.
The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user.
To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery.
To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
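A minimal sketch of that pattern (`numbers` is used here only for illustration):
``` sql
SELECT
    number,
    neighbor(number, 1) AS next_value
FROM
(
    SELECT number
    FROM numbers(10)
    ORDER BY number
);
```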
**Arguments**
@ -834,12 +834,12 @@ Calculates the difference between successive row values in the data block.
Returns 0 for the first row and the difference from the previous row for each subsequent row.
!!! warning "Warning"
It can reach the previos row only inside the currently processed data block.
It can reach the previous row only inside the currently processed data block.
The result of the function depends on the affected data blocks and the order of data in the block.
The rows order used during the calculation of `runningDifference` can differ from the order of rows returned to the user.
To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery.
To prevent that you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
Example:

View File

@ -17,7 +17,7 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin
- If `OPTIMIZE` doesn't perform a merge for any reason, it doesn't notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](../../sql-reference/statements/alter/index.md#alter-how-to-specify-part-expr).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. The merge is also forced even if concurrent merges are in progress (see the sketch after this list).
- If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine.
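A sketch of the combinations described above (the table name and partition expression are hypothetical):
``` sql
-- Force a merge even if the data is already in one part, and deduplicate identical rows.
OPTIMIZE TABLE hits FINAL DEDUPLICATE;

-- Optimize only one partition of a table partitioned by month.
OPTIMIZE TABLE hits PARTITION 202103 FINAL;
```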
!!! warning "Warning"

View File

@ -0,0 +1,37 @@
---
toc_priority: 62
toc_title: OpenTelemetry Support
---
# [experimental] OpenTelemetry Support
ClickHouse supports [OpenTelemetry](https://opentelemetry.io/), an open standard for collecting traces and metrics from a distributed application.
!!! warning "Warning"
    Support for the standard is experimental and will change over time.
## Supplying Trace Context to ClickHouse
ClickHouse accepts trace context information via the `tracecontext` HTTP header, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts context information over the native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, a standard `tracecontext` header containing the trace context can be passed to `clickhouse-client` via the `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags.
If no incoming trace context is supplied, ClickHouse can start a trace with a probability controlled by the [opentelemetry_start_trace_probability](../operations/settings/settings.md#opentelemetry-start-trace-probability) setting.
## Propagating the Trace Context
The trace context is propagated to downstream services in the following cases:
* Queries to remote ClickHouse servers, for example when using the [Distributed](../engines/table-engines/special/distributed.md) table engine.
* The [url](../sql-reference/table-functions/url.md) table function. Trace context information is sent in HTTP headers.
## How ClickHouse Performs Tracing
ClickHouse creates `trace spans` for each query and for some query execution stages, such as query planning or distributed queries.
To analyze the tracing information, it should be exported to a monitoring system that supports OpenTelemetry, such as [Jaeger](https://jaegertracing.io/) or [Prometheus](https://prometheus.io/). ClickHouse does not depend on a particular monitoring system; instead, it provides the tracing data only through a system table. The OpenTelemetry trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the [system.opentelemetry_span_log](../operations/system-tables/opentelemetry_span_log.md) system table.
The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the `config.xml` configuration file. It is enabled by default.
The tags or attributes are saved as two parallel arrays containing the keys and values. Use [ARRAY JOIN](../sql-reference/statements/select/array-join.md) to work with them.
[Original article](https://clickhouse.tech/docs/ru/operations/opentelemetry/) <!--hide-->

View File

@ -16,6 +16,7 @@
- `MOVE_PART` — Moving a data part between disks.
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
- `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
- `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
- `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.
@ -47,6 +48,7 @@ query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31
event_type: NewPart
event_date: 2021-02-02
event_time: 2021-02-02 11:14:28
event_time_microseconds: 2021-02-02 11:14:28.861919
duration_ms: 35
database: default
table: log_mt_2

View File

@ -23,8 +23,6 @@ SELECT
└─────────────────────┴────────────┴────────────┴─────────────────────┘
```
Only time zones that differ from UTC by an integer number of hours are supported.
## toTimeZone {#totimezone}
Converts a date or date with time to the specified time zone. The time zone is an attribute of the Date/DateTime types; the internal value (number of seconds) of the table field or result column does not change, only the type of the field changes, and its text representation changes accordingly.
@ -446,9 +444,9 @@ date_trunc(unit, value[, timezone])
Alias: `dateTrunc`.
**Parameters**
**Arguments**
- `unit` — The name of the part of the date or time. [String](../syntax.md#syntax-string-literal).
- `unit` — the name of the part of the date or time. [String Literal](../syntax.md#syntax-string-literal).
Possible values:
- `second`
@ -460,8 +458,8 @@ date_trunc(unit, value[, timezone])
- `quarter`
- `year`
- `value`Дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [Часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
- `value`дата и время. [DateTime](../../sql-reference/data-types/datetime.md) или [DateTime64](../../sql-reference/data-types/datetime64.md).
- `timezone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) для возвращаемого значения (необязательно). Если параметр не задан, используется часовой пояс параметра `value`. [String](../../sql-reference/data-types/string.md)
**Возвращаемое значение**
@ -618,6 +616,57 @@ SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-0
└────────────────────────────────────────────────────────────────────────────────────────┘
```
## date\_sub {#date_sub}
Subtracts a time interval or date interval from the specified date or date with time.
**Syntax**
``` sql
date_sub(unit, value, date)
```
Aliases: `dateSub`, `DATE_SUB`.
**Arguments**
- `unit` — the unit of time in which the interval to subtract is specified. [String](../../sql-reference/data-types/string.md).
Possible values:
- `second`
- `minute`
- `hour`
- `day`
- `week`
- `month`
- `quarter`
- `year`
- `value` — the value of the interval to subtract. [Int](../../sql-reference/data-types/int-uint.md).
- `date` — the date or date with time from which `value` is subtracted. [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Returned value**
Returns the date or date with time obtained by subtracting `value`, expressed in `unit`, from `date`.
Type: [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md).
**Example**
Query:
``` sql
SELECT date_sub(YEAR, 3, toDate('2018-01-01'));
```
Result:
``` text
┌─minus(toDate('2018-01-01'), toIntervalYear(3))─┐
│ 2015-01-01 │
└────────────────────────────────────────────────┘
```
## timeSlot {#timeslot}
Rounds the time down to the half hour.

View File

@ -121,12 +121,53 @@ LIMIT 10
└────────────────────────────┴────────┘
```
## IPv6StringToNum(s) {#ipv6stringtonums}
## IPv6StringToNum {#ipv6stringtonums}
The reverse function of [IPv6NumToString](#ipv6numtostringx). If the IPv6 address is passed in an invalid format, it returns a string of null bytes.
If the IP address is a valid IPv4 address, the function returns its IPv6 equivalent.
The reverse function of IPv6NumToString. If the IPv6 address has an invalid format, it returns a string of null bytes.
HEX can be uppercase or lowercase.
Alias: `INET6_ATON`.
Alias: `INET6_ATON`.
**Syntax**
``` sql
IPv6StringToNum(string)
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md).
**Returned value**
- IPv6 address in binary format.
Type: [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
**Example**
Query:
``` sql
SELECT addr, cutIPv6(IPv6StringToNum(addr), 0, 0) FROM (SELECT ['notaddress', '127.0.0.1', '1111::ffff'] AS addr) ARRAY JOIN addr;
```
Result:
``` text
┌─addr───────┬─cutIPv6(IPv6StringToNum(addr), 0, 0)─┐
│ notaddress │ :: │
│ 127.0.0.1 │ ::ffff:127.0.0.1 │
│ 1111::ffff │ 1111::ffff │
└────────────┴──────────────────────────────────────┘
```
**See also**
- [cutIPv6](#cutipv6x-bytestocutforipv6-bytestocutforipv4).
## IPv4ToIPv6(x) {#ipv4toipv6x}
@ -221,38 +262,62 @@ SELECT
└───────────────────────────────────┴──────────────────────────┘
```
## toIPv6(string) {#toipv6string}
## toIPv6 {#toipv6string}
An alias of the `IPv6StringToNum()` function, which takes a string form of an IPv6 address and returns a value of the [IPv6](../../sql-reference/functions/ip-address-functions.md) type, binary equal to the value returned by `IPv6StringToNum()`.
Converts a string form of an IPv6 address to the [IPv6](../../sql-reference/data-types/domains/ipv6.md) type. Returns an empty value if the input string is not a valid IP address.
Similar to the [IPv6StringToNum](#ipv6stringtonums) function, which converts an IPv6 address to binary format.
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
SELECT
toTypeName(IPv6StringToNum(IPv6_string)),
toTypeName(toIPv6(IPv6_string))
If the input string contains a valid IPv4 address, the function returns its IPv6 equivalent.
**Syntax**
```sql
toIPv6(string)
```
``` text
┌─toTypeName(IPv6StringToNum(IPv6_string))─┬─toTypeName(toIPv6(IPv6_string))─┐
│ FixedString(16) │ IPv6 │
└──────────────────────────────────────────┴─────────────────────────────────┘
```
**Argument**
- `string` — IP address. [String](../../sql-reference/data-types/string.md)
**Returned value**
- IP address.
Type: [IPv6](../../sql-reference/data-types/domains/ipv6.md).
**Examples**
Query:
``` sql
WITH
'2001:438:ffff::407d:1bc1' as IPv6_string
WITH '2001:438:ffff::407d:1bc1' AS IPv6_string
SELECT
hex(IPv6StringToNum(IPv6_string)),
hex(toIPv6(IPv6_string))
hex(toIPv6(IPv6_string));
```
Result:
``` text
┌─hex(IPv6StringToNum(IPv6_string))─┬─hex(toIPv6(IPv6_string))─────────┐
│ 20010438FFFF000000000000407D1BC1 │ 20010438FFFF000000000000407D1BC1 │
└───────────────────────────────────┴──────────────────────────────────┘
```
Query:
``` sql
SELECT toIPv6('127.0.0.1');
```
Result:
``` text
┌─toIPv6('127.0.0.1')─┐
│ ::ffff:127.0.0.1 │
└─────────────────────┘
```
## isIPv4String {#isipv4string}
Determines whether a string is an IPv4 address or not. Also returns `0` if `string` is an IPv6 address.
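A tiny sketch of the behavior described above:
``` sql
SELECT
    isIPv4String('127.0.0.1') AS is_v4,
    isIPv4String('::ffff:127.0.0.1') AS not_v4;
```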

View File

@ -659,7 +659,7 @@ SELECT
## neighbor {#neighbor}
The function provides access to the value in the column `column` at offset `offset` relative to the current row. It is a partial implementation of the [window functions](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` and `LAG()`.
The function provides access to the value in the column `column` at offset `offset` relative to the current row. It is a partial implementation of the [window functions](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` and `LAG()`.
**Syntax**
@ -667,7 +667,13 @@ SELECT
neighbor(column, offset[, default_value])
```
The result of the function depends on the affected data blocks and the order of data in the block. If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
The result of the function depends on the affected data blocks and the order of data in the block.
!!! warning "Warning"
The function can access the value of a column in a neighboring row only inside the currently processed data block.
The order of rows used during the calculation of `neighbor` can differ from the order of rows returned to the user.
To prevent that, you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
**Parameters**
@ -772,8 +778,13 @@ FROM numbers(16)
Calculates the difference between successive row values in the data block.
Returns 0 for the first row and the difference from the previous row for each subsequent row.
!!! warning "Warning"
The function can access the value of the previous row only inside the currently processed data block.
The result of the function depends on the affected data blocks and the order of data in the block.
If you make a subquery with ORDER BY and call the function from outside the subquery, you can get the expected result.
The order of rows used during the calculation of `runningDifference` can differ from the order of rows returned to the user.
To prevent that, you can make a subquery with [ORDER BY](../../sql-reference/statements/select/order-by.md) and call the function from outside the subquery.
Example:

View File

@ -15,11 +15,10 @@ OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION I
- If `OPTIMIZE` does not perform a merge for any reason, ClickHouse does not notify the client about it. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting.
- If you specify `PARTITION`, only the specified partition is optimized. [How to set the partition expression](alter/index.md#alter-how-to-specify-part-expr).
- If you specify `FINAL`, optimization is performed even when all the data is already in one part.
- If you specify `FINAL`, optimization is performed even when all the data is already in one part. In addition, the merge is forced even if concurrent merges are being performed.
- If you specify `DEDUPLICATE`, completely identical rows are deduplicated (values in all columns are compared); this only makes sense for the MergeTree engine.
!!! warning "Warning"
The `OPTIMIZE` query cannot fix the cause of the "Too many parts" error.
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) <!--hide-->
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) <!--hide-->

View File

@ -102,7 +102,7 @@ USING (equi_column1, ... equi_columnN, asof_column)
- When using a regular `JOIN`, the query is sent to remote servers. Subqueries are run on each of them in order to form the "right" table, and the join is performed with this table. In other words, the "right" table is formed on each server separately.
- When using `GLOBAL ... JOIN`, the initiating server first runs a subquery to calculate the right table. This temporary table is passed to each remote server, and queries are run on them using the transmitted temporary data.
Be careful when using `GLOBAL`. For more information, see the [Distributed subqueries](#select-distributed-subqueries) section.
Be careful when using `GLOBAL`. For more information, see the [Distributed subqueries](../../../sql-reference/operators/in.md#select-distributed-subqueries) section.
## Usage Recommendations {#usage-recommendations}

View File

@ -47,7 +47,7 @@
If ZooKeeper is not set in the config file, you cannot create replicated tables, and any existing replicated tables become read-only.
`SELECT` queries do not require ZooKeeper, and replication does not affect the performance of `SELECT`: queries over replicated tables run as fast as over non-replicated tables. When querying distributed tables, ClickHouse's behavior can be changed with the [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) settings.
`SELECT` queries do not require ZooKeeper, and replication does not affect the performance of `SELECT`: queries over replicated tables run as fast as over non-replicated tables. When querying distributed tables, ClickHouse's behavior can be changed with the [max_replica_delay_for_distributed_queries](../../../operations/settings/settings.md#settings-max_replica_delay_for_distributed_queries) and [fallback_to_stale_replicas_for_distributed_queries](../../../operations/settings/settings.md) settings.
Each `INSERT` statement adds about a dozen entries to ZooKeeper through several transactions. (To be precise, this is per inserted block of data; an INSERT statement contains one block for every `max_insert_block_size = 1048576` rows plus one block for the remainder.) Compared with non-replicated tables, writing to ZooKeeper makes `INSERT` latency slightly longer. But as long as you follow the recommendation of inserting data in batches of no more than one `INSERT` per second, there are no problems. One ZooKeeper cluster can coordinate several hundred `INSERT`s per second for the whole ClickHouse cluster. Data insertion throughput (rows per second) can be as high as for non-replicated tables.

View File

@ -18,7 +18,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ...
If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index to process the query.
Don't list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section "External data for query processing"), then use a subquery.
Don't list too many specific constants explicitly (say, millions). If the data set is very large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery, as sketched below.
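A hedged sketch of that approach (table and column names are hypothetical):
``` sql
-- Load the large value set into a temporary table first (for example via external data),
-- then filter with a subquery instead of listing millions of constants inline.
CREATE TEMPORARY TABLE allowed_users (UserID UInt64);
SELECT count() FROM hits WHERE UserID IN (SELECT UserID FROM allowed_users);
```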
The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (as shown in the examples above), or the name of a database table or a SELECT subquery in parentheses.

View File

@ -7,10 +7,12 @@
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromOStream.h>
#include <Parsers/ASTInsertQuery.h>
#include <Parsers/ParserQuery.h>
#include <Parsers/parseQuery.h>
#include <Parsers/formatAST.h>
#include <Parsers/obfuscateQueries.h>
#include <Parsers/parseQuery.h>
#include <Common/ErrorCodes.h>
#include <Common/TerminalSize.h>
#include <Interpreters/Context.h>
@ -28,6 +30,14 @@
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma GCC diagnostic ignored "-Wmissing-declarations"
namespace DB
{
namespace ErrorCodes
{
extern const int INVALID_FORMAT_INSERT_QUERY_WITH_DATA;
}
}
int mainEntryClickHouseFormat(int argc, char ** argv)
{
using namespace DB;
@ -128,6 +138,14 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
do
{
ASTPtr res = parseQueryAndMovePosition(parser, pos, end, "query", multiple, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH);
/// An INSERT query with inline data (INSERT INTO ... VALUES ...) cannot be formatted without losing that data,
/// so throw an exception early and make the error message more readable.
if (const auto * insert_query = res->as<ASTInsertQuery>(); insert_query && insert_query->data)
{
throw Exception(
"Can't format ASTInsertQuery with data, since data will be lost",
DB::ErrorCodes::INVALID_FORMAT_INSERT_QUERY_WITH_DATA);
}
if (!quiet)
{
WriteBufferFromOStream res_buf(std::cout, 4096);
@ -137,6 +155,26 @@ int mainEntryClickHouseFormat(int argc, char ** argv)
std::cout << "\n;\n";
std::cout << std::endl;
}
do
{
/// skip whitespace to avoid throwing an exception after the last query
while (pos != end && std::isspace(*pos))
++pos;
/// skip a comment after the last query so that no exception is thrown
if (end - pos > 2 && *pos == '-' && *(pos + 1) == '-')
{
pos += 2;
/// skip until the end of the line
while (pos != end && *pos != '\n')
++pos;
}
/// otherwise the remaining input is the next query, so stop skipping and parse it
else
break;
} while (pos != end);
} while (multiple && pos != end);
}
}

View File

@ -0,0 +1,157 @@
#pragma once
#include <cstdint>
#include <vector>
#include <chrono>
#include <mutex>
#include <condition_variable>
#include <common/defines.h>
#include <Common/MoveOrCopyIfThrow.h>
/** Pool of limited size for objects that cannot be used from different threads simultaneously.
* The main use case is to have a fixed number of objects that can be reused in different threads during their lifetime
* and have to be initialized on demand.
* The two main properties of the pool are the allocated objects size and the borrowed objects size.
* The allocated objects size is the number of objects currently allocated by the pool.
* The borrowed objects size is the number of objects currently borrowed by clients.
* If max_size == 0 then the pool has unlimited size and objects will be allocated without limit.
*
* The pool provides the following strategy for borrowing an object:
* 1. If the pool has objects that can be borrowed, increase the borrowed objects size and return one of them.
* 2. If allocatedObjectsSize is lower than the max objects size, or the pool has unlimited size,
* allocate a new object, increase the borrowed objects size and return it.
* 3. If the pool is full, wait on a condition variable (with or without a timeout) until some object
* is returned to the pool.
*/
template <typename T>
class BorrowedObjectPool final
{
public:
explicit BorrowedObjectPool(size_t max_size_) : max_size(max_size_) {}
/// Borrow an object from the pool. If the pool is full and all objects were borrowed,
/// the calling thread will wait until some object is returned to the pool.
template <typename FactoryFunc>
void borrowObject(T & dest, FactoryFunc && func)
{
std::unique_lock<std::mutex> lock(objects_mutex);
if (!objects.empty())
{
dest = borrowFromObjects(lock);
return;
}
bool has_unlimited_size = (max_size == 0);
if (unlikely(has_unlimited_size) || allocated_objects_size < max_size)
{
dest = allocateObjectForBorrowing(lock, std::forward<FactoryFunc>(func));
return;
}
condition_variable.wait(lock, [this] { return !objects.empty(); });
dest = borrowFromObjects(lock);
}
/// Same as borrowObject function, but wait with timeout.
/// Returns true if object was borrowed during timeout.
template <typename FactoryFunc>
bool tryBorrowObject(T & dest, FactoryFunc && func, size_t timeout_in_milliseconds = 0)
{
std::unique_lock<std::mutex> lock(objects_mutex);
if (!objects.empty())
{
dest = borrowFromObjects(lock);
return true;
}
bool has_unlimited_size = (max_size == 0);
if (unlikely(has_unlimited_size) || allocated_objects_size < max_size)
{
dest = allocateObjectForBorrowing(lock, std::forward<FactoryFunc>(func));
return true;
}
bool wait_result = condition_variable.wait_for(lock, std::chrono::milliseconds(timeout_in_milliseconds), [this] { return !objects.empty(); });
if (wait_result)
dest = borrowFromObjects(lock);
return wait_result;
}
/// Return object into pool. Client must return same object that was borrowed.
inline void returnObject(T && object_to_return)
{
std::unique_lock<std::mutex> lck(objects_mutex);
objects.emplace_back(std::move(object_to_return));
--borrowed_objects_size;
condition_variable.notify_one();
}
/// Max pool size
inline size_t maxSize() const
{
return max_size;
}
/// Allocated objects size by the pool. If allocatedObjectsSize == maxSize then pool is full.
inline size_t allocatedObjectsSize() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return allocated_objects_size;
}
/// Returns allocatedObjectsSize == maxSize
inline bool isFull() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return allocated_objects_size == max_size;
}
/// Borrowed objects size. If borrowedObjectsSize == allocatedObjectsSize and the pool is full,
/// then clients will wait in borrowObject until an object is returned.
inline size_t borrowedObjectsSize() const
{
std::unique_lock<std::mutex> lock(objects_mutex);
return borrowed_objects_size;
}
private:
template <typename FactoryFunc>
inline T allocateObjectForBorrowing(const std::unique_lock<std::mutex> &, FactoryFunc && func)
{
++allocated_objects_size;
++borrowed_objects_size;
return std::forward<FactoryFunc>(func)();
}
inline T borrowFromObjects(const std::unique_lock<std::mutex> &)
{
T dst;
detail::moveOrCopyIfThrow(std::move(objects.back()), dst);
objects.pop_back();
++borrowed_objects_size;
return dst;
}
size_t max_size;
mutable std::mutex objects_mutex;
std::condition_variable condition_variable;
size_t allocated_objects_size = 0;
size_t borrowed_objects_size = 0;
std::vector<T> objects;
};

View File

@ -6,38 +6,7 @@
#include <Poco/Mutex.h>
#include <Poco/Semaphore.h>
#include <common/types.h>
namespace detail
{
template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
struct MoveOrCopyIfThrow;
template <typename T>
struct MoveOrCopyIfThrow<T, true>
{
void operator()(T && src, T & dst) const
{
dst = std::forward<T>(src);
}
};
template <typename T>
struct MoveOrCopyIfThrow<T, false>
{
void operator()(T && src, T & dst) const
{
dst = src;
}
};
template <typename T>
void moveOrCopyIfThrow(T && src, T & dst)
{
MoveOrCopyIfThrow<T>()(std::forward<T>(src), dst);
}
}
#include <Common/MoveOrCopyIfThrow.h>
/** A very simple thread-safe queue of limited size.
* If you try to pop an item from an empty queue, the thread is blocked until the queue becomes nonempty.
@ -53,8 +22,10 @@ private:
Poco::Semaphore empty_count;
public:
ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill), empty_count(max_fill, max_fill) {}
explicit ConcurrentBoundedQueue(size_t max_fill)
: fill_count(0, max_fill)
, empty_count(max_fill, max_fill)
{}
void push(const T & x)
{

View File

@ -100,6 +100,12 @@ namespace CurrentMetrics
amount -= value;
}
void add(Value value = 1)
{
what->fetch_add(value, std::memory_order_relaxed);
amount += value;
}
/// Subtract value before destructor.
void destroy()
{

View File

@ -540,12 +540,14 @@
M(571, DATABASE_REPLICATION_FAILED) \
M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \
M(573, EPOLL_ERROR) \
M(574, DISTRIBUTED_TOO_MANY_PENDING_BYTES) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
M(1001, STD_EXCEPTION) \
M(1002, UNKNOWN_EXCEPTION) \
M(1003, INVALID_SHARD_ID) \
M(1004, INVALID_FORMAT_INSERT_QUERY_WITH_DATA)
/* See END */

View File

@ -458,33 +458,25 @@ ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_
return ExecutionStatus(getCurrentExceptionCode(), msg);
}
ParsingException::ParsingException()
{
Exception::message(Exception::message() + "{}");
}
ParsingException::ParsingException() = default;
ParsingException::ParsingException(const std::string & msg, int code)
: Exception(msg, code)
{
Exception::message(Exception::message() + "{}");
}
ParsingException::ParsingException(int code, const std::string & message)
: Exception(message, code)
{
Exception::message(Exception::message() + "{}");
}
/// We use additional field formatted_message_ to make this method const.
std::string ParsingException::displayText() const
{
try
{
if (line_number_ == -1)
formatted_message_ = fmt::format(message(), "");
formatted_message_ = message();
else
formatted_message_ = fmt::format(message(), fmt::format(": (at row {})\n", line_number_));
formatted_message_ = message() + fmt::format(": (at row {})\n", line_number_);
}
catch (...)
{}

View File

@ -115,9 +115,7 @@ public:
template <typename ...Args>
ParsingException(int code, const std::string & fmt, Args&&... args)
: Exception(fmt::format(fmt, std::forward<Args>(args)...), code)
{
Exception::message(Exception::message() + "{}");
}
{}
std::string displayText() const

View File

@ -0,0 +1,33 @@
#pragma once
#include <common/types.h>
namespace detail
{
template <typename T, bool is_nothrow_move_assignable = std::is_nothrow_move_assignable_v<T>>
struct MoveOrCopyIfThrow;
template <typename T>
struct MoveOrCopyIfThrow<T, true>
{
void operator()(T && src, T & dst) const
{
dst = std::forward<T>(src);
}
};
template <typename T>
struct MoveOrCopyIfThrow<T, false>
{
void operator()(T && src, T & dst) const
{
dst = src;
}
};
template <typename T>
void moveOrCopyIfThrow(T && src, T & dst)
{
MoveOrCopyIfThrow<T>()(std::forward<T>(src), dst);
}
}

View File

@ -68,6 +68,9 @@
M(DelayedInserts, "Number of times the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \
M(RejectedInserts, "Number of times the INSERT of a block to a MergeTree table was rejected with 'Too many parts' exception due to high number of active data parts for partition.") \
M(DelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a MergeTree table was throttled due to high number of active data parts for partition.") \
M(DistributedDelayedInserts, "Number of times the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \
M(DistributedRejectedInserts, "Number of times the INSERT of a block to a Distributed table was rejected with 'Too many bytes' exception due to high number of pending bytes.") \
M(DistributedDelayedInsertsMilliseconds, "Total number of milliseconds spent while the INSERT of a block to a Distributed table was throttled due to high number of pending bytes.") \
M(DuplicatedInsertedBlocks, "Number of times the INSERTed block to a ReplicatedMergeTree table was deduplicated.") \
\
M(ZooKeeperInit, "") \

View File

@ -2,15 +2,18 @@
#include <sys/wait.h>
#include <fcntl.h>
#include <dlfcn.h>
#include <unistd.h>
#include <time.h>
#include <csignal>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <Common/Exception.h>
#include <Common/ShellCommand.h>
#include <Common/PipeFDs.h>
#include <common/logger_useful.h>
#include <common/errnoToString.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
#include <unistd.h>
#include <csignal>
namespace
{
@ -36,9 +39,9 @@ namespace ErrorCodes
extern const int CANNOT_CREATE_CHILD_PROCESS;
}
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_)
ShellCommand::ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_)
: pid(pid_)
, terminate_in_destructor(terminate_in_destructor_)
, destructor_strategy(destructor_strategy_)
, in(in_fd_)
, out(out_fd_)
, err(err_fd_)
@ -52,14 +55,24 @@ Poco::Logger * ShellCommand::getLogger()
ShellCommand::~ShellCommand()
{
if (terminate_in_destructor)
if (wait_called)
return;
if (destructor_strategy.terminate_in_destructor)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
size_t try_wait_timeout = destructor_strategy.wait_for_normal_exit_before_termination_seconds;
bool process_terminated_normally = tryWaitProcessWithTimeout(try_wait_timeout);
if (!process_terminated_normally)
{
LOG_TRACE(getLogger(), "Will kill shell command pid {} with SIGTERM", pid);
int retcode = kill(pid, SIGTERM);
if (retcode != 0)
LOG_WARNING(getLogger(), "Cannot kill shell command pid {} errno '{}'", pid, errnoToString(retcode));
}
}
else if (!wait_called)
else
{
try
{
@ -72,6 +85,54 @@ ShellCommand::~ShellCommand()
}
}
bool ShellCommand::tryWaitProcessWithTimeout(size_t timeout_in_seconds)
{
int status = 0;
LOG_TRACE(getLogger(), "Try wait for shell command pid ({}) with timeout ({})", pid, timeout_in_seconds);
wait_called = true;
struct timespec interval {.tv_sec = 1, .tv_nsec = 0};
in.close();
out.close();
err.close();
if (timeout_in_seconds == 0)
{
/// If there is no timeout before sending the signal, try waitpid once without blocking so we can avoid
/// sending a signal if the process has already terminated normally.
int waitpid_res = waitpid(pid, &status, WNOHANG);
bool process_terminated_normally = (waitpid_res == pid);
return process_terminated_normally;
}
/// If the timeout is positive, call waitpid without blocking in a loop until
/// the process terminates normally or waitpid returns an error.
while (timeout_in_seconds != 0)
{
int waitpid_res = waitpid(pid, &status, WNOHANG);
bool process_terminated_normally = (waitpid_res == pid);
if (process_terminated_normally)
return true;
else if (waitpid_res == 0)
{
--timeout_in_seconds;
nanosleep(&interval, nullptr);
continue;
}
else if (waitpid_res == -1 && errno != EINTR)
return false;
}
return false;
}
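The destructor strategy above reduces to a small pattern: poll waitpid() with WNOHANG once per second for a bounded number of seconds, and only fall back to SIGTERM if the child has not exited on its own. The following standalone sketch (not the ClickHouse class; the sleep command, the 3-second budget and the helper name tryWaitWithTimeout are invented for illustration) shows the same loop in isolation and compiles as-is on a POSIX system.
#include <cerrno>
#include <csignal>
#include <cstdio>
#include <ctime>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/// Returns true if the child exited by itself within timeout_in_seconds.
static bool tryWaitWithTimeout(pid_t pid, size_t timeout_in_seconds)
{
    int status = 0;
    struct timespec interval {.tv_sec = 1, .tv_nsec = 0};   /// poll once per second
    while (timeout_in_seconds != 0)
    {
        pid_t res = waitpid(pid, &status, WNOHANG);
        if (res == pid)
            return true;                    /// reaped: terminated normally
        if (res == -1 && errno != EINTR)
            return false;                   /// waitpid failed
        --timeout_in_seconds;
        nanosleep(&interval, nullptr);
    }
    return false;
}
int main()
{
    pid_t pid = fork();
    if (pid == 0)
    {
        execlp("sleep", "sleep", "10", (char *) nullptr);
        _exit(127);                         /// exec failed
    }
    if (!tryWaitWithTimeout(pid, 3))
    {
        std::printf("child %d still alive, sending SIGTERM\n", (int) pid);
        kill(pid, SIGTERM);
        waitpid(pid, nullptr, 0);           /// reap after termination
    }
    return 0;
}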
void ShellCommand::logCommand(const char * filename, char * const argv[])
{
WriteBufferFromOwnString args;
@ -87,7 +148,10 @@ void ShellCommand::logCommand(const char * filename, char * const argv[])
}
std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor)
const char * filename,
char * const argv[],
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
logCommand(filename, argv);
@ -144,7 +208,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
}
std::unique_ptr<ShellCommand> res(new ShellCommand(
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor));
pid, pipe_stdin.fds_rw[1], pipe_stdout.fds_rw[0], pipe_stderr.fds_rw[0], terminate_in_destructor_strategy));
LOG_TRACE(getLogger(), "Started shell command '{}' with pid {}", filename, pid);
return res;
@ -152,7 +216,9 @@ std::unique_ptr<ShellCommand> ShellCommand::executeImpl(
std::unique_ptr<ShellCommand> ShellCommand::execute(
const std::string & command, bool pipe_stdin_only, bool terminate_in_destructor)
const std::string & command,
bool pipe_stdin_only,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
/// Arguments in non-constant chunks of memory (as required for `execv`).
/// Moreover, their copying must be done before calling `vfork`, so after `vfork` do a minimum of things.
@ -162,12 +228,14 @@ std::unique_ptr<ShellCommand> ShellCommand::execute(
char * const argv[] = { argv0.data(), argv1.data(), argv2.data(), nullptr };
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor);
return executeImpl("/bin/sh", argv, pipe_stdin_only, terminate_in_destructor_strategy);
}
std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor)
const std::string & path,
const std::vector<std::string> & arguments,
ShellCommandDestructorStrategy terminate_in_destructor_strategy)
{
size_t argv_sum_size = path.size() + 1;
for (const auto & arg : arguments)
@ -188,7 +256,7 @@ std::unique_ptr<ShellCommand> ShellCommand::executeDirect(
argv[arguments.size() + 1] = nullptr;
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor);
return executeImpl(path.data(), argv.data(), false, terminate_in_destructor_strategy);
}

View File

@ -23,21 +23,38 @@ namespace DB
* The second difference: it allows working simultaneously with the stdin, stdout and stderr of the running process,
* and also obtaining the return code and completion status.
*/
class ShellCommand
struct ShellCommandDestructorStrategy final
{
explicit ShellCommandDestructorStrategy(bool terminate_in_destructor_, size_t wait_for_normal_exit_before_termination_seconds_ = 0)
: terminate_in_destructor(terminate_in_destructor_)
, wait_for_normal_exit_before_termination_seconds(wait_for_normal_exit_before_termination_seconds_)
{
}
bool terminate_in_destructor;
/// If terminate_in_destructor is true, the command will wait this many seconds for a normal exit before sending SIGTERM to the created process
size_t wait_for_normal_exit_before_termination_seconds = 0;
};
class ShellCommand final
{
private:
pid_t pid;
bool wait_called = false;
bool terminate_in_destructor;
ShellCommandDestructorStrategy destructor_strategy;
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, bool terminate_in_destructor_);
ShellCommand(pid_t pid_, int & in_fd_, int & out_fd_, int & err_fd_, ShellCommandDestructorStrategy destructor_strategy_);
bool tryWaitProcessWithTimeout(size_t timeout_in_seconds);
static Poco::Logger * getLogger();
/// Print command name and the list of arguments to log. NOTE: No escaping of arguments is performed.
static void logCommand(const char * filename, char * const argv[]);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, bool terminate_in_destructor);
static std::unique_ptr<ShellCommand> executeImpl(const char * filename, char * const argv[], bool pipe_stdin_only, ShellCommandDestructorStrategy terminate_in_destructor_strategy);
public:
WriteBufferFromFile in; /// If the command reads from stdin, do not forget to call in.close() after writing all the data there.
@ -48,11 +65,11 @@ public:
/// Run the command using /bin/sh -c.
/// If terminate_in_destructor is true, send the terminate signal in the destructor and don't wait for the process.
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> execute(const std::string & command, bool pipe_stdin_only = false, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Run the executable with the specified arguments. `arguments` - without argv[0].
/// If terminate_in_destructor is true, send the terminate signal in the destructor and don't wait for the process.
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, bool terminate_in_destructor = false);
static std::unique_ptr<ShellCommand> executeDirect(const std::string & path, const std::vector<std::string> & arguments, ShellCommandDestructorStrategy terminate_in_destructor_strategy = ShellCommandDestructorStrategy(false));
/// Wait for the process to end, throw an exception if the code is not 0 or if the process was not completed by itself.
void wait();

View File

@ -345,7 +345,7 @@ struct ODBCBridgeMixin
LOG_TRACE(log, "Starting {}", serviceAlias());
return ShellCommand::executeDirect(path.toString(), cmd_args, true);
return ShellCommand::executeDirect(path.toString(), cmd_args, ShellCommandDestructorStrategy(true));
}
};
}

View File

@ -26,7 +26,7 @@ NuKeeperStateManager::NuKeeperStateManager(
const CoordinationSettingsPtr & coordination_settings)
: my_server_id(my_server_id_)
, log_store(nuraft::cs_new<NuKeeperLogStore>(
config.getString(config_prefix + ".log_storage_path"),
config.getString(config_prefix + ".log_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/logs"),
coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync))
, cluster_config(nuraft::cs_new<nuraft::cluster_config>())
{

View File

@ -13,6 +13,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
void formatBlock(BlockOutputStreamPtr & out, const Block & block)
{
out->writePrefix();
@ -88,4 +93,63 @@ Context copyContextAndApplySettings(
return local_context;
}
BlockInputStreamWithAdditionalColumns::BlockInputStreamWithAdditionalColumns(
Block block_to_add_, std::unique_ptr<IBlockInputStream> && stream_)
: block_to_add(std::move(block_to_add_))
, stream(std::move(stream_))
{
}
Block BlockInputStreamWithAdditionalColumns::getHeader() const
{
auto header = stream->getHeader();
if (header)
{
for (Int64 i = static_cast<Int64>(block_to_add.columns() - 1); i >= 0; --i)
header.insert(0, block_to_add.getByPosition(i).cloneEmpty());
}
return header;
}
Block BlockInputStreamWithAdditionalColumns::readImpl()
{
auto block = stream->read();
if (block)
{
auto block_rows = block.rows();
auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows);
if (cut_block.rows() != block_rows)
throw Exception(
"Number of rows in block to add after cut must equal to number of rows in block from inner stream",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
for (Int64 i = static_cast<Int64>(cut_block.columns() - 1); i >= 0; --i)
block.insert(0, cut_block.getByPosition(i));
current_range_index += block_rows;
}
return block;
}
void BlockInputStreamWithAdditionalColumns::readPrefix()
{
stream->readPrefix();
}
void BlockInputStreamWithAdditionalColumns::readSuffix()
{
stream->readSuffix();
}
String BlockInputStreamWithAdditionalColumns::getName() const
{
return "BlockInputStreamWithAdditionalColumns";
}
}
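A rough illustration of what BlockInputStreamWithAdditionalColumns does with its current_range_index bookkeeping, assuming toy int "columns" instead of ClickHouse Blocks (the class and chunk names below are invented for the sketch): each chunk read from the inner source gets the matching slice of the pre-built key column prepended, and the offset advances by the number of rows read.
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>
/// Very small stand-in for a "block": one key column + one value column.
struct Chunk
{
    std::vector<int> keys;
    std::vector<int> values;
};
/// Wraps a source of value-only chunks and prepends the corresponding slice
/// of a pre-built key column, tracking the current row offset.
class WithAdditionalKeyColumn
{
public:
    WithAdditionalKeyColumn(std::vector<int> keys_, std::vector<std::vector<int>> value_chunks_)
        : keys(std::move(keys_)), value_chunks(std::move(value_chunks_)) {}
    bool read(Chunk & out)
    {
        if (next_chunk == value_chunks.size())
            return false;
        const auto & values = value_chunks[next_chunk++];
        if (current_row + values.size() > keys.size())
            throw std::runtime_error("inner stream returned more rows than keys");
        out.keys.assign(keys.begin() + current_row, keys.begin() + current_row + values.size());
        out.values = values;
        current_row += values.size();   /// same bookkeeping as current_range_index
        return true;
    }
private:
    std::vector<int> keys;
    std::vector<std::vector<int>> value_chunks;
    size_t next_chunk = 0;
    size_t current_row = 0;
};
int main()
{
    WithAdditionalKeyColumn stream({1, 2, 3, 4, 5}, {{10, 20}, {30, 40, 50}});
    Chunk chunk;
    while (stream.read(chunk))
        for (size_t i = 0; i < chunk.keys.size(); ++i)
            std::cout << chunk.keys[i] << " -> " << chunk.values[i] << '\n';
}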

View File

@ -6,12 +6,13 @@
#include <Poco/File.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <DataStreams/IBlockInputStream.h>
#include <Columns/IColumn.h>
#include <Core/Block.h>
namespace DB
{
class IBlockOutputStream;
using BlockOutputStreamPtr = std::shared_ptr<IBlockOutputStream>;
@ -46,4 +47,29 @@ void applySettingsToContext(
Context & context,
const Poco::Util::AbstractConfiguration & config);
/** A stream that adds additional columns to each block it reads from the inner stream.
*
* The number of rows in block_to_add must be equal to the total number of rows read from the inner stream.
*/
class BlockInputStreamWithAdditionalColumns final : public IBlockInputStream
{
public:
BlockInputStreamWithAdditionalColumns(Block block_to_add_, std::unique_ptr<IBlockInputStream> && stream_);
Block getHeader() const override;
Block readImpl() override;
void readPrefix() override;
void readSuffix() override;
String getName() const override;
private:
Block block_to_add;
std::unique_ptr<IBlockInputStream> stream;
size_t current_range_index = 0;
};
}

View File

@ -27,7 +27,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
extern const int DICTIONARY_ACCESS_DENIED;
extern const int UNSUPPORTED_METHOD;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
}
namespace
@ -58,7 +57,6 @@ namespace
}
ExecutableDictionarySource::ExecutableDictionarySource(
const DictionaryStructure & dict_struct_,
const Poco::Util::AbstractConfiguration & config,
@ -200,101 +198,14 @@ namespace
std::function<void(WriteBufferFromFile &)> send_data;
ThreadFromGlobalPool thread;
};
/** A stream that adds additional columns to each block it reads from the inner stream.
*
* The number of rows in block_to_add must be equal to the total number of rows read from the inner stream.
*/
class BlockInputStreamWithAdditionalColumns final: public IBlockInputStream
{
public:
BlockInputStreamWithAdditionalColumns(
Block block_to_add_,
std::unique_ptr<IBlockInputStream>&& stream_)
: block_to_add(std::move(block_to_add_))
, stream(std::move(stream_))
{
}
Block getHeader() const override
{
auto header = stream->getHeader();
if (header)
{
for (Int64 i = static_cast<Int64>(block_to_add.columns() - 1); i >= 0; --i)
header.insert(0, block_to_add.getByPosition(i).cloneEmpty());
}
return header;
}
Block readImpl() override
{
auto block = stream->read();
if (block)
{
auto block_rows = block.rows();
auto cut_block = block_to_add.cloneWithCutColumns(current_range_index, block_rows);
if (cut_block.rows() != block_rows)
throw Exception(
"Number of rows in block to add after cut must equal to number of rows in block from inner stream",
ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
for (Int64 i = static_cast<Int64>(cut_block.columns() - 1); i >= 0; --i)
block.insert(0, cut_block.getByPosition(i));
current_range_index += block_rows;
}
return block;
}
void readPrefix() override
{
stream->readPrefix();
}
void readSuffix() override
{
stream->readSuffix();
}
String getName() const override { return "BlockInputStreamWithAdditionalColumns"; }
private:
Block block_to_add;
std::unique_ptr<IBlockInputStream> stream;
size_t current_range_index = 0;
};
}
BlockInputStreamPtr ExecutableDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size());
auto block = blockForIds(dict_struct, ids);
auto stream = std::make_unique<BlockInputStreamWithBackgroundThread>(
context, format, sample_block, command, log,
[block, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputStream(format, out, block.cloneEmpty());
formatBlock(output_stream, block);
out.close();
});
if (implicit_key)
{
return std::make_shared<BlockInputStreamWithAdditionalColumns>(block, std::move(stream));
}
else
return std::shared_ptr<BlockInputStreamWithBackgroundThread>(stream.release());
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
@ -302,7 +213,11 @@ BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_col
LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size());
auto block = blockForKeys(dict_struct, key_columns, requested_rows);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutableDictionarySource::getStreamForBlock(const Block & block)
{
auto stream = std::make_unique<BlockInputStreamWithBackgroundThread>(
context, format, sample_block, command, log,
[block, this](WriteBufferFromFile & out) mutable
@ -354,13 +269,13 @@ void registerDictionarySourceExecutable(DictionarySourceFactory & factory)
bool check_config) -> DictionarySourcePtr
{
if (dict_struct.has_expressions)
throw Exception{"Dictionary source of type `executable` does not support attribute expressions", ErrorCodes::LOGICAL_ERROR};
throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable` does not support attribute expressions");
/// Executable dictionaries may execute arbitrary commands.
/// It's OK for dictionaries created by an administrator from an xml file, but
/// may be dangerous for dictionaries created from DDL queries.
if (check_config)
throw Exception("Dictionaries with Executable dictionary source is not allowed", ErrorCodes::DICTIONARY_ACCESS_DENIED);
throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable dictionary source are not allowed to be created from DDL query");
Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config);

View File

@ -47,6 +47,8 @@ public:
std::string toString() const override;
BlockInputStreamPtr getStreamForBlock(const Block & block);
private:
Poco::Logger * log;
time_t update_time = 0;

View File

@ -0,0 +1,321 @@
#include "ExecutablePoolDictionarySource.h"
#include <functional>
#include <ext/scope_guard.h>
#include <DataStreams/IBlockOutputStream.h>
#include <Interpreters/Context.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/copyData.h>
#include <Common/ShellCommand.h>
#include <Common/ThreadPool.h>
#include <common/logger_useful.h>
#include <common/LocalDateTime.h>
#include "DictionarySourceFactory.h"
#include "DictionarySourceHelpers.h"
#include "DictionaryStructure.h"
#include "registerDictionaries.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int DICTIONARY_ACCESS_DENIED;
extern const int UNSUPPORTED_METHOD;
extern const int TIMEOUT_EXCEEDED;
}
ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(
const DictionaryStructure & dict_struct_,
const Configuration & configuration_,
Block & sample_block_,
const Context & context_)
: log(&Poco::Logger::get("ExecutablePoolDictionarySource"))
, dict_struct{dict_struct_}
, configuration{configuration_}
, sample_block{sample_block_}
, context{context_}
/// If pool size == 0 then there is no size restriction. The maximum size of a Poco semaphore is an int, hence std::numeric_limits<int>::max().
, process_pool{std::make_shared<ProcessPool>(configuration.pool_size == 0 ? std::numeric_limits<int>::max() : configuration.pool_size)}
{
/// Remove keys from sample_block for an implicit_key dictionary because
/// these columns will not be returned from the source.
/// Implicit key means that the source script returns only values,
/// and the correspondence to the requested keys is determined implicitly, by the order of rows in the result.
if (configuration.implicit_key)
{
auto keys_names = dict_struct.getKeysNames();
for (auto & key_name : keys_names)
{
size_t key_column_position_in_block = sample_block.getPositionByName(key_name);
sample_block.erase(key_column_position_in_block);
}
}
}
ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other)
: log(&Poco::Logger::get("ExecutablePoolDictionarySource"))
, update_time{other.update_time}
, dict_struct{other.dict_struct}
, configuration{other.configuration}
, sample_block{other.sample_block}
, context{other.context}
, process_pool{std::make_shared<ProcessPool>(configuration.pool_size)}
{
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadAll()
{
throw Exception("ExecutablePoolDictionarySource with implicit_key does not support loadAll method", ErrorCodes::UNSUPPORTED_METHOD);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadUpdatedAll()
{
throw Exception("ExecutablePoolDictionarySource with implicit_key does not support loadAll method", ErrorCodes::UNSUPPORTED_METHOD);
}
namespace
{
/** A stream that runs a child process, sends data to its stdin in a background thread,
* and receives data from its stdout.
*/
class PoolBlockInputStreamWithBackgroundThread final : public IBlockInputStream
{
public:
PoolBlockInputStreamWithBackgroundThread(
std::shared_ptr<ProcessPool> process_pool_,
std::unique_ptr<ShellCommand> && command_,
BlockInputStreamPtr && stream_,
size_t read_rows_,
Poco::Logger * log_,
std::function<void(WriteBufferFromFile &)> && send_data_)
: process_pool(process_pool_)
, command(std::move(command_))
, stream(std::move(stream_))
, rows_to_read(read_rows_)
, log(log_)
, send_data(std::move(send_data_))
, thread([this]
{
try
{
send_data(command->in);
}
catch (...)
{
std::lock_guard<std::mutex> lck(exception_during_read_lock);
exception_during_read = std::current_exception();
}
})
{}
~PoolBlockInputStreamWithBackgroundThread() override
{
if (thread.joinable())
thread.join();
if (command)
process_pool->returnObject(std::move(command));
}
Block getHeader() const override
{
return stream->getHeader();
}
private:
Block readImpl() override
{
rethrowExceptionDuringReadIfNeeded();
if (current_read_rows == rows_to_read)
return Block();
Block block;
try
{
block = stream->read();
current_read_rows += block.rows();
}
catch (...)
{
tryLogCurrentException(log);
command = nullptr;
throw;
}
return block;
}
void readPrefix() override
{
rethrowExceptionDuringReadIfNeeded();
stream->readPrefix();
}
void readSuffix() override
{
if (thread.joinable())
thread.join();
rethrowExceptionDuringReadIfNeeded();
stream->readSuffix();
}
void rethrowExceptionDuringReadIfNeeded()
{
std::lock_guard<std::mutex> lck(exception_during_read_lock);
if (exception_during_read)
{
command = nullptr;
std::rethrow_exception(exception_during_read);
}
}
String getName() const override { return "PoolWithBackgroundThread"; }
std::shared_ptr<ProcessPool> process_pool;
std::unique_ptr<ShellCommand> command;
BlockInputStreamPtr stream;
size_t rows_to_read;
Poco::Logger * log;
std::function<void(WriteBufferFromFile &)> send_data;
ThreadFromGlobalPool thread;
size_t current_read_rows = 0;
std::mutex exception_during_read_lock;
std::exception_ptr exception_during_read;
};
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadIds(const std::vector<UInt64> & ids)
{
LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size());
auto block = blockForIds(dict_struct, ids);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows)
{
LOG_TRACE(log, "loadKeys {} size = {}", toString(), requested_rows.size());
auto block = blockForKeys(dict_struct, key_columns, requested_rows);
return getStreamForBlock(block);
}
BlockInputStreamPtr ExecutablePoolDictionarySource::getStreamForBlock(const Block & block)
{
std::unique_ptr<ShellCommand> process;
bool result = process_pool->tryBorrowObject(process, [this]()
{
bool terminate_in_destructor = true;
ShellCommandDestructorStrategy strategy { terminate_in_destructor, configuration.command_termination_timeout };
auto shell_command = ShellCommand::execute(configuration.command, false, strategy);
return shell_command;
}, configuration.max_command_execution_time * 10000);
if (!result)
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Could not get process from pool, max command execution timeout exceeded");
size_t rows_to_read = block.rows();
auto read_stream = context.getInputFormat(configuration.format, process->out, sample_block, rows_to_read);
auto stream = std::make_unique<PoolBlockInputStreamWithBackgroundThread>(
process_pool, std::move(process), std::move(read_stream), rows_to_read, log,
[block, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputStream(configuration.format, out, block.cloneEmpty());
formatBlock(output_stream, block);
});
if (configuration.implicit_key)
return std::make_shared<BlockInputStreamWithAdditionalColumns>(block, std::move(stream));
else
return std::shared_ptr<PoolBlockInputStreamWithBackgroundThread>(stream.release());
}
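The borrow/return cycle around ProcessPool is the interesting part here: a process is taken from the pool with a timeout, wrapped into the stream, and given back when the stream finishes (or dropped on error so a fresh one is spawned later). A minimal standalone sketch of such a pool, assuming a simple mutex/condition-variable implementation rather than the real BorrowedObjectPool (SimplePool and its methods are invented names), might look like this.
#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
/// Minimal stand-in for a pool of reusable worker processes: objects are
/// created lazily up to a fixed capacity, borrowed with a timeout and
/// returned when the caller is done with them.
template <typename T>
class SimplePool
{
public:
    explicit SimplePool(size_t capacity_) : capacity(capacity_) {}
    bool tryBorrow(T & out, const std::function<T()> & factory, size_t timeout_ms)
    {
        std::unique_lock lock(mutex);
        if (free_objects.empty() && created < capacity)
        {
            ++created;
            out = factory();            /// create a new object lazily
            return true;
        }
        bool got = condition.wait_for(lock, std::chrono::milliseconds(timeout_ms),
                                      [this] { return !free_objects.empty(); });
        if (!got)
            return false;               /// timeout: the caller decides what to do
        out = std::move(free_objects.front());
        free_objects.pop_front();
        return true;
    }
    void returnObject(T && object)
    {
        {
            std::lock_guard lock(mutex);
            free_objects.push_back(std::move(object));
        }
        condition.notify_one();
    }
private:
    std::mutex mutex;
    std::condition_variable condition;
    std::deque<T> free_objects;
    size_t capacity;
    size_t created = 0;
};
int main()
{
    SimplePool<std::unique_ptr<int>> pool(1);
    std::unique_ptr<int> proc;
    if (pool.tryBorrow(proc, [] { return std::make_unique<int>(42); }, 100))
    {
        std::cout << "borrowed process handle " << *proc << '\n';
        pool.returnObject(std::move(proc));
    }
}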
bool ExecutablePoolDictionarySource::isModified() const
{
return true;
}
bool ExecutablePoolDictionarySource::supportsSelectiveLoad() const
{
return true;
}
bool ExecutablePoolDictionarySource::hasUpdateField() const
{
return !configuration.update_field.empty();
}
DictionarySourcePtr ExecutablePoolDictionarySource::clone() const
{
return std::make_unique<ExecutablePoolDictionarySource>(*this);
}
std::string ExecutablePoolDictionarySource::toString() const
{
return "ExecutablePool size: " + std::to_string(configuration.pool_size) + " command: " + configuration.command;
}
void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory)
{
auto create_table_source = [=](const DictionaryStructure & dict_struct,
const Poco::Util::AbstractConfiguration & config,
const std::string & config_prefix,
Block & sample_block,
const Context & context,
const std::string & /* default_database */,
bool check_config) -> DictionarySourcePtr
{
if (dict_struct.has_expressions)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Dictionary source of type `executable_pool` does not support attribute expressions");
/// Executable dictionaries may execute arbitrary commands.
/// It's OK for dictionaries created by an administrator from an xml file, but
/// may be dangerous for dictionaries created from DDL queries.
if (check_config)
throw Exception(ErrorCodes::DICTIONARY_ACCESS_DENIED, "Dictionaries with executable pool dictionary source are not allowed to be created from DDL query");
Context context_local_copy = copyContextAndApplySettings(config_prefix, context, config);
/** Currently the parallel parsing input format cannot read exactly max_block_size rows from the input,
* so it would block on ReadBufferFromFileDescriptor, because this file descriptor represents a pipe that does not have an EOF.
*/
auto settings_no_parallel_parsing = context_local_copy.getSettings();
settings_no_parallel_parsing.input_format_parallel_parsing = false;
context_local_copy.setSettings(settings_no_parallel_parsing);
String configuration_config_prefix = config_prefix + ".executable_pool";
size_t max_command_execution_time = config.getUInt64(configuration_config_prefix + ".max_command_execution_time", 10);
size_t max_execution_time_seconds = static_cast<size_t>(context.getSettings().max_execution_time.totalSeconds());
if (max_command_execution_time > max_execution_time_seconds)
max_command_execution_time = max_execution_time_seconds;
ExecutablePoolDictionarySource::Configuration configuration
{
.command = config.getString(configuration_config_prefix + ".command"),
.format = config.getString(configuration_config_prefix + ".format"),
.pool_size = config.getUInt64(configuration_config_prefix + ".size"),
.update_field = config.getString(configuration_config_prefix + ".update_field", ""),
.implicit_key = config.getBool(configuration_config_prefix + ".implicit_key", false),
.command_termination_timeout = config.getUInt64(configuration_config_prefix + ".command_termination_timeout", 10),
.max_command_execution_time = max_command_execution_time
};
return std::make_unique<ExecutablePoolDictionarySource>(dict_struct, configuration, sample_block, context_local_copy);
};
factory.registerSource("executable_pool", create_table_source);
}
}
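Reading the executable_pool configuration mostly amounts to pulling keys with defaults and clamping max_command_execution_time by the query-level max_execution_time. A hedged sketch of that clamping, using a hypothetical flat std::map in place of Poco::Util::AbstractConfiguration (FlatConfig and getUInt64 are made up for the example):
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <map>
#include <string>
/// Hypothetical flat view of the <executable_pool> subtree; the real code
/// reads the same keys through Poco::Util::AbstractConfiguration.
using FlatConfig = std::map<std::string, std::string>;
static size_t getUInt64(const FlatConfig & config, const std::string & key, size_t default_value)
{
    auto it = config.find(key);
    return it == config.end() ? default_value : std::stoull(it->second);
}
int main()
{
    FlatConfig config = {{"size", "4"}, {"max_command_execution_time", "60"}};
    size_t max_execution_time_seconds = 30;   /// comes from query settings in the server
    size_t max_command_execution_time = getUInt64(config, "max_command_execution_time", 10);
    /// The command execution budget must not exceed the query-level limit.
    max_command_execution_time = std::min(max_command_execution_time, max_execution_time_seconds);
    std::cout << "pool size: " << getUInt64(config, "size", 16) << '\n'
              << "max_command_execution_time: " << max_command_execution_time << '\n';
}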

View File

@ -0,0 +1,84 @@
#pragma once
#include <Core/Block.h>
#include <Common/BorrowedObjectPool.h>
#include <Interpreters/Context.h>
#include "IDictionarySource.h"
#include "DictionaryStructure.h"
namespace Poco { class Logger; }
namespace DB
{
using ProcessPool = BorrowedObjectPool<std::unique_ptr<ShellCommand>>;
/** ExecutablePoolDictionarySource allows loading data from a pool of processes.
* When a client requests ids or keys, the source takes a process from the ProcessPool
* and creates a stream based on the source format from the process stdout.
* It is important that the stream format expects only the rows that were requested.
* When the stream is finished, the process is returned back to the ProcessPool.
* If there are no processes in the pool at the moment of the request, the client is blocked
* until some process is returned to the pool.
*/
class ExecutablePoolDictionarySource final : public IDictionarySource
{
public:
struct Configuration
{
const String command;
const String format;
const size_t pool_size;
const String update_field;
const bool implicit_key;
const size_t command_termination_timeout;
const size_t max_command_execution_time;
};
ExecutablePoolDictionarySource(
const DictionaryStructure & dict_struct_,
const Configuration & configuration_,
Block & sample_block_,
const Context & context_);
ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other);
ExecutablePoolDictionarySource & operator=(const ExecutablePoolDictionarySource &) = delete;
BlockInputStreamPtr loadAll() override;
/** The logic of this method is flawed, absolutely incorrect and ignorant.
* It may lead to skipping some values due to clock sync or timezone changes.
* The intended usage of "update_field" is totally different.
*/
BlockInputStreamPtr loadUpdatedAll() override;
BlockInputStreamPtr loadIds(const std::vector<UInt64> & ids) override;
BlockInputStreamPtr loadKeys(const Columns & key_columns, const std::vector<size_t> & requested_rows) override;
bool isModified() const override;
bool supportsSelectiveLoad() const override;
bool hasUpdateField() const override;
DictionarySourcePtr clone() const override;
std::string toString() const override;
BlockInputStreamPtr getStreamForBlock(const Block & block);
private:
Poco::Logger * log;
time_t update_time = 0;
const DictionaryStructure dict_struct;
const Configuration configuration;
Block sample_block;
Context context;
std::shared_ptr<ProcessPool> process_pool;
};
}

View File

@ -18,6 +18,7 @@ void registerDictionarySourceJDBC(DictionarySourceFactory & source_factory);
void registerDictionarySourcePostgreSQL(DictionarySourceFactory & source_factory);
#endif
void registerDictionarySourceExecutable(DictionarySourceFactory & source_factory);
void registerDictionarySourceExecutablePool(DictionarySourceFactory & source_factory);
void registerDictionarySourceHTTP(DictionarySourceFactory & source_factory);
void registerDictionarySourceLibrary(DictionarySourceFactory & source_factory);
@ -54,6 +55,7 @@ void registerDictionaries()
registerDictionarySourcePostgreSQL(source_factory);
#endif
registerDictionarySourceExecutable(source_factory);
registerDictionarySourceExecutablePool(source_factory);
registerDictionarySourceHTTP(source_factory);
registerDictionarySourceLibrary(source_factory);
}

View File

@ -42,6 +42,7 @@ SRCS(
Embedded/RegionsHierarchy.cpp
Embedded/RegionsNames.cpp
ExecutableDictionarySource.cpp
ExecutablePoolDictionarySource.cpp
ExternalQueryBuilder.cpp
FileDictionarySource.cpp
FlatDictionary.cpp

View File

@ -1100,14 +1100,14 @@ public:
return executeBitmapData<UInt32>(arguments, input_rows_count);
else if (which.isUInt64())
return executeBitmapData<UInt64>(arguments, input_rows_count);
else if (which.isUInt8())
return executeBitmapData<UInt8>(arguments, input_rows_count);
else if (which.isUInt16())
return executeBitmapData<UInt16>(arguments, input_rows_count);
else if (which.isUInt32())
return executeBitmapData<UInt32>(arguments, input_rows_count);
else if (which.isUInt64())
return executeBitmapData<UInt64>(arguments, input_rows_count);
else if (which.isInt8())
return executeBitmapData<Int8>(arguments, input_rows_count);
else if (which.isInt16())
return executeBitmapData<Int16>(arguments, input_rows_count);
else if (which.isInt32())
return executeBitmapData<Int32>(arguments, input_rows_count);
else if (which.isInt64())
return executeBitmapData<Int64>(arguments, input_rows_count);
else
throw Exception(
"Unexpected type " + from_type->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

View File

@ -358,9 +358,23 @@ public:
}
else if (dictionary_key_type == DictionaryKeyType::complex)
{
if (!isTuple(key_col_with_type.type))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument of function ({}) must be tuple when dictionary is complex. Actual type ({}).",
getName(),
key_col_with_type.type->getName());
/// Functions in external dictionaries_loader only support full-value (not constant) columns with keys.
ColumnPtr key_column_full = key_col_with_type.column->convertToFullColumnIfConst();
if (!isTuple(key_col_with_type.type))
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Third argument of function ({}) must be tuple when dictionary is complex. Actual type ({}).",
getName(),
key_col_with_type.type->getName());
const auto & key_columns = typeid_cast<const ColumnTuple &>(*key_column_full).getColumnsCopy();
const auto & key_types = static_cast<const DataTypeTuple &>(*key_col_with_type.type).getElements();

View File

@ -15,6 +15,8 @@
# include <Common/config.h>
#endif
#include <boost/algorithm/string/case_conv.hpp>
namespace DB
{
@ -55,23 +57,24 @@ CompressionMethod chooseCompressionMethod(const std::string & path, const std::s
file_extension = path.substr(pos + 1, std::string::npos);
}
const std::string * method_str = file_extension.empty() ? &hint : &file_extension;
std::string method_str = file_extension.empty() ? hint : std::move(file_extension);
boost::algorithm::to_lower(method_str);
if (*method_str == "gzip" || *method_str == "gz")
if (method_str == "gzip" || method_str == "gz")
return CompressionMethod::Gzip;
if (*method_str == "deflate")
if (method_str == "deflate")
return CompressionMethod::Zlib;
if (*method_str == "brotli" || *method_str == "br")
if (method_str == "brotli" || method_str == "br")
return CompressionMethod::Brotli;
if (*method_str == "LZMA" || *method_str == "xz")
if (method_str == "lzma" || method_str == "xz")
return CompressionMethod::Xz;
if (*method_str == "zstd" || *method_str == "zst")
if (method_str == "zstd" || method_str == "zst")
return CompressionMethod::Zstd;
if (hint.empty() || hint == "auto" || hint == "none")
return CompressionMethod::None;
throw Exception(
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'br', 'xz', 'zstd' are supported as compression methods",
"Unknown compression method " + hint + ". Only 'auto', 'none', 'gzip', 'deflate', 'br', 'xz', 'zstd' are supported as compression methods",
ErrorCodes::NOT_IMPLEMENTED);
}
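The fix above is that the method string is now lower-cased before comparison (previously the 'LZMA' literal could never match a lower-case extension). A standalone sketch of the same extension-to-method mapping, using std::transform instead of boost and a reduced enum (chooseMethod and Method are invented names for the example):
#include <algorithm>
#include <cctype>
#include <iostream>
#include <stdexcept>
#include <string>
enum class Method { None, Gzip, Zlib, Brotli, Xz, Zstd };
/// Map a file extension (or an explicit hint) to a compression method,
/// lower-casing it first so that "XZ", "xz" and "Xz" behave the same way.
static Method chooseMethod(std::string extension, const std::string & hint)
{
    std::string method_str = extension.empty() ? hint : std::move(extension);
    std::transform(method_str.begin(), method_str.end(), method_str.begin(),
                   [](unsigned char c) { return std::tolower(c); });
    if (method_str == "gzip" || method_str == "gz")       return Method::Gzip;
    if (method_str == "deflate")                          return Method::Zlib;
    if (method_str == "brotli" || method_str == "br")     return Method::Brotli;
    if (method_str == "lzma" || method_str == "xz")       return Method::Xz;
    if (method_str == "zstd" || method_str == "zst")      return Method::Zstd;
    if (hint.empty() || hint == "auto" || hint == "none") return Method::None;
    throw std::invalid_argument("Unknown compression method " + hint);
}
int main()
{
    std::cout << (chooseMethod("XZ", "auto") == Method::Xz) << '\n';     /// 1: the extension wins and is lower-cased
    std::cout << (chooseMethod("", "none") == Method::None) << '\n';     /// 1: explicit "none"
}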

View File

@ -688,7 +688,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
const ColumnsWithTypeAndName & source,
const ColumnsWithTypeAndName & result,
MatchColumnsMode mode,
bool ignore_constant_values)
bool ignore_constant_values,
bool add_casted_columns,
NameToNameMap * new_names)
{
size_t num_input_columns = source.size();
size_t num_result_columns = result.size();
@ -696,6 +698,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
if (mode == MatchColumnsMode::Position && num_input_columns != num_result_columns)
throw Exception("Number of columns doesn't match", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH);
if (add_casted_columns && mode != MatchColumnsMode::Name)
throw Exception("Converting with add_casted_columns supported only for MatchColumnsMode::Name", ErrorCodes::LOGICAL_ERROR);
auto actions_dag = std::make_shared<ActionsDAG>(source);
std::vector<Node *> projection(num_result_columns);
@ -715,12 +720,13 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
{
const auto & res_elem = result[result_col_num];
Node * src_node = nullptr;
Node * dst_node = nullptr;
switch (mode)
{
case MatchColumnsMode::Position:
{
src_node = actions_dag->inputs[result_col_num];
src_node = dst_node = actions_dag->inputs[result_col_num];
break;
}
@ -731,7 +737,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream",
ErrorCodes::THERE_IS_NO_COLUMN);
src_node = actions_dag->inputs[input.front()];
src_node = dst_node = actions_dag->inputs[input.front()];
input.pop_front();
break;
}
@ -740,10 +746,10 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
/// Check constants.
if (const auto * res_const = typeid_cast<const ColumnConst *>(res_elem.column.get()))
{
if (const auto * src_const = typeid_cast<const ColumnConst *>(src_node->column.get()))
if (const auto * src_const = typeid_cast<const ColumnConst *>(dst_node->column.get()))
{
if (ignore_constant_values)
src_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
dst_node = const_cast<Node *>(&actions_dag->addColumn(res_elem, true));
else if (res_const->getField() != src_const->getField())
throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because "
"it is constant but values of constants are different in source and result",
@ -756,7 +762,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
}
/// Add CAST function to convert into result type if needed.
if (!res_elem.type->equals(*src_node->result_type))
if (!res_elem.type->equals(*dst_node->result_type))
{
ColumnWithTypeAndName column;
column.name = res_elem.type->getName();
@ -764,27 +770,49 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions(
column.type = std::make_shared<DataTypeString>();
auto * right_arg = const_cast<Node *>(&actions_dag->addColumn(std::move(column), true));
auto * left_arg = src_node;
auto * left_arg = dst_node;
FunctionCast::Diagnostic diagnostic = {src_node->result_name, res_elem.name};
FunctionCast::Diagnostic diagnostic = {dst_node->result_name, res_elem.name};
FunctionOverloadResolverPtr func_builder_cast =
std::make_shared<FunctionOverloadResolverAdaptor>(
CastOverloadResolver<CastType::nonAccurate>::createImpl(false, std::move(diagnostic)));
Inputs children = { left_arg, right_arg };
src_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true);
dst_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true);
}
if (src_node->column && isColumnConst(*src_node->column) && !(res_elem.column && isColumnConst(*res_elem.column)))
if (dst_node->column && isColumnConst(*dst_node->column) && !(res_elem.column && isColumnConst(*res_elem.column)))
{
Inputs children = {src_node};
src_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true);
Inputs children = {dst_node};
dst_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true);
}
if (src_node->result_name != res_elem.name)
src_node = &actions_dag->addAlias(*src_node, res_elem.name, true);
if (dst_node->result_name != res_elem.name)
{
if (add_casted_columns)
{
if (inputs.contains(dst_node->result_name))
throw Exception("Cannot convert column " + backQuote(res_elem.name) +
" to "+ backQuote(dst_node->result_name) +
" because other column have same name",
ErrorCodes::ILLEGAL_COLUMN);
if (new_names)
new_names->emplace(res_elem.name, dst_node->result_name);
projection[result_col_num] = src_node;
/// Leave current column on same place, add converted to back
projection[result_col_num] = src_node;
projection.push_back(dst_node);
}
else
{
dst_node = &actions_dag->addAlias(*dst_node, res_elem.name, true);
projection[result_col_num] = dst_node;
}
}
else
{
projection[result_col_num] = dst_node;
}
}
actions_dag->removeUnusedActions(projection);
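The add_casted_columns branch keeps the original column in its position and appends the converted copy at the back, recording the generated name in new_names so callers can find it. A simplified standalone sketch of that idea, matching columns by position instead of by name and using plain structs instead of ActionsDAG nodes (all names here are invented for the example):
#include <iostream>
#include <map>
#include <string>
#include <vector>
struct Column { std::string name; std::string type; };
/// Build the output column list for one source/result pair.
/// With add_casted_columns == false the source column is replaced by the cast
/// (and renamed to the result name); with true the original stays in place and
/// the converted copy is appended under a generated name, recorded in new_names.
static std::vector<Column> convert(const std::vector<Column> & source,
                                   const std::vector<Column> & result,
                                   bool add_casted_columns,
                                   std::map<std::string, std::string> & new_names)
{
    std::vector<Column> projection;
    std::vector<Column> appended;
    for (size_t i = 0; i < source.size(); ++i)
    {
        Column dst = source[i];
        if (dst.type != result[i].type)
        {
            dst.type = result[i].type;
            dst.name = "CAST(" + source[i].name + ", '" + result[i].type + "')";
        }
        if (dst.name != result[i].name && add_casted_columns)
        {
            new_names.emplace(result[i].name, dst.name);
            projection.push_back(source[i]);   /// keep the original column in place
            appended.push_back(dst);           /// converted copy goes to the back
        }
        else
        {
            dst.name = result[i].name;         /// plain rename / replacement
            projection.push_back(dst);
        }
    }
    projection.insert(projection.end(), appended.begin(), appended.end());
    return projection;
}
int main()
{
    std::map<std::string, std::string> new_names;
    auto out = convert({{"k", "UInt32"}}, {{"k", "UInt64"}}, /*add_casted_columns=*/ true, new_names);
    for (const auto & column : out)
        std::cout << column.name << " : " << column.type << '\n';
    std::cout << "k is available as " << new_names["k"] << '\n';
}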

View File

@ -253,11 +253,15 @@ public:
/// Create ActionsDAG which converts block structure from source to result.
/// It is needed to convert result from different sources to the same structure, e.g. for UNION query.
/// Conversion should be possible with only usage of CAST function and renames.
/// @param ignore_constant_values - Do not check that constants are the same. Use the value from result_header.
/// @param add_casted_columns - Create new columns with converted values instead of replacing the originals.
static ActionsDAGPtr makeConvertingActions(
const ColumnsWithTypeAndName & source,
const ColumnsWithTypeAndName & result,
MatchColumnsMode mode,
bool ignore_constant_values = false); /// Do not check that constants are same. Use value from result_header.
bool ignore_constant_values = false,
bool add_casted_columns = false,
NameToNameMap * new_names = nullptr);
/// Create expression which add const column and then materialize it.
static ActionsDAGPtr makeAddingColumnActions(ColumnWithTypeAndName column);

View File

@ -674,7 +674,7 @@ void ActionsMatcher::visit(const ASTIdentifier & identifier, const ASTPtr & ast,
if (column_name_type.name == column_name)
{
throw Exception("Column " + backQuote(column_name) + " is not under aggregate function and not in GROUP BY",
ErrorCodes::NOT_AN_AGGREGATE);
ErrorCodes::NOT_AN_AGGREGATE);
}
}

View File

@ -320,6 +320,8 @@ std::unique_ptr<Context> DatabaseReplicatedTask::makeQueryContext(Context & from
String DDLTaskBase::getLogEntryName(UInt32 log_entry_number)
{
/// Sequential counter in ZooKeeper is Int32.
assert(log_entry_number < std::numeric_limits<Int32>::max());
constexpr size_t seq_node_digits = 10;
String number = toString(log_entry_number);
String name = "query-" + String(seq_node_digits - number.size(), '0') + number;
@ -330,7 +332,9 @@ UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name)
{
constexpr const char * name = "query-";
assert(startsWith(log_entry_name, name));
return parse<UInt32>(log_entry_name.substr(strlen(name)));
UInt32 num = parse<UInt32>(log_entry_name.substr(strlen(name)));
assert(num < std::numeric_limits<Int32>::max());
return num;
}
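These two helpers rely on ZooKeeper sequential node names being zero-padded to 10 digits, so entry names can be compared as plain strings while still sorting numerically. A standalone re-implementation of the same formatting and parsing, assuming std::to_string/std::stoul in place of the ClickHouse helpers:
#include <cassert>
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
/// ZooKeeper sequential nodes append a 10-digit, zero-padded Int32 counter.
static std::string getLogEntryName(uint32_t log_entry_number)
{
    assert(log_entry_number < static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    constexpr size_t seq_node_digits = 10;
    std::string number = std::to_string(log_entry_number);
    return "query-" + std::string(seq_node_digits - number.size(), '0') + number;
}
static uint32_t getLogEntryNumber(const std::string & log_entry_name)
{
    const std::string prefix = "query-";
    assert(log_entry_name.rfind(prefix, 0) == 0);
    uint32_t num = static_cast<uint32_t>(std::stoul(log_entry_name.substr(prefix.size())));
    assert(num < static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
    return num;
}
int main()
{
    std::string name = getLogEntryName(42);
    std::cout << name << '\n';                                          /// query-0000000042
    std::cout << getLogEntryNumber(name) << '\n';                       /// 42
    std::cout << (getLogEntryName(9) < getLogEntryName(10)) << '\n';    /// 1: string order == numeric order
}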
void ZooKeeperMetadataTransaction::commit()

View File

@ -48,6 +48,7 @@ namespace ErrorCodes
extern const int MEMORY_LIMIT_EXCEEDED;
}
constexpr const char * TASK_PROCESSED_OUT_REASON = "Task has been already processed";
namespace
{
@ -290,7 +291,7 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r
if (zookeeper->exists(task->getFinishedNodePath()))
{
out_reason = "Task has been already processed";
out_reason = TASK_PROCESSED_OUT_REASON;
return {};
}
@ -311,51 +312,120 @@ void DDLWorker::scheduleTasks(bool reinitialized)
auto zookeeper = tryGetZooKeeper();
/// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper.
/// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status.
/// We have some unfinished tasks.
/// To avoid duplication of some queries, we should try to write the execution status again.
/// To avoid skipping entries that were not executed, we should be careful when choosing the begin_node to start from.
/// NOTE: It does not protect from all cases of query duplication, see also comments in processTask(...)
if (reinitialized)
{
for (auto & task : current_tasks)
if (current_tasks.empty())
LOG_TRACE(log, "Don't have unfinished tasks after restarting");
else
LOG_INFO(log, "Have {} unfinished tasks, will check them", current_tasks.size());
assert(current_tasks.size() <= pool_size + (worker_pool != nullptr));
auto task_it = current_tasks.begin();
while (task_it != current_tasks.end())
{
if (task->was_executed)
auto & task = *task_it;
if (task->completely_processed)
{
bool task_still_exists = zookeeper->exists(task->entry_path);
assert(task->was_executed);
/// Status must be written (but finished/ node may not exist if entry was deleted).
/// If someone is deleting entry concurrently, then /active status dir must not exist.
assert(zookeeper->exists(task->getFinishedNodePath()) || !zookeeper->exists(fs::path(task->entry_path) / "active"));
++task_it;
}
else if (task->was_executed)
{
/// Connection was lost on attempt to write status. Will retry.
bool status_written = zookeeper->exists(task->getFinishedNodePath());
if (!status_written && task_still_exists)
{
/// You might think that the following condition is redundant, because status_written implies completely_processed.
/// But it's wrong. It's possible that (!task->completely_processed && status_written)
/// if ZooKeeper successfully received and processed our request
/// but we lost connection while waiting for the response.
/// Yeah, distributed systems is a zoo.
if (status_written)
task->completely_processed = true;
else
processTask(*task, zookeeper);
}
++task_it;
}
else
{
/// We didn't even execute the query, so let's just remove it.
/// We will try to read the task again and execute it from the beginning.
/// NOTE: We can safely compare entry names as Strings, because they are padded.
/// Entry names always start with "query-" and contain exactly 10 decimal digits
/// of log entry number (with leading zeros).
if (!first_failed_task_name || task->entry_name < *first_failed_task_name)
first_failed_task_name = task->entry_name;
task_it = current_tasks.erase(task_it);
}
}
}
Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event);
filterAndSortQueueNodes(queue_nodes);
if (queue_nodes.empty())
{
LOG_TRACE(log, "No tasks to schedule");
return;
}
else if (max_tasks_in_queue < queue_nodes.size())
if (max_tasks_in_queue < queue_nodes.size())
cleanup_event->set();
/// Detect queue start, using:
/// - skipped tasks
/// - in memory tasks (that are currently active)
/// - in memory tasks (that are currently active or were finished recently)
/// - failed tasks (that should be processed again)
auto begin_node = queue_nodes.begin();
UInt64 last_task_id = 0;
if (!current_tasks.empty())
if (first_failed_task_name)
{
auto & last_task = current_tasks.back();
last_task_id = DDLTaskBase::getLogEntryNumber(last_task->entry_name);
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task->entry_name);
/// If we had failed tasks, then we should start from the first failed task.
assert(reinitialized);
begin_node = std::lower_bound(queue_nodes.begin(), queue_nodes.end(), first_failed_task_name);
}
if (last_skipped_entry_name)
else
{
UInt64 last_skipped_entry_id = DDLTaskBase::getLogEntryNumber(*last_skipped_entry_name);
if (last_skipped_entry_id > last_task_id)
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_skipped_entry_name);
/// We had no failed tasks. Let's just choose the maximum entry we have previously seen.
String last_task_name;
if (!current_tasks.empty())
last_task_name = current_tasks.back()->entry_name;
if (last_skipped_entry_name && last_task_name < *last_skipped_entry_name)
last_task_name = *last_skipped_entry_name;
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task_name);
}
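The begin_node selection above is just a binary search over the sorted, padded entry names: lower_bound when the first failed task has to be re-processed, upper_bound over the maximum already-seen entry otherwise. A toy standalone version with hard-coded entry names (the data is invented for illustration):
#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <vector>
int main()
{
    /// Sorted, zero-padded queue entries as returned by ZooKeeper (toy data).
    std::vector<std::string> queue_nodes = {
        "query-0000000001", "query-0000000002", "query-0000000003", "query-0000000004"};
    std::optional<std::string> first_failed_task_name = "query-0000000002";
    std::optional<std::string> last_skipped_entry_name;
    std::string last_task_name = "query-0000000003";
    auto begin_node = queue_nodes.begin();
    if (first_failed_task_name)
    {
        /// Re-process the failed entry itself, so take lower_bound (>=).
        begin_node = std::lower_bound(queue_nodes.begin(), queue_nodes.end(), *first_failed_task_name);
    }
    else
    {
        /// Everything up to the maximum seen entry is done or skipped, so take upper_bound (>).
        if (last_skipped_entry_name && last_task_name < *last_skipped_entry_name)
            last_task_name = *last_skipped_entry_name;
        begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task_name);
    }
    for (auto it = begin_node; it != queue_nodes.end(); ++it)
        std::cout << "will schedule " << *it << '\n';   /// query-0000000002 .. query-0000000004
}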
if (begin_node == queue_nodes.end())
LOG_DEBUG(log, "No tasks to schedule");
else
LOG_DEBUG(log, "Will schedule {} tasks starting from {}", std::distance(begin_node, queue_nodes.end()), *begin_node);
/// Let's ensure that it's exactly the first task we should process.
/// Maybe such asserts are too paranoid and excessive,
/// but it's easy enough to break DDLWorker in a very unobvious way by making some minor change in code.
[[maybe_unused]] bool have_no_tasks_info = !first_failed_task_name && current_tasks.empty() && !last_skipped_entry_name;
assert(have_no_tasks_info || queue_nodes.end() == std::find_if(queue_nodes.begin(), queue_nodes.end(), [&](const String & entry_name)
{
/// We should return true if some invariants are violated.
String reason;
auto task = initAndCheckTask(entry_name, reason, zookeeper);
bool maybe_currently_processing = current_tasks.end() != std::find_if(current_tasks.begin(), current_tasks.end(), [&](const auto & t)
{
return t->entry_name == entry_name;
});
/// begin_node is something like a log pointer
if (begin_node == queue_nodes.end() || entry_name < *begin_node)
{
/// Return true if entry should be scheduled.
/// There is a minor race condition: initAndCheckTask(...) may return non-null
/// if someone is deleting an outdated entry right now (including finished/ nodes), so we also check the active/ status dir.
bool maybe_concurrently_deleting = task && !zookeeper->exists(fs::path(task->entry_path) / "active");
return task && !maybe_concurrently_deleting && !maybe_currently_processing;
}
else
{
/// Return true if entry should not be scheduled.
bool processed = !task && reason == TASK_PROCESSED_OUT_REASON;
return processed || maybe_currently_processing;
}
}));
for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it)
{
String entry_name = *it;
@ -391,8 +461,18 @@ void DDLWorker::scheduleTasks(bool reinitialized)
DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task)
{
current_tasks.remove_if([](const DDLTaskPtr & t) { return t->completely_processed.load(); });
assert(current_tasks.size() <= pool_size);
/// Tasks are scheduled and executed in main thread <==> Parallel execution is disabled
assert((worker_pool != nullptr) == (1 < pool_size));
/// Parallel execution is disabled ==> All previous tasks are failed to start or finished,
/// so current tasks list must be empty when we are ready to process new one.
assert(worker_pool || current_tasks.empty());
/// Parallel execution is enabled ==> Not more than pool_size tasks are currently executing.
/// Note: If current_tasks.size() == pool_size, then all worker threads are busy,
/// so we will wait on worker_pool->scheduleOrThrowOnError(...)
assert(!worker_pool || current_tasks.size() <= pool_size);
current_tasks.emplace_back(std::move(task));
if (first_failed_task_name && *first_failed_task_name == current_tasks.back()->entry_name)
first_failed_task_name.reset();
return *current_tasks.back();
}
@ -479,10 +559,15 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name)
void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
{
LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query);
assert(!task.completely_processed);
String active_node_path = task.getActiveNodePath();
String finished_node_path = task.getFinishedNodePath();
/// Step 1: Create ephemeral node in active/ status dir.
/// It allows other hosts to understand that the task is currently being executed (useful for system.distributed_ddl_queue)
/// and protects the task from concurrent deletion.
/// It will tryRemove(...) on exception
auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper);
@ -498,7 +583,21 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
/// Status dirs were not created in enqueueQuery(...) or someone is removing entry
if (create_active_res == Coordination::Error::ZNONODE)
{
assert(dynamic_cast<DatabaseReplicatedTask *>(&task) == nullptr);
if (task.was_executed)
{
/// Special case:
/// Task was executed (and we are trying to write status after connection loss) ==> Status dirs were previously created.
/// (Status dirs were previously created AND active/ does not exist) ==> Task was removed.
/// We cannot write status, but it's not required anymore, because no one will try to execute it again.
/// So we consider task as completely processed.
LOG_WARNING(log, "Task {} is executed, but looks like entry {} was deleted, cannot write status", task.entry_name, task.entry_path);
task.completely_processed = true;
return;
}
createStatusDirs(task.entry_path, zookeeper);
}
if (create_active_res == Coordination::Error::ZNODEEXISTS)
{
@ -508,7 +607,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
String dummy;
if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared))
{
constexpr int timeout_ms = 5000;
constexpr int timeout_ms = 30 * 1000;
if (!eph_node_disappeared->tryWait(timeout_ms))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists, "
"probably it's owned by someone else", active_node_path);
@ -518,6 +617,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral);
}
/// Step 2: Execute query from the task.
if (!task.was_executed)
{
/// If table and database engine supports it, they will execute task.ops by their own in a single transaction
@ -588,6 +688,7 @@ void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper)
updateMaxDDLEntryID(task.entry_name);
/// Step 3: Create node in finished/ status dir and write execution status.
/// FIXME: if server fails right here, the task will be executed twice. We need WAL here.
/// NOTE: If ZooKeeper connection is lost here, we will try again to write query status.
/// NOTE: If both table and database are replicated, task is executed in single ZK transaction.
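Step 1 hinges on RAII: the active/ node is created when processing starts and removed on every exit path, including exceptions, so other hosts can tell whether a task is still being executed. The sketch below is not zkutil::EphemeralNodeHolder (which talks to ZooKeeper); it only mirrors the lifetime with an in-memory set of paths, and all names in it are invented.
#include <iostream>
#include <set>
#include <stdexcept>
#include <string>
/// Stand-in for the coordination service: just a set of existing paths.
std::set<std::string> zk_nodes;
/// RAII holder mirroring the idea of the ephemeral active/ node: it is created
/// when task processing starts and removed on every exit path, so other hosts
/// can tell whether the task is currently being executed.
class ActiveNodeHolder
{
public:
    explicit ActiveNodeHolder(std::string path_) : path(std::move(path_)) { zk_nodes.insert(path); }
    ~ActiveNodeHolder() { zk_nodes.erase(path); }   /// best-effort cleanup, even on exceptions
private:
    std::string path;
};
static void processTask(bool fail)
{
    ActiveNodeHolder active("/ddl/query-0000000001/active/host1");
    std::cout << "active nodes while processing: " << zk_nodes.size() << '\n';   /// 1
    if (fail)
        throw std::runtime_error("query failed");
    zk_nodes.insert("/ddl/query-0000000001/finished/host1");   /// step 3: write status
}
int main()
{
    try { processTask(/*fail=*/ true); } catch (const std::exception & e) { std::cout << e.what() << '\n'; }
    std::cout << "active nodes after failure: " << zk_nodes.size() << '\n';   /// 0: holder cleaned up
}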

View File

@ -123,6 +123,7 @@ protected:
/// Save state of executed task to avoid duplicate execution on ZK error
std::optional<String> last_skipped_entry_name;
std::optional<String> first_failed_task_name;
std::list<DDLTaskPtr> current_tasks;
std::shared_ptr<Poco::Event> queue_updated_event = std::make_shared<Poco::Event>();

View File

@ -726,7 +726,7 @@ ExpressionActionsChain::JoinStep::JoinStep(
for (const auto & column : result_columns)
required_columns.emplace_back(column.name, column.type);
analyzed_join->addJoinedColumnsAndCorrectNullability(result_columns);
analyzed_join->addJoinedColumnsAndCorrectTypes(result_columns);
}
void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_)
@ -747,8 +747,8 @@ void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_)
}
/// Result will also contain joined columns.
for (const auto & column : analyzed_join->columnsAddedByJoin())
required_names.emplace(column.name);
for (const auto & column_name : analyzed_join->columnsAddedByJoin())
required_names.emplace(column_name);
for (const auto & column : result_columns)
{

View File

@ -42,14 +42,13 @@
#include <DataTypes/DataTypeFactory.h>
#include <Parsers/parseQuery.h>
#include <Interpreters/interpretSubquery.h>
#include <Interpreters/DatabaseAndTableWithAlias.h>
#include <Interpreters/misc.h>
#include <Interpreters/ActionsVisitor.h>
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/GetAggregatesVisitor.h>
#include <Interpreters/GlobalSubqueriesVisitor.h>
#include <Interpreters/interpretSubquery.h>
#include <Interpreters/join_common.h>
#include <Interpreters/misc.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>
@ -208,13 +207,12 @@ void ExpressionAnalyzer::analyzeAggregation()
{
getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, temp_actions, false);
auto sample_columns = temp_actions->getResultColumns();
analyzedJoin().addJoinedColumnsAndCorrectNullability(sample_columns);
analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns);
temp_actions = std::make_shared<ActionsDAG>(sample_columns);
}
columns_after_join = columns_after_array_join;
const auto & added_by_join = analyzedJoin().columnsAddedByJoin();
columns_after_join.insert(columns_after_join.end(), added_by_join.begin(), added_by_join.end());
analyzedJoin().addJoinedColumnsAndCorrectTypes(columns_after_join, false);
}
has_aggregation = makeAggregateDescriptions(temp_actions);
@ -727,13 +725,17 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain &
JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain)
{
JoinPtr table_join = makeTableJoin(*syntax->ast_join);
const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns();
JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns);
if (syntax->analyzed_join->needConvert())
{
chain.steps.push_back(std::make_unique<ExpressionActionsChain::ExpressionActionsStep>(syntax->analyzed_join->leftConvertingActions()));
chain.addStep();
}
ExpressionActionsChain::Step & step = chain.lastStep(columns_after_array_join);
chain.steps.push_back(std::make_unique<ExpressionActionsChain::JoinStep>(
syntax->analyzed_join, table_join, step.getResultColumns()));
chain.steps.push_back(std::make_unique<ExpressionActionsChain::JoinStep>(syntax->analyzed_join, table_join, step.getResultColumns()));
chain.addStep();
return table_join;
}
@ -798,7 +800,8 @@ static std::shared_ptr<IJoin> makeJoin(std::shared_ptr<TableJoin> analyzed_join,
return std::make_shared<JoinSwitcher>(analyzed_join, sample_block);
}
JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element)
JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(
const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns)
{
/// Two JOINs are not supported with the same subquery, but different USINGs.
auto join_hash = join_element.getTreeHash();
@ -834,7 +837,13 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer
}
/// TODO You do not need to set this up when JOIN is only needed on remote servers.
subquery_for_join.setJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside
subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside
const ColumnsWithTypeAndName & right_sample_columns = subquery_for_join.sample_block.getColumnsWithTypeAndName();
bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns);
if (need_convert)
subquery_for_join.addJoinActions(std::make_shared<ExpressionActions>(syntax->analyzed_join->rightConvertingActions()));
subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context);
/// Do not make subquery for join over dictionary.
@ -1434,9 +1443,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult(
if (query_analyzer.hasTableJoin())
{
query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage);
before_join = chain.getLastActions();
join = query_analyzer.appendJoin(chain);
converting_join_columns = query_analyzer.analyzedJoin().leftConvertingActions();
chain.addStep();
}

View File

@ -1,15 +1,16 @@
#pragma once
#include <DataStreams/IBlockStream_fwd.h>
#include <Columns/FilterDescription.h>
#include <DataStreams/IBlockStream_fwd.h>
#include <Interpreters/AggregateDescription.h>
#include <Interpreters/WindowDescription.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/DatabaseCatalog.h>
#include <Interpreters/SubqueryForSet.h>
#include <Interpreters/TreeRewriter.h>
#include <Interpreters/WindowDescription.h>
#include <Interpreters/join_common.h>
#include <Parsers/IAST_fwd.h>
#include <Storages/IStorage_fwd.h>
#include <Storages/SelectQueryInfo.h>
#include <Interpreters/DatabaseCatalog.h>
namespace DB
{
@ -199,6 +200,7 @@ struct ExpressionAnalysisResult
ActionsDAGPtr before_array_join;
ArrayJoinActionPtr array_join;
ActionsDAGPtr before_join;
ActionsDAGPtr converting_join_columns;
JoinPtr join;
ActionsDAGPtr before_where;
ActionsDAGPtr before_aggregation;
@ -313,7 +315,9 @@ private:
/// Create Set-s that we make from IN section to use index on them.
void makeSetsForIndex(const ASTPtr & node);
JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element);
JoinPtr makeTableJoin(
const ASTTablesInSelectQueryElement & join_element,
const ColumnsWithTypeAndName & left_sample_columns);
const ASTSelectQuery * getAggregatingQuery() const;

View File

@ -208,7 +208,7 @@ BlockIO InterpreterInsertQuery::execute()
auto storage_dst = std::dynamic_pointer_cast<StorageDistributed>(table);
if (storage_src && storage_dst && storage_src->cluster_name == storage_dst->cluster_name)
if (storage_src && storage_dst && storage_src->getClusterName() == storage_dst->getClusterName())
{
is_distributed_insert_select = true;

View File

@ -502,7 +502,10 @@ InterpreterSelectQuery::InterpreterSelectQuery(
if (need_analyze_again)
{
subquery_for_sets = std::move(query_analyzer->getSubqueriesForSets());
LOG_TRACE(log, "Running 'analyze' second time");
query_analyzer->getSubqueriesForSets().clear();
subquery_for_sets = SubqueriesForSets();
/// Do not try move conditions to PREWHERE for the second time.
/// Otherwise, we won't be able to fallback from inefficient PREWHERE to WHERE later.
analyze(/* try_move_to_prewhere = */ false);
@ -1059,14 +1062,22 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
query_plan.addStep(std::move(before_join_step));
}
/// Optional step to convert key columns to common supertype.
/// Columns with changed types will be returned to the user,
/// so it's only suitable for `USING` joins.
if (expressions.converting_join_columns)
{
QueryPlanStepPtr convert_join_step = std::make_unique<ExpressionStep>(
query_plan.getCurrentDataStream(),
expressions.converting_join_columns);
convert_join_step->setStepDescription("Convert JOIN columns");
query_plan.addStep(std::move(convert_join_step));
}
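The purpose of the converting step is that both join sides hash keys of the same supertype, e.g. a UInt32 key on the left against a UInt64 key on the right. A minimal standalone sketch of that cast-then-join idea on plain vectors (the data and types are arbitrary):
#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>
int main()
{
    /// Left table key is UInt32, right table key is UInt64; before hashing
    /// both sides for the join, the narrower key is cast to the common
    /// supertype so that equal values actually compare equal.
    std::vector<uint32_t> left_keys = {1, 2, 3};
    std::vector<uint64_t> right_keys = {2, 3, 4};
    std::unordered_map<uint64_t, size_t> right_index;
    for (size_t i = 0; i < right_keys.size(); ++i)
        right_index.emplace(right_keys[i], i);
    for (uint32_t key : left_keys)
    {
        uint64_t common_key = static_cast<uint64_t>(key);   /// the "CAST to supertype" step
        auto it = right_index.find(common_key);
        if (it != right_index.end())
            std::cout << "joined on key " << common_key << '\n';
    }
}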
if (expressions.hasJoin())
{
Block join_result_sample;
JoinPtr join = expressions.join;
join_result_sample = JoiningTransform::transformHeader(
query_plan.getCurrentDataStream().header, expressions.join);
QueryPlanStepPtr join_step = std::make_unique<JoinStep>(
query_plan.getCurrentDataStream(),
expressions.join);
@ -1076,6 +1087,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
if (expressions.join_has_delayed_stream)
{
const Block & join_result_sample = query_plan.getCurrentDataStream().header;
auto stream = std::make_shared<LazyNonJoinedBlockInputStream>(*join, join_result_sample, settings.max_block_size);
auto source = std::make_shared<SourceFromInputStream>(std::move(stream));
auto add_non_joined_rows_step = std::make_unique<AddingDelayedSourceStep>(

View File

@ -273,10 +273,9 @@ MutationsInterpreter::MutationsInterpreter(
, commands(std::move(commands_))
, context(context_)
, can_execute(can_execute_)
, select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits())
{
mutation_ast = prepare(!can_execute);
SelectQueryOptions limits = SelectQueryOptions().analyze(!can_execute).ignoreLimits();
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot_, limits);
}
static NameSet getKeyColumns(const StoragePtr & storage, const StorageMetadataPtr & metadata_snapshot)
@ -767,6 +766,9 @@ QueryPipelinePtr MutationsInterpreter::addStreamsForLaterStages(const std::vecto
void MutationsInterpreter::validate()
{
if (!select_interpreter)
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot, select_limits);
const Settings & settings = context.getSettingsRef();
/// For Replicated* storages mutations cannot employ non-deterministic functions
@ -794,6 +796,9 @@ BlockInputStreamPtr MutationsInterpreter::execute()
if (!can_execute)
throw Exception("Cannot execute mutations interpreter because can_execute flag set to false", ErrorCodes::LOGICAL_ERROR);
if (!select_interpreter)
select_interpreter = std::make_unique<InterpreterSelectQuery>(mutation_ast, context, storage, metadata_snapshot, select_limits);
QueryPlan plan;
select_interpreter->buildQueryPlan(plan);

View File

@ -76,6 +76,7 @@ private:
MutationCommands commands;
Context context;
bool can_execute;
SelectQueryOptions select_limits;
ASTPtr mutation_ast;

View File

@ -39,10 +39,20 @@ void SubqueryForSet::renameColumns(Block & block)
}
}
void SubqueryForSet::setJoinActions(ExpressionActionsPtr actions)
void SubqueryForSet::addJoinActions(ExpressionActionsPtr actions)
{
actions->execute(sample_block);
joined_block_actions = actions;
if (joined_block_actions == nullptr)
{
joined_block_actions = actions;
}
else
{
auto new_dag = ActionsDAG::merge(
std::move(*joined_block_actions->getActionsDAG().clone()),
std::move(*actions->getActionsDAG().clone()));
joined_block_actions = std::make_shared<ExpressionActions>(new_dag);
}
}
bool SubqueryForSet::insertJoinedBlock(Block & block)

View File

@ -40,7 +40,7 @@ struct SubqueryForSet
void makeSource(std::shared_ptr<InterpreterSelectWithUnionQuery> & interpreter,
NamesWithAliases && joined_block_aliases_);
void setJoinActions(ExpressionActionsPtr actions);
void addJoinActions(ExpressionActionsPtr actions);
bool insertJoinedBlock(Block & block);
void setTotals(Block totals);

View File

@ -1,5 +1,7 @@
#include <Interpreters/TableJoin.h>
#include <common/logger_useful.h>
#include <Parsers/ASTExpressionList.h>
#include <Core/Settings.h>
@ -15,6 +17,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int TYPE_MISMATCH;
}
TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_)
: size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode})
, default_max_bytes(settings.default_max_bytes_in_join)
@ -40,6 +47,10 @@ void TableJoin::resetCollected()
columns_added_by_join.clear();
original_names.clear();
renames.clear();
left_type_map.clear();
right_type_map.clear();
left_converting_actions = nullptr;
right_converting_actions = nullptr;
}
void TableJoin::addUsingKey(const ASTPtr & ast)
@ -105,14 +116,6 @@ void TableJoin::deduplicateAndQualifyColumnNames(const NameSet & left_table_colu
columns_from_joined_table.swap(dedup_columns);
}
NameSet TableJoin::getQualifiedColumnsSet() const
{
NameSet out;
for (const auto & names : original_names)
out.insert(names.first);
return out;
}
NamesWithAliases TableJoin::getNamesWithAliases(const NameSet & required_columns) const
{
NamesWithAliases out;
@ -221,49 +224,54 @@ bool TableJoin::rightBecomeNullable(const DataTypePtr & column_type) const
void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column)
{
if (rightBecomeNullable(joined_column.type))
columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, makeNullable(joined_column.type)));
else
columns_added_by_join.push_back(joined_column);
DataTypePtr type = joined_column.type;
if (hasUsing())
{
if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end())
type = it->second;
}
if (rightBecomeNullable(type))
type = makeNullable(type);
columns_added_by_join.emplace_back(joined_column.name, type);
}
void TableJoin::addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const
void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const
{
ColumnsWithTypeAndName columns;
for (auto & pair : names_and_types)
columns.emplace_back(nullptr, std::move(pair.type), std::move(pair.name));
names_and_types.clear();
addJoinedColumnsAndCorrectTypes(columns, correct_nullability);
for (auto & col : columns)
names_and_types.emplace_back(std::move(col.name), std::move(col.type));
}
void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability) const
{
for (auto & col : columns)
{
if (leftBecomeNullable(col.type))
if (hasUsing())
{
if (auto it = left_type_map.find(col.name); it != left_type_map.end())
col.type = it->second;
}
if (correct_nullability && leftBecomeNullable(col.type))
{
/// No need to nullify constants
if (!(col.column && isColumnConst(*col.column)))
{
bool is_column_const = col.column && isColumnConst(*col.column);
if (!is_column_const)
col.type = makeNullable(col.type);
}
}
}
/// Types in columns_added_by_join are already converted and made Nullable if needed
for (const auto & col : columns_added_by_join)
{
auto res_type = col.type;
if (rightBecomeNullable(res_type))
res_type = makeNullable(res_type);
columns.emplace_back(nullptr, res_type, col.name);
}
}
bool TableJoin::sameJoin(const TableJoin * x, const TableJoin * y)
{
if (!x && !y)
return true;
if (!x || !y)
return false;
return x->table_join.kind == y->table_join.kind
&& x->table_join.strictness == y->table_join.strictness
&& x->key_names_left == y->key_names_left
&& x->key_names_right == y->key_names_right
&& x->columns_added_by_join == y->columns_added_by_join;
columns.emplace_back(nullptr, col.type, col.name);
}
bool TableJoin::sameStrictnessAndKind(ASTTableJoin::Strictness strictness_, ASTTableJoin::Kind kind_) const
@ -336,4 +344,124 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc
return true;
}
bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns)
{
bool need_convert = needConvert();
if (!need_convert && !hasUsing())
{
/// For `USING` the common type was already inferred at the syntax analyzer stage
NamesAndTypesList left_list;
NamesAndTypesList right_list;
for (const auto & col : left_sample_columns)
left_list.emplace_back(col.name, col.type);
for (const auto & col : right_sample_columns)
right_list.emplace_back(col.name, col.type);
need_convert = inferJoinKeyCommonType(left_list, right_list);
}
if (need_convert)
{
left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left);
right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right);
}
return need_convert;
}
bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right)
{
std::unordered_map<String, DataTypePtr> left_types;
for (const auto & col : left)
{
left_types[col.name] = col.type;
}
std::unordered_map<String, DataTypePtr> right_types;
for (const auto & col : right)
{
if (auto it = renames.find(col.name); it != renames.end())
right_types[it->second] = col.type;
else
right_types[col.name] = col.type;
}
for (size_t i = 0; i < key_names_left.size(); ++i)
{
auto ltype = left_types.find(key_names_left[i]);
auto rtype = right_types.find(key_names_right[i]);
if (ltype == left_types.end() || rtype == right_types.end())
{
/// Name mismatch, give up
left_type_map.clear();
right_type_map.clear();
return false;
}
if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second))
continue;
DataTypePtr supertype;
try
{
supertype = DB::getLeastSupertype({ltype->second, rtype->second});
}
catch (DB::Exception & ex)
{
throw Exception(
"Type mismatch of columns to JOIN by: " +
key_names_left[i] + ": " + ltype->second->getName() + " at left, " +
key_names_right[i] + ": " + rtype->second->getName() + " at right. " +
"Can't get supertype: " + ex.message(),
ErrorCodes::TYPE_MISMATCH);
}
left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype;
}
if (!left_type_map.empty() || !right_type_map.empty())
{
auto format_type_map = [](NameToTypeMap mapping) -> std::string
{
std::vector<std::string> text;
for (const auto & [k, v] : mapping)
text.push_back(k + ": " + v->getName());
return fmt::format("{}", fmt::join(text, ", "));
};
LOG_TRACE(
&Poco::Logger::get("TableJoin"),
"Infer supertype for joined columns. Left: [{}], Right: [{}]",
format_type_map(left_type_map),
format_type_map(right_type_map));
}
return !left_type_map.empty();
}
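For reference, a hedged sketch of a query that is expected to hit the TYPE_MISMATCH branch above, since String and UInt8 keys have no common supertype (the inline subqueries are illustrative):

SELECT *
FROM (SELECT 'abc' AS k) AS l
JOIN (SELECT 1 AS k) AS r USING (k); -- expected to fail with "Type mismatch of columns to JOIN by: ..."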
ActionsDAGPtr TableJoin::applyKeyConvertToTable(
const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const
{
ColumnsWithTypeAndName cols_dst = cols_src;
for (auto & col : cols_dst)
{
if (auto it = type_mapping.find(col.name); it != type_mapping.end())
{
col.type = it->second;
col.column = nullptr;
}
}
NameToNameMap key_column_rename;
/// Returns converting actions that need to be applied to the table before the join
auto dag = ActionsDAG::makeConvertingActions(
cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !hasUsing(), &key_column_rename);
for (auto & name : names_to_rename)
{
const auto it = key_column_rename.find(name);
if (it != key_column_rename.end())
name = it->second;
}
return dag;
}
}

View File

@ -5,9 +5,11 @@
#include <Core/SettingsEnums.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Interpreters/IJoin.h>
#include <Interpreters/join_common.h>
#include <Interpreters/asof.h>
#include <DataStreams/IBlockStream_fwd.h>
#include <DataStreams/SizeLimits.h>
#include <DataTypes/getLeastSupertype.h>
#include <Storages/IStorage_fwd.h>
#include <utility>
@ -32,6 +34,11 @@ using VolumePtr = std::shared_ptr<IVolume>;
class TableJoin
{
public:
using NameToTypeMap = std::unordered_map<String, DataTypePtr>;
private:
/** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k`
* The join is made by column k.
* During the JOIN,
@ -59,23 +66,39 @@ class TableJoin
Names key_names_left;
Names key_names_right; /// Duplicating names are qualified.
ASTs key_asts_left;
ASTs key_asts_right;
ASTTableJoin table_join;
ASOF::Inequality asof_inequality = ASOF::Inequality::GreaterOrEquals;
/// All columns which can be read from joined table. Duplicating names are qualified.
NamesAndTypesList columns_from_joined_table;
/// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability
/// Columns will be added to block by JOIN.
/// It's a subset of columns_from_joined_table with corrected Nullability and type (if in-place type conversion is required)
NamesAndTypesList columns_added_by_join;
/// Target type to convert key columns before join
NameToTypeMap left_type_map;
NameToTypeMap right_type_map;
ActionsDAGPtr left_converting_actions;
ActionsDAGPtr right_converting_actions;
/// Name -> original name. Names are the same as in columns_from_joined_table list.
std::unordered_map<String, String> original_names;
/// Original name -> name. Only ranamed columns.
/// Original name -> name. Only renamed columns.
std::unordered_map<String, String> renames;
VolumePtr tmp_volume;
Names requiredJoinedNames() const;
/// Create converting actions and change key column names if required
ActionsDAGPtr applyKeyConvertToTable(
const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const;
public:
TableJoin() = default;
TableJoin(const Settings &, VolumePtr tmp_volume);
@ -125,7 +148,6 @@ public:
bool hasUsing() const { return table_join.using_expression_list != nullptr; }
bool hasOn() const { return table_join.on_expression != nullptr; }
NameSet getQualifiedColumnsSet() const;
NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const;
NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_required_columns) const;
@ -136,7 +158,23 @@ public:
bool leftBecomeNullable(const DataTypePtr & column_type) const;
bool rightBecomeNullable(const DataTypePtr & column_type) const;
void addJoinedColumn(const NameAndTypePair & joined_column);
void addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const;
void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const;
void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability = true) const;
/// Calculates common supertypes for corresponding join key columns.
bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right);
/// Calculate converting actions, rename key columns if required.
/// For a `USING` join we convert key columns in place, which affects the column types in the result table.
/// For `JOIN ON` we create new columns with converted keys to join by.
bool applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns);
bool needConvert() const { return !left_type_map.empty(); }
/// Key columns should be converted before join.
ActionsDAGPtr leftConvertingActions() const { return left_converting_actions; }
ActionsDAGPtr rightConvertingActions() const { return right_converting_actions; }
void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; }
ASOF::Inequality getAsofInequality() { return asof_inequality; }
@ -144,11 +182,16 @@ public:
ASTPtr leftKeysList() const;
ASTPtr rightKeysList() const; /// For ON syntax only
Names requiredJoinedNames() const;
const Names & keyNamesLeft() const { return key_names_left; }
const Names & keyNamesRight() const { return key_names_right; }
const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; }
const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; }
Names columnsAddedByJoin() const
{
Names res;
for (const auto & col : columns_added_by_join)
res.push_back(col.name);
return res;
}
/// StorageJoin overrides key names (cause of different names qualification)
void setRightKeys(const Names & keys) { key_names_right = keys; }
@ -156,8 +199,6 @@ public:
/// Split key and other columns by keys name list
void splitAdditionalColumns(const Block & sample_block, Block & block_keys, Block & block_others) const;
Block getRequiredRightKeys(const Block & right_table_keys, std::vector<String> & keys_sources) const;
static bool sameJoin(const TableJoin * x, const TableJoin * y);
};
}
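To illustrate the USING/ON distinction described in the comments above, a minimal hedged sketch (t1/t2 are hypothetical tables with Int32 and UInt32 keys respectively):

-- `JOIN ON` joins by freshly created converted key columns, so the original types are expected to be preserved in the result:
SELECT toTypeName(t1.k), toTypeName(t2.k) FROM t1 JOIN t2 ON t1.k = t2.k;
-- `USING` converts the key columns in place, so the result is expected to expose the common supertype instead:
SELECT toTypeName(k) FROM t1 JOIN t2 USING (k);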

View File

@ -418,6 +418,13 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele
const auto & keys = table_join.using_expression_list->as<ASTExpressionList &>();
for (const auto & key : keys.children)
analyzed_join.addUsingKey(key);
/// `USING` semantics allow columns with changed types in the result table,
/// while `JOIN ON` should preserve the types from the original tables.
/// For `USING` we can infer the common type at the syntax stage because the join is performed only by columns (not expressions).
/// We need to know the changed types in the result table because some analysis (e.g. analyzeAggregation) is performed before the join.
/// For `JOIN ON expr1 == expr2` the common type is inferred later in ExpressionAnalyzer, when the types of the expressions are known.
analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns);
}
else if (table_join.on_expression)
{
@ -571,7 +578,6 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select
source_column_names.insert(column.name);
NameSet required = columns_context.requiredColumns();
if (columns_context.has_table_join)
{
NameSet available_columns;

View File

@ -46,7 +46,7 @@ std::string getClusterName(const IAST & node)
}
String getClusterNameAndMakeLiteral(ASTPtr & node)
std::string getClusterNameAndMakeLiteral(ASTPtr & node)
{
String cluster_name = getClusterName(*node);
node = std::make_shared<ASTLiteral>(cluster_name);

View File

@ -14,8 +14,8 @@ namespace DB
* This name will be parsed as an expression with an operator minus - not at all what you need.
* Therefore, consider this case separately.
*/
String getClusterName(const IAST & node);
std::string getClusterName(const IAST & node);
String getClusterNameAndMakeLiteral(ASTPtr & node);
std::string getClusterNameAndMakeLiteral(ASTPtr & node);
}

View File

@ -1,9 +1,11 @@
#include <Interpreters/join_common.h>
#include <Interpreters/TableJoin.h>
#include <Interpreters/ActionsDAG.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataStreams/materializeBlock.h>
#include <IO/WriteHelpers.h>
@ -293,6 +295,13 @@ void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count)
type->insertDefaultInto(column);
}
bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type)
{
DataTypePtr left_type_strict = removeNullable(recursiveRemoveLowCardinality(left_type));
DataTypePtr right_type_strict = removeNullable(recursiveRemoveLowCardinality(right_type));
return left_type_strict->equals(*right_type_strict);
}
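For context, a minimal sketch of join keys that are equal up to nullability, where no supertype conversion is expected to be required:

SELECT *
FROM (SELECT toNullable(1) AS k) AS l
JOIN (SELECT 1 AS k) AS r USING (k);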
}
@ -307,19 +316,21 @@ NotJoined::NotJoined(const TableJoin & table_join, const Block & saved_block_sam
table_join.splitAdditionalColumns(right_sample_block, right_table_keys, sample_block_with_columns_to_add);
Block required_right_keys = table_join.getRequiredRightKeys(right_table_keys, tmp);
bool remap_keys = table_join.hasUsing();
std::unordered_map<size_t, size_t> left_to_right_key_remap;
for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i)
if (table_join.hasUsing())
{
const String & left_key_name = table_join.keyNamesLeft()[i];
const String & right_key_name = table_join.keyNamesRight()[i];
for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i)
{
const String & left_key_name = table_join.keyNamesLeft()[i];
const String & right_key_name = table_join.keyNamesRight()[i];
size_t left_key_pos = result_sample_block.getPositionByName(left_key_name);
size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name);
size_t left_key_pos = result_sample_block.getPositionByName(left_key_name);
size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name);
if (remap_keys && !required_right_keys.has(right_key_name))
left_to_right_key_remap[left_key_pos] = right_key_pos;
if (!required_right_keys.has(right_key_name))
left_to_right_key_remap[left_key_pos] = right_key_pos;
}
}
/// result_sample_block: left_sample_block + left expressions, right not key columns, required right keys

View File

@ -2,6 +2,8 @@
#include <Core/Block.h>
#include <Interpreters/IJoin.h>
#include <Interpreters/ActionsDAG.h>
#include <Interpreters/ExpressionActions.h>
namespace DB
{
@ -36,6 +38,8 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const TableJ
void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count);
bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type);
}
/// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table.

View File

@ -276,7 +276,7 @@ void StorageDistributedDirectoryMonitor::flushAllData()
if (quit)
return;
std::unique_lock lock{mutex};
std::lock_guard lock{mutex};
const auto & files = getFiles();
if (!files.empty())
@ -303,7 +303,7 @@ void StorageDistributedDirectoryMonitor::shutdownAndDropAllData()
void StorageDistributedDirectoryMonitor::run()
{
std::unique_lock lock{mutex};
std::lock_guard lock{mutex};
bool do_sleep = false;
while (!quit)
@ -320,12 +320,12 @@ void StorageDistributedDirectoryMonitor::run()
{
do_sleep = !processFiles(files);
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
last_exception = std::exception_ptr{};
}
catch (...)
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
do_sleep = true;
++error_count;
@ -344,7 +344,7 @@ void StorageDistributedDirectoryMonitor::run()
const auto now = std::chrono::system_clock::now();
if (now - last_decrease_time > decrease_error_count_period)
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
error_count /= 2;
last_decrease_time = now;
@ -453,10 +453,15 @@ std::map<UInt64, std::string> StorageDistributedDirectoryMonitor::getFiles()
}
}
metric_pending_files.changeTo(files.size());
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
if (files_count != files.size())
LOG_TRACE(log, "Files set to {} (was {})", files.size(), files_count);
if (bytes_count != new_bytes_count)
LOG_TRACE(log, "Bytes set to {} (was {})", new_bytes_count, bytes_count);
metric_pending_files.changeTo(files.size());
files_count = files.size();
bytes_count = new_bytes_count;
}
@ -509,9 +514,7 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
}
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
Poco::File{file_path}.remove();
metric_pending_files.sub();
markAsSend(file_path);
LOG_TRACE(log, "Finished processing `{}`", file_path);
}
@ -661,7 +664,7 @@ struct StorageDistributedDirectoryMonitor::Batch
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, parent.disk, parent.relative_path);
for (UInt64 file_index : file_indices)
Poco::File{file_index_to_path.at(file_index)}.remove();
parent.markAsSend(file_index_to_path.at(file_index));
}
else
{
@ -747,16 +750,24 @@ BlockInputStreamPtr StorageDistributedDirectoryMonitor::createStreamFromFile(con
return std::make_shared<DirectoryMonitorBlockInputStream>(file_name);
}
bool StorageDistributedDirectoryMonitor::scheduleAfter(size_t ms)
bool StorageDistributedDirectoryMonitor::addAndSchedule(size_t file_size, size_t ms)
{
if (quit)
return false;
{
std::lock_guard metrics_lock(metrics_mutex);
metric_pending_files.add();
bytes_count += file_size;
++files_count;
}
return task_handle->scheduleAfter(ms, false);
}
StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::getStatus() const
StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::getStatus()
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
return Status{
path,
@ -780,7 +791,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
batch.readText(in);
file_indices_to_skip.insert(batch.file_indices.begin(), batch.file_indices.end());
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
std::unordered_map<BatchHeader, Batch, BatchHeader::Hash> header_to_batch;
@ -850,7 +860,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
if (batch.isEnoughSize())
{
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
}
@ -858,7 +867,6 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
{
Batch & batch = kv.second;
batch.send();
metric_pending_files.sub(batch.file_indices.size());
}
{
@ -871,7 +879,7 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
}
}
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path) const
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path)
{
const auto last_path_separator_pos = file_path.rfind('/');
const auto & base_path = file_path.substr(0, last_path_separator_pos + 1);
@ -884,12 +892,37 @@ void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_p
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path + "/broken/");
Poco::File{file_path}.renameTo(broken_file_path);
Poco::File file(file_path);
{
std::lock_guard metrics_lock(metrics_mutex);
size_t file_size = file.getSize();
--files_count;
bytes_count -= file_size;
}
file.renameTo(broken_file_path);
LOG_ERROR(log, "Renamed `{}` to `{}`", file_path, broken_file_path);
}
void StorageDistributedDirectoryMonitor::markAsSend(const std::string & file_path)
{
Poco::File file(file_path);
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e) const
size_t file_size = file.getSize();
{
std::lock_guard metrics_lock(metrics_mutex);
metric_pending_files.sub();
--files_count;
bytes_count -= file_size;
}
file.remove();
}
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e)
{
/// mark file as broken if necessary
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
@ -912,7 +945,7 @@ void StorageDistributedDirectoryMonitor::updatePath(const std::string & new_rela
std::lock_guard lock{mutex};
{
std::unique_lock metrics_lock(metrics_mutex);
std::lock_guard metrics_lock(metrics_mutex);
relative_path = new_relative_path;
path = disk->getPath() + relative_path + '/';
}

View File

@ -48,7 +48,7 @@ public:
static BlockInputStreamPtr createStreamFromFile(const String & file_name);
/// For scheduling via DistributedBlockOutputStream
bool scheduleAfter(size_t ms);
bool addAndSchedule(size_t file_size, size_t ms);
/// system.distribution_queue interface
struct Status
@ -60,7 +60,7 @@ public:
size_t bytes_count;
bool is_blocked;
};
Status getStatus() const;
Status getStatus();
private:
void run();
@ -70,8 +70,9 @@ private:
void processFile(const std::string & file_path);
void processFilesWithBatching(const std::map<UInt64, std::string> & files);
void markAsBroken(const std::string & file_path) const;
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e) const;
void markAsBroken(const std::string & file_path);
void markAsSend(const std::string & file_path);
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e);
std::string getLoggerName() const;
@ -91,7 +92,7 @@ private:
struct BatchHeader;
struct Batch;
mutable std::mutex metrics_mutex;
std::mutex metrics_mutex;
size_t error_count = 0;
size_t files_count = 0;
size_t bytes_count = 0;

View File

@ -114,6 +114,7 @@ Block DistributedBlockOutputStream::getHeader() const
void DistributedBlockOutputStream::writePrefix()
{
storage.delayInsertOrThrowIfNeeded();
}
@ -717,6 +718,7 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::
auto dir_sync_guard = make_directory_sync_guard(*it);
}
auto file_size = Poco::File(first_file_tmp_path).getSize();
/// remove the temporary file, enabling the OS to reclaim inode after all threads
/// have removed their corresponding files
Poco::File(first_file_tmp_path).remove();
@ -726,7 +728,7 @@ void DistributedBlockOutputStream::writeToShard(const Block & block, const std::
for (const auto & dir_name : dir_names)
{
auto & directory_monitor = storage.requireDirectoryMonitor(disk, dir_name);
directory_monitor.scheduleAfter(sleep_ms.totalMilliseconds());
directory_monitor.addAndSchedule(file_size, sleep_ms.totalMilliseconds());
}
}

View File

@ -17,6 +17,10 @@ class ASTStorage;
#define LIST_OF_DISTRIBUTED_SETTINGS(M) \
M(Bool, fsync_after_insert, false, "Do fsync for every insert. Will decrease performance of inserts (only for async INSERT, i.e. insert_distributed_sync=false)", 0) \
M(Bool, fsync_directories, false, "Do fsync for temporary directory (that is used for async INSERT only) after all part operations (writes, renames, etc.).", 0) \
/** Inserts settings. */ \
M(UInt64, bytes_to_throw_insert, 0, "If more than this number of compressed bytes will be pending for async INSERT, an exception will be thrown. 0 - do not throw.", 0) \
M(UInt64, bytes_to_delay_insert, 0, "If more than this number of compressed bytes will be pending for async INSERT, the query will be delayed. 0 - do not delay.", 0) \
M(UInt64, max_delay_to_insert, 60, "Max delay of inserting data into Distributed table in seconds, if there are a lot of pending bytes for async send.", 0) \
DECLARE_SETTINGS_TRAITS(DistributedSettingsTraits, LIST_OF_DISTRIBUTED_SETTINGS)
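A hedged sketch of how these per-table settings could be applied (cluster and table names are illustrative; note that the validation added elsewhere in this patch requires bytes_to_throw_insert to stay above bytes_to_delay_insert):

CREATE TABLE dist_example AS data_example
    ENGINE = Distributed(my_cluster, currentDatabase(), data_example)
    SETTINGS bytes_to_delay_insert = 100000000,   -- start delaying async INSERTs above ~100 MB pending
             max_delay_to_insert = 60,            -- delay for at most 60 seconds
             bytes_to_throw_insert = 1000000000;  -- reject outright above ~1 GB pending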

View File

@ -60,6 +60,27 @@ static Names extractColumnNames(const ASTPtr & node)
}
}
/** Is used to order Graphite::Retentions by age and precision in descending order.
  * Throws an exception unless both age and precision are consistently less than or consistently greater than the other retention's.
*/
static bool compareRetentions(const Graphite::Retention & a, const Graphite::Retention & b)
{
if (a.age > b.age && a.precision > b.precision)
{
return true;
}
else if (a.age < b.age && a.precision < b.precision)
{
return false;
}
String error_msg = "age and precision should only grow up: "
+ std::to_string(a.age) + ":" + std::to_string(a.precision) + " vs "
+ std::to_string(b.age) + ":" + std::to_string(b.precision);
throw Exception(
error_msg,
ErrorCodes::BAD_ARGUMENTS);
}
/** Read the settings for Graphite rollup from config.
* Example
*
@ -157,8 +178,7 @@ appendGraphitePattern(const Poco::Util::AbstractConfiguration & config, const St
/// retention should be in descending order of age.
if (pattern.type & pattern.TypeRetention) /// TypeRetention or TypeAll
std::sort(pattern.retentions.begin(), pattern.retentions.end(),
[] (const Graphite::Retention & a, const Graphite::Retention & b) { return a.age > b.age; });
std::sort(pattern.retentions.begin(), pattern.retentions.end(), compareRetentions);
patterns.emplace_back(pattern);
}

View File

@ -14,10 +14,12 @@
#include <Columns/ColumnConst.h>
#include <Common/Macros.h>
#include <Common/ProfileEvents.h>
#include <Common/escapeForFileName.h>
#include <Common/typeid_cast.h>
#include <Common/quoteString.h>
#include <Common/randomSeed.h>
#include <Common/formatReadable.h>
#include <Parsers/ASTDropQuery.h>
#include <Parsers/ASTExpressionList.h>
@ -69,6 +71,13 @@ const UInt64 FORCE_OPTIMIZE_SKIP_UNUSED_SHARDS_ALWAYS = 2;
const UInt64 DISTRIBUTED_GROUP_BY_NO_MERGE_AFTER_AGGREGATION = 2;
}
namespace ProfileEvents
{
extern const Event DistributedRejectedInserts;
extern const Event DistributedDelayedInserts;
extern const Event DistributedDelayedInsertsMilliseconds;
}
namespace DB
{
@ -85,6 +94,8 @@ namespace ErrorCodes
extern const int UNABLE_TO_SKIP_UNUSED_SHARDS;
extern const int INVALID_SHARD_ID;
extern const int ALTER_OF_COLUMN_IS_FORBIDDEN;
extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES;
extern const int ARGUMENT_OUT_OF_BOUND;
}
namespace ActionLocks
@ -768,6 +779,14 @@ std::vector<StorageDistributedDirectoryMonitor::Status> StorageDistributed::getD
return statuses;
}
std::optional<UInt64> StorageDistributed::totalBytes(const Settings &) const
{
UInt64 total_bytes = 0;
for (const auto & status : getDirectoryMonitorsStatuses())
total_bytes += status.bytes_count;
return total_bytes;
}
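Since totalBytes() now sums the pending bytes across directory monitors, the value is expected to surface in system.tables; a minimal sketch (the table name is illustrative, see also the total_bytes/total_rows test later in this patch):

SELECT total_bytes, total_rows
FROM system.tables
WHERE database = currentDatabase() AND name = 'dist_example';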
size_t StorageDistributed::getShardCount() const
{
return getCluster()->getShardCount();
@ -967,6 +986,54 @@ void StorageDistributed::renameOnDisk(const String & new_path_to_table_data)
relative_data_path = new_path_to_table_data;
}
void StorageDistributed::delayInsertOrThrowIfNeeded() const
{
if (!distributed_settings.bytes_to_throw_insert &&
!distributed_settings.bytes_to_delay_insert)
return;
UInt64 total_bytes = *totalBytes(global_context.getSettingsRef());
if (distributed_settings.bytes_to_throw_insert && total_bytes > distributed_settings.bytes_to_throw_insert)
{
ProfileEvents::increment(ProfileEvents::DistributedRejectedInserts);
throw Exception(ErrorCodes::DISTRIBUTED_TOO_MANY_PENDING_BYTES,
"Too many bytes pending for async INSERT: {} (bytes_to_throw_insert={})",
formatReadableSizeWithBinarySuffix(total_bytes),
formatReadableSizeWithBinarySuffix(distributed_settings.bytes_to_throw_insert));
}
if (distributed_settings.bytes_to_delay_insert && total_bytes > distributed_settings.bytes_to_delay_insert)
{
/// Step is 5% of the delay, but not less than one second.
/// NOTE: max_delay_to_insert is in seconds, while the step is in milliseconds.
const size_t step_ms = std::max<double>(1000., double(distributed_settings.max_delay_to_insert) * 1'000 * 0.05);
UInt64 delayed_ms = 0;
do {
delayed_ms += step_ms;
std::this_thread::sleep_for(std::chrono::milliseconds(step_ms));
} while (*totalBytes(global_context.getSettingsRef()) > distributed_settings.bytes_to_delay_insert && delayed_ms < distributed_settings.max_delay_to_insert*1000);
ProfileEvents::increment(ProfileEvents::DistributedDelayedInserts);
ProfileEvents::increment(ProfileEvents::DistributedDelayedInsertsMilliseconds, delayed_ms);
UInt64 new_total_bytes = *totalBytes(global_context.getSettingsRef());
LOG_INFO(log, "Too many bytes pending for async INSERT: was {}, now {}, INSERT was delayed to {} ms",
formatReadableSizeWithBinarySuffix(total_bytes),
formatReadableSizeWithBinarySuffix(new_total_bytes),
delayed_ms);
if (new_total_bytes > distributed_settings.bytes_to_delay_insert)
{
ProfileEvents::increment(ProfileEvents::DistributedRejectedInserts);
throw Exception(ErrorCodes::DISTRIBUTED_TOO_MANY_PENDING_BYTES,
"Too many bytes pending for async INSERT: {} (bytes_to_delay_insert={})",
formatReadableSizeWithBinarySuffix(new_total_bytes),
formatReadableSizeWithBinarySuffix(distributed_settings.bytes_to_delay_insert));
}
}
}
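The new counters can be observed through system.events; a minimal sketch listing just the events introduced above:

SELECT event, value
FROM system.events
WHERE event IN ('DistributedRejectedInserts', 'DistributedDelayedInserts', 'DistributedDelayedInsertsMilliseconds');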
void registerStorageDistributed(StorageFactory & factory)
{
@ -1033,6 +1100,17 @@ void registerStorageDistributed(StorageFactory & factory)
distributed_settings.loadFromQuery(*args.storage_def);
}
if (distributed_settings.max_delay_to_insert < 1)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"max_delay_to_insert cannot be less then 1");
if (distributed_settings.bytes_to_throw_insert && distributed_settings.bytes_to_delay_insert &&
distributed_settings.bytes_to_throw_insert <= distributed_settings.bytes_to_delay_insert)
{
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"bytes_to_throw_insert cannot be less or equal to bytes_to_delay_insert (since it is handled first)");
}
return StorageDistributed::create(
args.table_id, args.columns, args.constraints,
remote_database, remote_table, cluster_name,

View File

@ -41,6 +41,7 @@ class StorageDistributed final : public ext::shared_ptr_helper<StorageDistribute
friend struct ext::shared_ptr_helper<StorageDistributed>;
friend class DistributedBlockOutputStream;
friend class StorageDistributedDirectoryMonitor;
friend class StorageSystemDistributionQueue;
public:
~StorageDistributed() override;
@ -76,6 +77,7 @@ public:
unsigned /*num_streams*/) override;
bool supportsParallelInsert() const override { return true; }
std::optional<UInt64> totalBytes(const Settings &) const override;
BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override;
@ -83,7 +85,6 @@ public:
void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override;
void rename(const String & new_path_to_table_data, const StorageID & new_table_id) override;
void renameOnDisk(const String & new_path_to_table_data);
void checkAlterIsPossible(const AlterCommands & commands, const Context & context) const override;
@ -98,65 +99,24 @@ public:
bool storesDataOnDisk() const override { return true; }
Strings getDataPaths() const override;
const ExpressionActionsPtr & getShardingKeyExpr() const { return sharding_key_expr; }
const String & getShardingKeyColumnName() const { return sharding_key_column_name; }
size_t getShardCount() const;
const String & getRelativeDataPath() const { return relative_data_path; }
std::string getRemoteDatabaseName() const { return remote_database; }
std::string getRemoteTableName() const { return remote_table; }
std::string getClusterName() const { return cluster_name; } /// Returns empty string if tables is used by TableFunctionRemote
/// create directory monitors for each existing subdirectory
void createDirectoryMonitors(const DiskPtr & disk);
/// ensure directory monitor thread and connectoin pool creation by disk and subdirectory name
StorageDistributedDirectoryMonitor & requireDirectoryMonitor(const DiskPtr & disk, const std::string & name);
/// Return list of metrics for all created monitors
/// (note that monitors are created lazily, i.e. until at least one INSERT executed)
std::vector<StorageDistributedDirectoryMonitor::Status> getDirectoryMonitorsStatuses() const;
void flushClusterNodesAllData(const Context & context);
ClusterPtr getCluster() const;
static IColumn::Selector createSelector(const ClusterPtr cluster, const ColumnWithTypeAndName & result);
/// Apply the following settings:
/// - optimize_skip_unused_shards
/// - force_optimize_skip_unused_shards
ClusterPtr getOptimizedCluster(const Context &, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const;
ClusterPtr skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, const Context & context) const;
ActionLock getActionLock(StorageActionBlockType type) override;
NamesAndTypesList getVirtuals() const override;
size_t getRandomShardIndex(const Cluster::ShardsInfo & shards);
/// Used by InterpreterInsertQuery
std::string getRemoteDatabaseName() const { return remote_database; }
std::string getRemoteTableName() const { return remote_table; }
/// Returns an empty string if the table is used by TableFunctionRemote
std::string getClusterName() const { return cluster_name; }
ClusterPtr getCluster() const;
const DistributedSettings & getDistributedSettingsRef() const { return distributed_settings; }
/// Used by InterpreterSystemQuery
void flushClusterNodesAllData(const Context & context);
String remote_database;
String remote_table;
ASTPtr remote_table_function_ptr;
/// Used by ClusterCopier
size_t getShardCount() const;
const Context & global_context;
Poco::Logger * log;
/// Used to implement TableFunctionRemote.
std::shared_ptr<Cluster> owned_cluster;
/// Is empty if this storage implements TableFunctionRemote.
const String cluster_name;
bool has_sharding_key;
bool sharding_key_is_deterministic = false;
ExpressionActionsPtr sharding_key_expr;
String sharding_key_column_name;
/// Used for global monotonic ordering of files to send.
SimpleIncrement file_names_increment;
ActionBlocker monitors_blocker;
protected:
private:
StorageDistributed(
const StorageID & id_,
const ColumnsDescription & columns_,
@ -186,6 +146,60 @@ protected:
bool attach,
ClusterPtr owned_cluster_ = {});
void renameOnDisk(const String & new_path_to_table_data);
const ExpressionActionsPtr & getShardingKeyExpr() const { return sharding_key_expr; }
const String & getShardingKeyColumnName() const { return sharding_key_column_name; }
const String & getRelativeDataPath() const { return relative_data_path; }
/// create directory monitors for each existing subdirectory
void createDirectoryMonitors(const DiskPtr & disk);
/// ensure directory monitor thread and connection pool creation by disk and subdirectory name
StorageDistributedDirectoryMonitor & requireDirectoryMonitor(const DiskPtr & disk, const std::string & name);
/// Return list of metrics for all created monitors
/// (note that monitors are created lazily, i.e. not until at least one INSERT has been executed)
///
/// Used by StorageSystemDistributionQueue
std::vector<StorageDistributedDirectoryMonitor::Status> getDirectoryMonitorsStatuses() const;
static IColumn::Selector createSelector(const ClusterPtr cluster, const ColumnWithTypeAndName & result);
/// Apply the following settings:
/// - optimize_skip_unused_shards
/// - force_optimize_skip_unused_shards
ClusterPtr getOptimizedCluster(const Context &, const StorageMetadataPtr & metadata_snapshot, const ASTPtr & query_ptr) const;
ClusterPtr skipUnusedShards(ClusterPtr cluster, const ASTPtr & query_ptr, const StorageMetadataPtr & metadata_snapshot, const Context & context) const;
size_t getRandomShardIndex(const Cluster::ShardsInfo & shards);
const DistributedSettings & getDistributedSettingsRef() const { return distributed_settings; }
void delayInsertOrThrowIfNeeded() const;
private:
String remote_database;
String remote_table;
ASTPtr remote_table_function_ptr;
const Context & global_context;
Poco::Logger * log;
/// Used to implement TableFunctionRemote.
std::shared_ptr<Cluster> owned_cluster;
/// Is empty if this storage implements TableFunctionRemote.
const String cluster_name;
bool has_sharding_key;
bool sharding_key_is_deterministic = false;
ExpressionActionsPtr sharding_key_expr;
String sharding_key_column_name;
/// Used for global monotonic ordering of files to send.
SimpleIncrement file_names_increment;
ActionBlocker monitors_blocker;
String relative_data_path;
/// Can be empty if relative_data_path is empty. In this case, a directory for the data to be sent is not created.

View File

@ -13,9 +13,11 @@ const char * auto_contributors[] {
"Aleksandra (Ася)",
"Aleksandrov Vladimir",
"Aleksei Levushkin",
"Aleksei Semiglazov",
"Aleksey",
"Aleksey Akulovich",
"Alex Bocharov",
"Alex Karo",
"Alex Krash",
"Alex Ryndin",
"Alex Zatelepin",
@ -81,8 +83,10 @@ const char * auto_contributors[] {
"Andrey Skobtsov",
"Andrey Urusov",
"Andy Yang",
"Anmol Arora",
"Anna",
"Anna Shakhova",
"Anthony N. Simon",
"Anton Ivashkin",
"Anton Kobzev",
"Anton Kvasha",
@ -143,6 +147,7 @@ const char * auto_contributors[] {
"Colum",
"Constantin S. Pan",
"CurtizJ",
"DIAOZHAFENG",
"Daniel Bershatsky",
"Daniel Dao",
"Daniel Qin",
@ -203,6 +208,7 @@ const char * auto_contributors[] {
"Fan()",
"FawnD2",
"FeehanG",
"FgoDt",
"Filipe Caixeta",
"Flowyi",
"Francisco Barón",
@ -224,7 +230,10 @@ const char * auto_contributors[] {
"Grigory Buteyko",
"Grigory Pervakov",
"Guillaume Tassery",
"Haavard Kvaalen",
"Habibullah Oladepo",
"Hamoon",
"Hasitha Kanchana",
"Hasnat",
"Hiroaki Nakamura",
"HuFuwang",
@ -261,6 +270,7 @@ const char * auto_contributors[] {
"Ivan Remen",
"Ivan Starkov",
"Ivan Zhukov",
"JackyWoo",
"Jacob Hayes",
"JaosnHsieh",
"Jason",
@ -271,6 +281,7 @@ const char * auto_contributors[] {
"Jiang Tao",
"Jochen Schalanda",
"John",
"John Hummel",
"Jonatas Freitas",
"Kang Liu",
"Karl Pietrzak",
@ -304,6 +315,7 @@ const char * auto_contributors[] {
"Luis Bosque",
"Lv Feng",
"Léo Ercolanelli",
"M0r64n",
"Maks Skorokhod",
"Maksim",
"Maksim Fedotov",
@ -317,9 +329,11 @@ const char * auto_contributors[] {
"Mark Frost",
"Mark Papadakis",
"Maroun Maroun",
"Marquitos",
"Marsel Arduanov",
"Marti Raudsepp",
"Martijn Bakker",
"Marvin Taschenberger",
"Masha",
"Matthew Peveler",
"Matwey V. Kornilov",
@ -338,6 +352,7 @@ const char * auto_contributors[] {
"Maxim Smirnov",
"Maxim Ulanovskiy",
"MaximAL",
"Mc.Spring",
"MeiK",
"Metehan Çetinkaya",
"Metikov Vadim",
@ -386,6 +401,7 @@ const char * auto_contributors[] {
"Nikita Vasilev",
"Nikolai Kochetov",
"Nikolai Sorokin",
"Nikolay",
"Nikolay Degterinsky",
"Nikolay Kirsh",
"Nikolay Semyachkin",
@ -439,6 +455,7 @@ const char * auto_contributors[] {
"Roman Peshkurov",
"Roman Tsisyk",
"Ruslan",
"Russ Frank",
"Ruzal Ibragimov",
"S.M.A. Djawadi",
"Sabyanin Maxim",
@ -461,12 +478,14 @@ const char * auto_contributors[] {
"Sergey Shtykov",
"Sergey V. Galtsev",
"Sergey Zaikin",
"Sergi Almacellas Abellana",
"Sergi Vladykin",
"SevaCode",
"Sherry Wang",
"Silviu Caragea",
"Simon Liu",
"Simon Podlipsky",
"Sina",
"Sjoerd Mulder",
"Slach",
"Snow",
@ -502,6 +521,7 @@ const char * auto_contributors[] {
"Vadim Plakhtinskiy",
"Vadim Skipin",
"VadimPE",
"Val",
"Valera Ryaboshapko",
"Vasily Kozhukhovskiy",
"Vasily Morozov",
@ -525,6 +545,7 @@ const char * auto_contributors[] {
"Vlad Arkhipov",
"Vladimir",
"Vladimir Bunchuk",
"Vladimir Ch",
"Vladimir Chebotarev",
"Vladimir Golovchenko",
"Vladimir Goncharov",
@ -662,11 +683,13 @@ const char * auto_contributors[] {
"frank",
"franklee",
"fredchenbj",
"fuwhu",
"g-arslan",
"ggerogery",
"giordyb",
"glockbender",
"glushkovds",
"guoleiyi",
"gyuton",
"hao.he",
"hchen9",
@ -675,6 +698,7 @@ const char * auto_contributors[] {
"hexiaoting",
"hotid",
"hustnn",
"idfer",
"igor",
"igor.lapko",
"ikopylov",
@ -686,9 +710,11 @@ const char * auto_contributors[] {
"javartisan",
"javi",
"javi santana",
"jennyma",
"jetgm",
"jianmei zhang",
"jyz0309",
"keenwolf",
"kmeaw",
"koshachy",
"kreuzerkrieg",
@ -697,6 +723,7 @@ const char * auto_contributors[] {
"kssenii",
"l",
"lalex",
"lehasm",
"leozhang",
"levushkin aleksej",
"levysh",
@ -781,6 +808,7 @@ const char * auto_contributors[] {
"shangshujie",
"shedx",
"simon-says",
"spff",
"spongedc",
"spyros87",
"stavrolia",
@ -799,6 +827,7 @@ const char * auto_contributors[] {
"tiger.yan",
"tison",
"topvisor",
"turbo jason",
"tyrionhuang",
"ubuntu",
"unegare",
@ -820,6 +849,7 @@ const char * auto_contributors[] {
"yangshuai",
"ygrek",
"yhgcn",
"yiguolei",
"ylchou",
"yonesko",
"yuefoo",
@ -829,8 +859,10 @@ const char * auto_contributors[] {
"zhang2014",
"zhangshengyu",
"zhangxiao018",
"zhangxiao871",
"zhen ni",
"zhukai",
"zlx19950903",
"zvrr",
"zvvr",
"zzsmdfj",

View File

@ -60,6 +60,7 @@ if [ "$DATA_DIR_PATTERN" != "$DATA_DIR" ]; then
cat ${CONFIG_SERVER_DIR}/ints_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/ints_dictionary.xml
cat ${CONFIG_SERVER_DIR}/strings_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/strings_dictionary.xml
cat ${CONFIG_SERVER_DIR}/decimals_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/decimals_dictionary.xml
cat ${CONFIG_SERVER_DIR}/executable_pool_dictionary.xml | sed -e s!9000!$CLICKHOUSE_PORT_TCP! > $DATA_DIR/etc/executable_pool_dictionary.xml
fi
CLICKHOUSE_EXTRACT_CONFIG=${CLICKHOUSE_EXTRACT_CONFIG:="${CLICKHOUSE_EXTRACT} --config=$CLICKHOUSE_CONFIG"}

View File

@ -2,7 +2,6 @@
<test_keeper_server>
<tcp_port>9181</tcp_port>
<server_id>1</server_id>
<log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
<coordination_settings>
<operation_timeout_ms>10000</operation_timeout_ms>

View File

@ -0,0 +1,154 @@
<dictionaries>
<dictionary>
<name>executable_pool_simple</name>
<structure>
<id>
<name>x</name>
</id>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data\t$read_data a\t$read_data b\n"; done</command>
<size>5</size>
</executable_pool>
</source>
<layout>
<direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_complex</name>
<structure>
<key>
<attribute>
<name>x</name>
<type>String</type>
</attribute>
<attribute>
<name>y</name>
<type>String</type>
</attribute>
</key>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data\tvalue a\tvalue b\n"; done</command>
</executable>
</source>
<layout>
<complex_key_direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_simple_implicit_key</name>
<structure>
<id>
<name>x</name>
</id>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "$read_data a\t$read_data b\n"; done</command>
<size>5</size>
<implicit_key>true</implicit_key>
</executable_pool>
</source>
<layout>
<direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
<dictionary>
<name>executable_pool_complex_implicit_key</name>
<structure>
<key>
<attribute>
<name>x</name>
<type>String</type>
</attribute>
<attribute>
<name>y</name>
<type>String</type>
</attribute>
</key>
<attribute>
<name>a</name>
<type>String</type>
<null_value></null_value>
</attribute>
<attribute>
<name>b</name>
<type>String</type>
<null_value></null_value>
</attribute>
</structure>
<source>
<executable_pool>
<format>TabSeparated</format>
<command>while read read_data; do printf "data a\tdata b\n"; done</command>
<size>5</size>
<implicit_key>true</implicit_key>
</executable_pool>
</source>
<layout>
<complex_key_direct />
</layout>
<lifetime>300</lifetime>
</dictionary>
</dictionaries>
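A hedged usage sketch for the executable_pool_simple dictionary defined above (given its echo-style command, attribute `a` for key 1 is expected to be '1 a'):

SELECT dictGet('executable_pool_simple', 'a', toUInt64(1)) AS a,
       dictGet('executable_pool_simple', 'b', toUInt64(1)) AS b;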

View File

@ -39,11 +39,13 @@ ln -sf $SRC_PATH/users.d/readonly.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/access_management.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/database_atomic_drop_detach_sync.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/opentelemetry.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/users.d/remote_queries.xml $DEST_SERVER_PATH/users.d/
ln -sf $SRC_PATH/ints_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/strings_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/decimals_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/executable_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/executable_pool_dictionary.xml $DEST_SERVER_PATH/
ln -sf $SRC_PATH/top_level_domains $DEST_SERVER_PATH/

View File

@ -0,0 +1,13 @@
<yandex>
<profiles>
<default>
<!-- Avoid "Connection failed at try №1" messages. -->
<connect_timeout_with_failover_ms>2000</connect_timeout_with_failover_ms>
<connect_timeout_with_failover_secure_ms>3000</connect_timeout_with_failover_secure_ms>
<!-- Avoid this logic in tests to avoid EOF (10 hours is enough for tests)-->
<idle_connection_timeout>36000</idle_connection_timeout>
<!-- NOTE: instead of tuning idle_connection_timeout,
SYSTEM RELOAD CONFIG can be executed before each test -->
</default>
</profiles>
</yandex>

View File

@ -94,4 +94,25 @@
</retention>
</default>
</graphite_rollup_broken>
<graphite_rollup_wrong_age_precision>
<path_column_name>metric</path_column_name>
<time_column_name>timestamp</time_column_name>
<value_column_name>value</value_column_name>
<version_column_name>updated</version_column_name>
<default>
<function>avg</function>
<retention>
<age>0</age>
<precision>60</precision>
</retention>
<retention>
<age>36000</age>
<precision>600</precision>
</retention>
<retention>
<age>72000</age>
<precision>300</precision>
</retention>
</default>
</graphite_rollup_wrong_age_precision>
</yandex>

View File

@ -3,6 +3,7 @@ import os.path as p
import time
import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
@ -442,3 +443,20 @@ SELECT * FROM test.graphite;
''')
assert TSV(result) == TSV(expected)
def test_wrong_rollup_config(graphite_table):
with pytest.raises(QueryRuntimeException) as exc:
q('''
CREATE TABLE test.graphite_not_created
(metric String, value Float64, timestamp UInt32, date Date, updated UInt32)
ENGINE = GraphiteMergeTree('graphite_rollup_wrong_age_precision')
PARTITION BY toYYYYMM(date)
ORDER BY (metric, timestamp)
SETTINGS index_granularity=1;
''')
# The order of retentions is not guaranteed
assert ("age and precision should only grow up: " in str(exc.value))
assert ("36000:600" in str(exc.value))
assert ("72000:300" in str(exc.value))

View File

@ -471,12 +471,14 @@ def select_without_columns(clickhouse_node, mysql_node, service_name):
mysql_node.query("CREATE DATABASE db")
mysql_node.query("CREATE TABLE db.t (a INT PRIMARY KEY, b INT)")
clickhouse_node.query(
"CREATE DATABASE db ENGINE = MaterializeMySQL('{}:3306', 'db', 'root', 'clickhouse')".format(service_name))
"CREATE DATABASE db ENGINE = MaterializeMySQL('{}:3306', 'db', 'root', 'clickhouse') SETTINGS max_flush_data_time = 100000".format(service_name))
check_query(clickhouse_node, "SHOW TABLES FROM db FORMAT TSV", "t\n")
clickhouse_node.query("SYSTEM STOP MERGES db.t")
clickhouse_node.query("CREATE VIEW v AS SELECT * FROM db.t")
mysql_node.query("INSERT INTO db.t VALUES (1, 1), (2, 2)")
mysql_node.query("DELETE FROM db.t WHERE a=2;")
mysql_node.query("DELETE FROM db.t WHERE a = 2;")
# We need to execute a DDL statement to flush the data buffer
mysql_node.query("CREATE TABLE db.temporary(a INT PRIMARY KEY, b INT)")
optimize_on_insert = clickhouse_node.query("SELECT value FROM system.settings WHERE name='optimize_on_insert'").strip()
if optimize_on_insert == "0":

View File

@ -2583,6 +2583,8 @@ def test_kafka_issue14202(kafka_cluster):
kafka_format = 'JSONEachRow';
''')
time.sleep(3)
instance.query(
'INSERT INTO test.kafka_q SELECT t, some_string FROM ( SELECT dt AS t, some_string FROM test.empty_table )')
# check instance is alive

View File

@ -1,17 +1,22 @@
<test>
<create_query>DROP TABLE IF EXISTS test_prl;</create_query>
<create_query>CREATE TABLE test_prl (n UInt64) ENGINE MergeTree ORDER BY n;</create_query>
<create_query>GRANT CREATE ROW POLICY ON *.* TO CURRENT_USER</create_query>
<create_query>CREATE ROW POLICY OR REPLACE test_prl_policy ON test_prl AS PERMISSIVE FOR SELECT USING n % 7 TO ALL;</create_query>
<settings>
<max_threads>1</max_threads>
<max_insert_threads>10</max_insert_threads>
<optimize_move_to_prewhere>0</optimize_move_to_prewhere>
</settings>
<fill_query>INSERT INTO test_prl SELECT number FROM numbers(50000000);</fill_query>
<create_query>DROP TABLE IF EXISTS test_prl</create_query>
<create_query>CREATE TABLE test_prl (n UInt64) ENGINE MergeTree ORDER BY n</create_query>
<create_query>CREATE ROW POLICY OR REPLACE test_prl_policy ON test_prl AS PERMISSIVE FOR SELECT USING n % 7 TO ALL</create_query>
<query>SELECT * FROM test_prl;</query>
<query>SELECT * FROM test_prl WHERE n % 3 AND n % 5 SETTINGS optimize_move_to_prewhere = 0;</query>
<query>SELECT * FROM test_prl PREWHERE n % 3 AND n % 5;</query>
<query>SELECT * FROM test_prl PREWHERE n % 3 WHERE n % 5;</query>
<query>SELECT * FROM test_prl PREWHERE n % 5 WHERE n % 3;</query>
<fill_query>INSERT INTO test_prl SELECT number FROM numbers(100000000)</fill_query>
<drop_query>DROP ROW POLICY IF EXISTS test_prl_policy ON test_prl;</drop_query>
<drop_query>DROP TABLE IF EXISTS test_prl;</drop_query>
<query>SELECT sum(n) FROM test_prl</query>
<query>SELECT sum(n) FROM test_prl WHERE n % 3 AND n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 3 AND n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 3 WHERE n % 5</query>
<query>SELECT sum(n) FROM test_prl PREWHERE n % 5 WHERE n % 3</query>
<drop_query>DROP ROW POLICY IF EXISTS test_prl_policy ON test_prl</drop_query>
<drop_query>DROP TABLE IF EXISTS test_prl</drop_query>
</test>

View File

@ -0,0 +1,4 @@
Check total_bytes/total_rows for Distributed
0 \N
1 \N
0 \N

View File

@ -0,0 +1,11 @@
SELECT 'Check total_bytes/total_rows for Distributed';
CREATE TABLE check_system_tables_null (key Int) Engine=Null();
CREATE TABLE check_system_tables AS check_system_tables_null Engine=Distributed(test_shard_localhost, currentDatabase(), check_system_tables_null);
SYSTEM STOP DISTRIBUTED SENDS check_system_tables;
SELECT total_bytes, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
INSERT INTO check_system_tables SELECT * FROM numbers(1) SETTINGS prefer_localhost_replica=0;
SELECT total_bytes>0, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
SYSTEM FLUSH DISTRIBUTED check_system_tables;
SELECT total_bytes, total_rows FROM system.tables WHERE database = currentDatabase() AND name = 'check_system_tables';
DROP TABLE check_system_tables_null;
DROP TABLE check_system_tables;

View File

@ -1,7 +1,3 @@
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'none';
SET connect_timeout_with_failover_ms = 5000;
DROP TABLE IF EXISTS source_table1;
DROP TABLE IF EXISTS source_table2;
DROP TABLE IF EXISTS distributed_table1;

View File

@ -1,9 +1,5 @@
-- TODO: correct testing with real unique shards
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'fatal';
SET connect_timeout_with_failover_ms = 5000;
set optimize_distributed_group_by_sharding_key=1;
drop table if exists dist_01247;

View File

@ -1,9 +1,5 @@
-- TODO: correct testing with real unique shards
-- Avoid "Connection failed at try №1" messages.
SET send_logs_level = 'fatal';
SET connect_timeout_with_failover_ms = 5000;
set optimize_distributed_group_by_sharding_key=1;
drop table if exists dist_01247;

View File

@ -1,5 +1,5 @@
send_timeout Seconds 300
connect_timeout Seconds 10
connect_timeout_with_failover_ms Milliseconds 50
connect_timeout_with_failover_secure_ms Milliseconds 100
connect_timeout_with_failover_ms Milliseconds 2000
connect_timeout_with_failover_secure_ms Milliseconds 3000
max_memory_usage UInt64 10000000000

View File

@ -10,20 +10,15 @@ select * from system.distribution_queue;
select 'INSERT';
system stop distributed sends dist_01293;
insert into dist_01293 select * from numbers(10);
-- metrics are updated only after distributed_directory_monitor_sleep_time_ms
set distributed_directory_monitor_sleep_time_ms=10;
-- 1 second should guarantee a metrics update
-- XXX: but this is kind of a quirk; it would be much better to account for these metrics without any delays.
select sleep(1) format Null;
select is_blocked, error_count, data_files, data_compressed_bytes>100 from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes>100 from system.distribution_queue where database = currentDatabase();
system flush distributed dist_01293;
select 'FLUSH';
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue where database = currentDatabase();
select 'UNBLOCK';
system start distributed sends dist_01293;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue;
select is_blocked, error_count, data_files, data_compressed_bytes from system.distribution_queue where database = currentDatabase();
drop table null_01293;
drop table dist_01293;

View File

@ -0,0 +1,14 @@
drop table if exists dist_01670;
drop table if exists data_01670;
create table data_01670 (key Int) engine=Null();
create table dist_01670 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01670) settings bytes_to_throw_insert=1;
system stop distributed sends dist_01670;
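-- with sends stopped, inserted blocks stay pending on disk, so bytes_to_throw_insert can be triggered deterministically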
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01670 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will fail because of bytes_to_throw_insert=1
-- (the previous block is definitely larger, since it includes a header)
insert into dist_01670 select * from numbers(1) settings prefer_localhost_replica=0; -- { serverError 574 }
system flush distributed dist_01670;
drop table dist_01670;
drop table data_01670;

View File

@ -0,0 +1,3 @@
max_delay_to_insert will throw
max_delay_to_insert will succeed
flushed

View File

@ -0,0 +1,86 @@
#!/usr/bin/env bash
# NOTE: $SECONDS has one-second accuracy, so we need some delta, hence the -1 in the time conditions.
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
max_delay_to_insert=5
${CLICKHOUSE_CLIENT} -nq "
drop table if exists dist_01675;
drop table if exists data_01675;
"
${CLICKHOUSE_CLIENT} -nq "
create table data_01675 (key Int) engine=Null();
create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert;
system stop distributed sends dist_01675;
"
#
# Case 1: max_delay_to_insert will throw.
#
echo "max_delay_to_insert will throw"
start_seconds=$SECONDS
${CLICKHOUSE_CLIENT} --testmode -nq "
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will fail, because of bytes_to_delay_insert=1 and max_delay_to_insert=5,
-- while distributed sends are stopped.
--
-- (the previous block is definitely larger, since it includes a header)
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0; -- { serverError 574 }
system flush distributed dist_01675;
"
end_seconds=$SECONDS
if (( (end_seconds-start_seconds)<(max_delay_to_insert-1) )); then
echo "max_delay_to_insert was not satisfied ($end_seconds-$start_seconds)"
fi
#
# Case 2: max_delay_to_insert will eventually succeed.
#
echo "max_delay_to_insert will succeed"
max_delay_to_insert=10
${CLICKHOUSE_CLIENT} -nq "
drop table dist_01675;
create table dist_01675 (key Int) engine=Distributed(test_shard_localhost, currentDatabase(), data_01675) settings bytes_to_delay_insert=1, max_delay_to_insert=$max_delay_to_insert;
system stop distributed sends dist_01675;
"
flush_delay=4
function flush_distributed_worker()
{
sleep $flush_delay
${CLICKHOUSE_CLIENT} -q "system flush distributed dist_01675"
echo flushed
}
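# Flush in the background after $flush_delay seconds so that the delayed INSERT below
# can complete well before max_delay_to_insert expires.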
flush_distributed_worker &
start_seconds=$SECONDS
${CLICKHOUSE_CLIENT} --testmode -nq "
-- the first batch is always OK, since there are no pending bytes yet
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
-- the second will succeed, thanks to the SYSTEM FLUSH DISTRIBUTED running in the background.
insert into dist_01675 select * from numbers(1) settings prefer_localhost_replica=0;
"
end_seconds=$SECONDS
wait
if (( (end_seconds-start_seconds)<(flush_delay-1) )); then
echo "max_delay_to_insert was not wait flush_delay ($end_seconds-$start_seconds)"
fi
if (( (end_seconds-start_seconds)>=(max_delay_to_insert-1) )); then
echo "max_delay_to_insert was overcommited ($end_seconds-$start_seconds)"
fi
${CLICKHOUSE_CLIENT} -nq "
drop table dist_01675;
drop table data_01675;
"

View File

@ -1,3 +1,4 @@
3
3
1
1

Some files were not shown because too many files have changed in this diff.