Merge branch 'master' into jart-memcpy

This commit is contained in:
Alexey Milovidov 2021-03-03 20:11:38 +03:00
commit 680b0d1531
224 changed files with 6766 additions and 1367 deletions

contrib/boost vendored

@ -1 +1 @@
Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79
Subproject commit ee24fa55bc46e4d2ce7d0d052cc5a0d9b1be8c36

View File

@ -358,6 +358,8 @@ mkdir analyze analyze/tmp ||:
build_log_column_definitions
# Split the raw test output into files suitable for analysis.
# To debug calculations only for a particular test, substitute a suitable
# wildcard here, e.g. `for test_file in modulo-raw.tsv`.
for test_file in *-raw.tsv
do
test_name=$(basename "$test_file" "-raw.tsv")
@ -467,7 +469,13 @@ create view broken_queries as
create table query_run_metrics_for_stats engine File(
TSV, -- do not add header -- will parse with grep
'analyze/query-run-metrics-for-stats.tsv')
as select test, query_index, 0 run, version, metric_values
as select test, query_index, 0 run, version,
-- For debugging, add a filter for a particular metric like this:
-- arrayFilter(m, n -> n = 'client_time', metric_values, metric_names)
-- metric_values
-- Note that further reporting may break, because the metric names are
-- not filtered.
metric_values
from query_run_metric_arrays
where (test, query_index) not in broken_queries
order by test, query_index, run, version
@ -585,8 +593,19 @@ create view query_metric_stats as
-- Main statistics for queries -- query time as reported in query log.
create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
as select
abs(diff) > report_threshold and abs(diff) > stat_threshold as changed_fail,
abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
-- It is important to have a non-strict inequality with stat_threshold
-- here. The randomization distribution is actually discrete, and when
-- the number of runs is small, the quantile we need (e.g. 0.99) turns
-- out to be the maximum value of the distribution. We can also hit this
-- maximum possible value with our test run, and this obviously means
-- that we have observed the difference to the best precision possible
-- for the given number of runs. If we use a strict inequality here, we
-- will miss such cases. This happened in the wild and led to some
-- uncaught regressions, because for the default 7 runs we do for PRs,
-- the randomization distribution has only 16 values, so the max quantile
-- is actually 0.9375.
abs(diff) > report_threshold and abs(diff) >= stat_threshold as changed_fail,
abs(diff) > report_threshold - 0.05 and abs(diff) >= stat_threshold as changed_show,
not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,

View File

@ -0,0 +1,7 @@
<yandex>
<!-- Directory with user provided files that are accessible by 'file' table function. -->
<user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
<!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
<users_config>users.xml</users_config>
</yandex>
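The `user_files_path` above is the directory the `file` table function reads from; as a sketch (the file name and structure below are illustrative):

``` sql
-- Reads /var/lib/clickhouse/user_files/data.tsv with an explicit structure.
SELECT * FROM file('data.tsv', 'TSV', 'id UInt32, name String');
```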

View File

@ -19,4 +19,9 @@
<max_threads>12</max_threads>
</default>
</profiles>
<users>
<default>
<access_management>1</access_management>
</default>
</users>
</yandex>

View File

@ -1,4 +1,6 @@
-- input is table(test text, query text, run UInt32, version int, metrics Array(float))
-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
-- Run like this:
-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
select
arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
@ -8,14 +10,19 @@ select
from
(
-- quantiles of randomization distributions
-- note that for small number of runs, the exact quantile might not make
-- sense, because the last possible value of randomization distribution
-- might take a larger percentage of the distribution (i.e. the distribution
-- actually has discrete values, and the last step can be large).
select quantileExactForEach(0.99)(
arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
) threshold
---- uncomment to see what the distribution is really like
--, uniqExact(d.1) u
---- Uncomment to see what the distribution is really like. This debug
---- code only works for single (the first) metric.
--, uniqExact(d[1]) u
--, arraySort(x->x.1,
-- arrayZip(
-- (sumMap([d.1], [1]) as f).1,
-- (sumMap([d[1]], [1]) as f).1,
-- f.2)) full_histogram
from
(

View File

@ -26,4 +26,4 @@ The name of an additional section can be any, for example, **Usage**.
- [link](#)
[Original article](https://clickhouse.tech/docs/en/data_types/<data-type-name>/) <!--hide-->
[Original article](https://clickhouse.tech/docs/en/data-types/<data-type-name>/) <!--hide-->

View File

@ -38,20 +38,20 @@ SETTINGS
Required parameters:
- `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`).
- `kafka_topic_list` — A list of Kafka topics.
- `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere.
- `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
- `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`).
- `kafka_topic_list` — A list of Kafka topics.
- `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere.
- `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
Optional parameters:
- `kafka_row_delimiter` — Delimiter character, which ends the message.
- `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap'n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
- `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition.
- `kafka_max_block_size` - The maximum batch size (in messages) for poll (default: `max_block_size`).
- `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data).
- `kafka_commit_every_batch` - Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`).
- `kafka_thread_per_consumer` - Provide an independent thread for each consumer (default: `0`). When enabled, every consumer flushes the data independently, in parallel (otherwise - rows from several consumers are squashed to form one block).
- `kafka_row_delimiter` — Delimiter character, which ends the message.
- `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap'n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
- `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition.
- `kafka_max_block_size` — The maximum batch size (in messages) for poll (default: `max_block_size`).
- `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data).
- `kafka_commit_every_batch` — Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`).
- `kafka_thread_per_consumer` — Provide an independent thread for each consumer (default: `0`). When enabled, every consumer flushes the data independently, in parallel (otherwise — rows from several consumers are squashed to form one block).
Examples:
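A minimal definition, as a sketch (the broker address, topic, consumer group, and columns are illustrative):

``` sql
CREATE TABLE queue
(
    timestamp UInt64,
    level String,
    message String
) ENGINE = Kafka('localhost:9092', 'topic', 'group1', 'JSONEachRow');
```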

View File

@ -1956,8 +1956,8 @@ Default value: 16.
**See Also**
- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine
- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine.
- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine.
## validate_polygons {#validate_polygons}
@ -2658,8 +2658,6 @@ Result:
Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour.
[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists}
Allows selecting data from a file engine table when the underlying file does not exist.
@ -2679,3 +2677,16 @@ Possible values:
- 1 — Enabled.
Default value: `0`.
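As a sketch of the effect (the table name is illustrative, and the underlying file is assumed to be missing):

``` sql
CREATE TABLE file_log (id UInt32, msg String) ENGINE = File(TSV);
-- With the default value (0) this SELECT throws because the file does not exist;
-- with the setting enabled it returns an empty result instead.
SELECT * FROM file_log SETTINGS engine_file_empty_if_not_exists = 1;
```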
## allow_experimental_geo_types {#allow-experimental-geo-types}
Allows working with experimental [geo data types](../../sql-reference/data-types/geo.md).
Possible values:
- 0 — Working with geo data types is disabled.
- 1 — Working with geo data types is enabled.
Default value: `0`.
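For example, the setting has to be enabled before a table with a geo column can be created (a minimal sketch):

``` sql
SET allow_experimental_geo_types = 1;
CREATE TABLE points (p Point) ENGINE = Memory;
```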
[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->

View File

@ -20,7 +20,7 @@ System tables:
Most system tables store their data in RAM. The ClickHouse server creates such system tables at startup.
Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), crash_log and [text_log](../../operations/system-tables/text_log.md) are served by the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a storage filesystem by default. If you remove a table from the filesystem, the ClickHouse server creates an empty one again at the time of the next data write. If the schema of a system table changed in a new release, then ClickHouse renames the current table and creates a new one.
Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) and [text_log](../../operations/system-tables/text_log.md) are served by the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a filesystem by default. If you remove a table from the filesystem, the ClickHouse server creates an empty one again at the time of the next data write. If the schema of a system table changed in a new release, then ClickHouse renames the current table and creates a new one.
System log tables can be customized by creating a config file with the same name as the table under `/etc/clickhouse-server/config.d/`, or setting corresponding elements in `/etc/clickhouse-server/config.xml`. The elements that can be customized are:
@ -33,7 +33,7 @@ System log tables can be customized by creating a config file with the same name
An example:
```
```xml
<yandex>
<query_log>
<database>system</database>

View File

@ -253,8 +253,8 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
**Parameters**
- `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
- `mode` - It is an optional parameter.
- `window` — Length of the sliding window. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
- `mode` - An optional argument.
- `'strict'` - When `'strict'` is set, `windowFunnel()` applies the conditions only to unique values.
**Returned value**

View File

@ -0,0 +1,106 @@
---
toc_priority: 62
toc_title: Geo
---
# Geo Data Types {#geo-data-types}
ClickHouse supports data types for representing geographical objects — locations, lands, etc.
!!! warning "Warning"
Currently geo data types are an experimental feature. To work with them you must set `allow_experimental_geo_types = 1`.
**See Also**
- [Representing simple geographical features](https://en.wikipedia.org/wiki/GeoJSON).
- [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types) setting.
## Point {#point-data-type}
`Point` is represented by its X and Y coordinates, stored as a [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)).
**Example**
Query:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_point (p Point) ENGINE = Memory();
INSERT INTO geo_point VALUES((10, 10));
SELECT p, toTypeName(p) FROM geo_point;
```
Result:
``` text
┌─p───────┬─toTypeName(p)─┐
│ (10,10) │ Point         │
└─────────┴───────────────┘
```
## Ring {#ring-data-type}
`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)).
**Example**
Query:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_ring (r Ring) ENGINE = Memory();
INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]);
SELECT r, toTypeName(r) FROM geo_ring;
```
Result:
``` text
┌─r─────────────────────────────┬─toTypeName(r)─┐
│ [(0,0),(10,0),(10,10),(0,10)] │ Ring          │
└───────────────────────────────┴───────────────┘
```
## Polygon {#polygon-data-type}
`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). The first element of the outer array is the outer shape of the polygon, and all the following elements are holes.
**Example**
This is a polygon with one hole:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory();
INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]);
SELECT pg, toTypeName(pg) FROM geo_polygon;
```
Result:
``` text
┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐
│ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon        │
└───────────────────────────────────────────────────────────────┴────────────────┘
```
## MultiPolygon {#multipolygon-data-type}
`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)).
**Example**
This multipolygon consists of two separate polygons — the first one without holes, and the second with one hole:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory();
INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]);
SELECT mpg, toTypeName(mpg) FROM geo_multipolygon;
```
Result:
``` text
┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐
│ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon    │
└─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
```
[Original article](https://clickhouse.tech/docs/en/data-types/geo/) <!--hide-->

View File

@ -55,7 +55,7 @@ CREATE TABLE encryption_test
`comment` String,
`secret` String
)
ENGINE = Memory
ENGINE = Memory;
```
Insert some data (please avoid storing the keys/IVs in the database, as this undermines the whole concept of encryption); storing 'hints' is also unsafe and used only for illustrative purposes:
@ -110,7 +110,7 @@ Result:
Compatible with mysql encryption and resulting ciphertext can be decrypted with [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function.
Will produce same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `IV`.
Will produce the same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `iv`.
Supported encryption modes:
@ -138,7 +138,6 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
- Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Given equal input, `encrypt` and `aes_encrypt_mysql` produce the same ciphertext:
@ -157,7 +156,6 @@ Result:
└───────────────────┘
```
But `encrypt` fails when `key` or `iv` is longer than expected:
Query:
@ -252,7 +250,7 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad])
**Examples**
Re-using table from [encrypt](./encryption-functions.md#encrypt).
Re-using table from [encrypt](#encrypt).
Query:
@ -284,6 +282,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920
```
Result:
``` text
┌─comment─────────────────────────────┬─plaintext─┐
│ aes-256-cfb128 no IV                │ Secret    │
@ -294,7 +293,7 @@ Result:
└─────────────────────────────────────┴───────────┘
```
Notice how only portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
## aes_decrypt_mysql {#aes_decrypt_mysql}
@ -331,6 +330,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
**Examples**
Let's decrypt data we've previously encrypted with MySQL:
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
Query OK, 0 rows affected (0.00 sec)
@ -345,11 +345,13 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv
```
Query:
``` sql
SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext
```
Result:
``` text
┌─plaintext─┐
│ Secret    │

View File

@ -4,10 +4,8 @@ toc_title: ALL
# ALL Clause {#select-all}
`SELECT ALL` is identical to `SELECT` without `DISTINCT`.
If there are multiple matching rows in the table, then `ALL` returns all of them. `SELECT ALL` is identical to `SELECT` without `DISTINCT`. If both `ALL` and `DISTINCT` are specified, an exception will be thrown.
- If `ALL` is specified, ignore it.
- If both `ALL` and `DISTINCT` are specified, an exception will be thrown.
`ALL` can also be specified inside an aggregate function with the same effect (noop), for instance:
@ -19,3 +17,5 @@ equals to
```sql
SELECT sum(number) FROM numbers(10);
```
[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/select/all) <!--hide-->

View File

@ -31,21 +31,26 @@ SETTINGS
[kafka_schema = '',]
[kafka_num_consumers = N,]
[kafka_skip_broken_messages = N]
[kafka_commit_every_batch = 0,]
[kafka_thread_per_consumer = 0]
```
Required parameters:
- `kafka_broker_list` — a comma-separated list of brokers (`localhost:9092`).
- `kafka_topic_list` — a list of the required Kafka topics.
- `kafka_group_name` — a Kafka consumer group. Read offsets are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere.
- `kafka_format` — the message format. Format names must be the same as those that can be used in the `FORMAT` clause, for example, `JSONEachRow`. For details, see the [Formats](../../../interfaces/formats.md) section.
- `kafka_broker_list` — a comma-separated list of brokers (`localhost:9092`).
- `kafka_topic_list` — a list of the required Kafka topics.
- `kafka_group_name` — a Kafka consumer group. Read offsets are tracked for each group separately. If you don't want messages to be duplicated in the cluster, use the same group name everywhere.
- `kafka_format` — the message format. Format names must be the same as those that can be used in the `FORMAT` clause, for example, `JSONEachRow`. For details, see the [Formats](../../../interfaces/formats.md) section.
Optional parameters:
- `kafka_row_delimiter` — the delimiter character (record separator) that terminates a message.
- `kafka_schema` — an optional parameter, required if the format needs a schema definition. For example, [Cap'n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
- `kafka_num_consumers` — the number of consumers per table. Default: `1`. Specify more consumers if the throughput of a single consumer is insufficient. The total number of consumers must not exceed the number of partitions in the topic, since no more than one consumer can be assigned per partition.
- `kafka_skip_broken_messages` — the maximum number of unparseable messages per block. If `kafka_skip_broken_messages = N`, the engine skips `N` Kafka messages that could not be parsed. One message corresponds to exactly one row. Default value: 0.
- `kafka_row_delimiter` — the delimiter character (record separator) that terminates a message.
- `kafka_schema` — an optional parameter, required if the format needs a schema definition. For example, [Cap'n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
- `kafka_num_consumers` — the number of consumers per table. Default: `1`. Specify more consumers if the throughput of a single consumer is insufficient. The total number of consumers must not exceed the number of partitions in the topic, since no more than one consumer can be assigned per partition.
- `kafka_max_block_size` — the maximum batch size (in messages) for poll (default: `max_block_size`).
- `kafka_skip_broken_messages` — the maximum number of unparseable messages per block. If `kafka_skip_broken_messages = N`, the engine skips `N` Kafka messages that could not be parsed. One message corresponds to exactly one row. Default value: 0.
- `kafka_commit_every_batch` — enables or disables committing every consumed and handled batch separately instead of a single commit for a whole block (default: `0`).
- `kafka_thread_per_consumer` — enables or disables a dedicated thread for each consumer (default: `0`). When enabled, each consumer flushes data independently and in parallel; when disabled, rows from several consumers are squashed into a single block.
Examples

View File

@ -1937,6 +1937,21 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1;
Default value: 16.
## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size}
Sets the number of threads for background message streaming. The setting is applied at ClickHouse server startup and cannot be changed in a user session.
Possible values:
- Positive integer.
Default value: 16.
**See Also**
- The [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine.
- The [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine.
## format_avro_schema_registry_url {#format_avro_schema_registry_url}
Sets the URL of the [Confluent](https://docs.confluent.io/current/schema-registry/index.html) schema registry for use with the [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent) format.
@ -2537,4 +2552,15 @@ SELECT * FROM test2;
Note that this setting affects the behavior of [materialized views](../../sql-reference/statements/create/view.md#materialized) and the [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) database engine.
## allow_experimental_geo_types {#allow-experimental-geo-types}
Allows using experimental [data types for geographical structures](../../sql-reference/data-types/geo.md).
Possible values:
- 0 — Working with data types for geographical structures is not supported.
- 1 — Working with data types for geographical structures is supported.
Default value: `0`.
[Original article](https://clickhouse.tech/docs/ru/operations/settings/settings/) <!--hide-->

View File

@ -9,25 +9,54 @@ toc_title: "\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0435\u0020\u0442\u
System tables contain information about:
- Server state, processes, and environment.
- The server's internal processes.
- server state, processes, and environment.
- the server's internal processes.
System tables:
- Are located in the `system` database.
- Are available only for reading data.
- Cannot be dropped or altered, but can be detached.
- are located in the `system` database.
- are available only for reading data.
- cannot be dropped or altered, but can be detached.
The system tables `metric_log`, `query_log`, `query_thread_log`, `trace_log` store their data in a filesystem. The other system tables store their data in RAM. The ClickHouse server creates such system tables at startup.
Most system tables store their data in RAM. The ClickHouse server creates these system tables at startup.
Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) and [text_log](../../operations/system-tables/text_log.md) use the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a filesystem by default. If you remove a table from the filesystem, the ClickHouse server creates an empty one again at the time of the next data write. If the schema of a system table changed in a new release, ClickHouse renames the current table and creates a new one.
System log tables can be customized by creating a configuration file with the same name as the table under `/etc/clickhouse-server/config.d/`, or by setting the corresponding elements in `/etc/clickhouse-server/config.xml`. The elements that can be customized are:
- `database` — the database the system table belongs to. This option is deprecated. All system tables are located in the `system` database.
- `table` — the table to insert data into.
- `partition_by` — the [partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md).
- `ttl` — the table [TTL](../../sql-reference/statements/alter/ttl.md).
- `flush_interval_milliseconds` — the interval for flushing data to disk, in milliseconds.
- `engine` — the full engine name (starting with `ENGINE =`) with its parameters. This option conflicts with `partition_by` and `ttl`. If both are specified together, the server returns an error and exits.
Example:
```xml
<yandex>
<query_log>
<database>system</database>
<table>query_log</table>
<partition_by>toYYYYMM(event_date)</partition_by>
<ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
<!--
<engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
-->
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
</query_log>
</yandex>
```
By default, the table size is not limited. To manage a table's size, you can use [TTL](../../sql-reference/statements/alter/ttl.md#manipuliatsii-s-ttl-tablitsy) to delete outdated log records. You can also use the partitioning feature of `MergeTree` tables.
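As a sketch, assuming the default `system.query_log` table, outdated records could be expired like this:

``` sql
ALTER TABLE system.query_log MODIFY TTL event_date + INTERVAL 30 DAY;
```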
### Sources of System Metrics
To collect system metrics, the ClickHouse server uses:
- The `CAP_NET_ADMIN` capability.
- the `CAP_NET_ADMIN` capability.
- [procfs](https://ru.wikipedia.org/wiki/Procfs) (Linux only).
**procfs**
If the `CAP_NET_ADMIN` capability is not enabled for the ClickHouse server, it tries to fall back to `ProcfsMetricsProvider`. `ProcfsMetricsProvider` allows collecting per-query system metrics (for CPU and I/O).

View File

@ -239,7 +239,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
**Parameters**
- `window` — the width of the sliding time window, in seconds. [UInt](../../sql-reference/aggregate-functions/parametric-functions.md).
- `window` — the width of the sliding time window. The unit depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
- `mode` — an optional parameter. If set to `'strict'`, then `windowFunnel()` applies the conditions only to unique values.
- `timestamp` — the name of the column containing the timestamps. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) and other integer types. When timestamps are stored in columns of type `UInt64`, the maximum allowed value is the `Int64` limit, i.e. `2^63 - 1`.
- `cond` — conditions or data describing the chain of events. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md).
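A minimal usage sketch (the `events` table and event names are illustrative):

``` sql
-- For each user, compute how far they advanced through the view -> click -> buy
-- chain within a window of 3600 units of the ts column.
SELECT user_id,
       windowFunnel(3600)(ts, event = 'view', event = 'click', event = 'buy') AS level
FROM events
GROUP BY user_id;
```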

View File

@ -0,0 +1,106 @@
---
toc_priority: 62
toc_title: Geographical Structures
---
# Data Types for Working with Geographical Structures {#geo-data-types}
ClickHouse supports data types for representing geographical objects — points (locations), territories, etc.
!!! warning "Warning"
Currently, the use of data types for geographical structures is an experimental feature. To use these data types, enable the setting `allow_experimental_geo_types = 1`.
**See Also**
- [Storing geographical data structures](https://ru.wikipedia.org/wiki/GeoJSON).
- The [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types) setting.
## Point {#point-data-type}
The `Point` type is defined by a pair of X and Y coordinates and is stored as a [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)).
**Example**
Query:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_point (p Point) ENGINE = Memory();
INSERT INTO geo_point VALUES((10, 10));
SELECT p, toTypeName(p) FROM geo_point;
```
Result:
``` text
┌─p───────┬─toTypeName(p)─┐
│ (10,10) │ Point         │
└─────────┴───────────────┘
```
## Ring {#ring-data-type}
The `Ring` type describes a simple polygon without holes (inner areas), stored as an array of points: [Array](array.md)([Point](#point-data-type)).
**Example**
Query:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_ring (r Ring) ENGINE = Memory();
INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]);
SELECT r, toTypeName(r) FROM geo_ring;
```
Result:
``` text
┌─r─────────────────────────────┬─toTypeName(r)─┐
│ [(0,0),(10,0),(10,10),(0,10)] │ Ring          │
└───────────────────────────────┴───────────────┘
```
## Polygon {#polygon-data-type}
The `Polygon` type describes a polygon with holes (inner areas) and is stored as an array: [Array](array.md)([Ring](#ring-data-type)). The first element of the array describes the outer polygon (contour), and the remaining elements describe the holes.
**Example**
This table record describes a polygon with one hole:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory();
INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]);
SELECT pg, toTypeName(pg) FROM geo_polygon;
```
Result:
``` text
┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐
│ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon        │
└───────────────────────────────────────────────────────────────┴────────────────┘
```
## MultiPolygon {#multipolygon-data-type}
The `MultiPolygon` type describes an element consisting of several simple polygons (a polygonal mesh). It is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)).
**Example**
This table record describes an element consisting of two polygons — the first without holes and the second with one hole:
```sql
SET allow_experimental_geo_types = 1;
CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory();
INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]);
SELECT mpg, toTypeName(mpg) FROM geo_multipolygon;
```
Result:
``` text
┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐
│ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon    │
└─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
```
[Original article](https://clickhouse.tech/docs/ru/data-types/geo/) <!--hide-->

View File

@ -572,7 +572,7 @@ SOURCE(CLICKHOUSE(
or
``` sql
SOURCE(MONGO(
SOURCE(MONGODB(
host 'localhost'
port 27017
user ''

View File

@ -11,7 +11,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448
The initialization vector length is always 16 bytes (extra bytes are ignored).
Note that these functions are slow.
Note that prior to ClickHouse version 21.1 these functions were slow.
## encrypt {#encrypt}
@ -41,7 +41,7 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad])
**Returned value**
- Encrypted string. [String](../../sql-reference/data-types/string.md#string).
- Binary encrypted string. [String](../../sql-reference/data-types/string.md#string).
**Examples**
@ -52,57 +52,38 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad])
``` sql
CREATE TABLE encryption_test
(
input String,
key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
key32 String DEFAULT substring(key, 1, 32),
key24 String DEFAULT substring(key, 1, 24),
key16 String DEFAULT substring(key, 1, 16)
) Engine = Memory;
`comment` String,
`secret` String
)
ENGINE = Memory;
```
Insert this data:
Insert some data (note: do not store keys or initialization vectors in the database, as this undermines the whole concept of encryption; storing 'hints' is also unsafe and used only for illustrative purposes):
Query:
``` sql
INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\
('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212'));
```
Example without `iv`:
Query:
``` sql
SELECT 'aes-128-ecb' AS mode, hex(encrypt(mode, input, key16)) FROM encryption_test;
SELECT comment, hex(secret) FROM encryption_test;
```
Result:
``` text
┌─mode────────┬─hex(encrypt('aes-128-ecb', input, key16))────────────────────────┐
│ aes-128-ecb │ 4603E6862B0D94BBEC68E0B0DF51D60F │
│ aes-128-ecb │ 3004851B86D3F3950672DE7085D27C03 │
│ aes-128-ecb │ E807F8C8D40A11F65076361AFC7D8B68D8658C5FAA6457985CAA380F16B3F7E4 │
└─────────────┴──────────────────────────────────────────────────────────────────┘
```
Example with `iv`:
Query:
``` sql
SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test;
```
Result:
``` text
┌─mode────────┬─hex(encrypt('aes-256-ctr', input, key32, iv))─┐
│ aes-256-ctr │ │
│ aes-256-ctr │ 7FB039F7 │
│ aes-256-ctr │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2B325949 │
└─────────────┴───────────────────────────────────────────────┘
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
```
Example with `-gcm` mode:
@ -110,41 +91,27 @@ SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encrypti
Query:
``` sql
SELECT 'aes-256-gcm' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test;
INSERT INTO encryption_test VALUES('aes-256-gcm', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')), \
('aes-256-gcm with AAD', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv', 'aad'));
SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%';
```
Result:
``` text
┌─mode────────┬─hex(encrypt('aes-256-gcm', input, key32, iv))──────────────────────────┐
│ aes-256-gcm │ E99DBEBC01F021758352D7FBD9039EFA │
│ aes-256-gcm │ 8742CE3A7B0595B281C712600D274CA881F47414 │
│ aes-256-gcm │ A44FD73ACEB1A64BDE2D03808A2576EDBB60764CC6982DB9AF2C33C893D91B00C60DC5 │
└─────────────┴────────────────────────────────────────────────────────────────────────┘
```
Example with `-gcm` mode and `aad`:
Query:
``` sql
SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM encryption_test;
```
Result:
``` text
┌─mode────────┬─hex(encrypt('aes-192-gcm', input, key24, iv, 'AAD'))───────────────────┐
│ aes-192-gcm │ 04C13E4B1D62481ED22B3644595CB5DB │
│ aes-192-gcm │ 9A6CF0FD2B329B04EAD18301818F016DF8F77447 │
│ aes-192-gcm │ B961E9FD9B940EBAD7ADDA75C9F198A40797A5EA1722D542890CC976E21113BBB8A7AA │
└─────────────┴────────────────────────────────────────────────────────────────────────┘
┌─comment──────────────┬─hex(secret)──────────────────────────────────┐
│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
└──────────────────────┴──────────────────────────────────────────────┘
```
## aes_encrypt_mysql {#aes_encrypt_mysql}
Compatible with MySQL encryption; the resulting ciphertext can be decrypted with the [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function.
Given the same inputs, the ciphertext matches the one returned by `encrypt`. However, when `key` or `iv` are longer than they should be, `aes_encrypt_mysql` behaves the same way as `aes_encrypt` in MySQL: it folds the key and ignores the excess part of `iv`.
The function supports encryption of data in the following modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
@ -156,7 +123,7 @@ SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM e
**Syntax**
```sql
``` sql
aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
```
@ -164,78 +131,96 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
- `mode` — encryption mode. [String](../../sql-reference/data-types/string.md#string).
- `plaintext` — text to be encrypted. [String](../../sql-reference/data-types/string.md#string).
- `key` — encryption key. [String](../../sql-reference/data-types/string.md#string).
- `iv` — initialization vector. Optional. [String](../../sql-reference/data-types/string.md#string).
- `key` — encryption key. If the key is longer than the mode requires, a MySQL-specific key folding is performed. [String](../../sql-reference/data-types/string.md#string).
- `iv` — initialization vector. Optional; only the first 16 bytes are taken into account. [String](../../sql-reference/data-types/string.md#string).
**Returned value**
- Encrypted string. [String](../../sql-reference/data-types/string.md#string).
- Binary encrypted string. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Create this table:
Given the same inputs, `encrypt` and `aes_encrypt_mysql` produce the same encryption result.
Query:
``` sql
CREATE TABLE encryption_test
(
input String,
key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
key32 String DEFAULT substring(key, 1, 32),
key24 String DEFAULT substring(key, 1, 24),
key16 String DEFAULT substring(key, 1, 16)
) Engine = Memory;
```
Insert this data:
Query:
``` sql
INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
```
Example without `iv`:
Query:
``` sql
SELECT 'aes-128-cbc' AS mode, hex(aes_encrypt_mysql(mode, input, key32)) FROM encryption_test;
SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
```
Result:
``` text
┌─mode────────┬─hex(aes_encrypt_mysql('aes-128-cbc', input, key32))──────────────┐
│ aes-128-cbc │ FEA8CFDE6EE2C6E7A2CC6ADDC9F62C83 │
│ aes-128-cbc │ 78B16CD4BE107660156124C5FEE6454A │
│ aes-128-cbc │ 67C0B119D96F18E2823968D42871B3D179221B1E7EE642D628341C2B29BA2E18 │
└─────────────┴──────────────────────────────────────────────────────────────────┘
┌─ciphertexts_equal─┐
│ 1 │
└───────────────────┘
```
Example with `iv`:
The `encrypt` function throws an exception when `key` or `iv` is longer than needed:
Query:
``` sql
SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv)) FROM encryption_test;
SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
```
Result:
``` text
┌─mode───────────┬─hex(aes_encrypt_mysql('aes-256-cfb128', input, key32, iv))─┐
│ aes-256-cfb128 │ │
│ aes-256-cfb128 │ 7FB039F7 │
│ aes-256-cfb128 │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2BB5174F │
└────────────────┴────────────────────────────────────────────────────────────┘
Received exception from server (version 21.1.2):
Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123').
```
However, in the same case `aes_encrypt_mysql` returns a result that MySQL can process:
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
```
Result:
```text
┌─ciphertext───┐
│ 24E9E4966469 │
└──────────────┘
```
If you pass an even longer `iv`, the result stays the same:
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
```
Result:
``` text
┌─ciphertext───┐
│ 24E9E4966469 │
└──────────────┘
```
This matches the result MySQL returns for the same inputs:
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
Query OK, 0 rows affected (0.00 sec)
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
+------------------------+
| ciphertext |
+------------------------+
| 0x24E9E4966469 |
+------------------------+
1 row in set (0.00 sec)
```
## decrypt {#decrypt}
The function supports decryption of data in the following modes:
The function decrypts ciphertext and can work in the following modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
@ -247,7 +232,7 @@ SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv))
**Syntax**
```sql
``` sql
decrypt('mode', 'ciphertext', 'key' [, iv, aad])
```
@ -265,52 +250,58 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad])
**Examples**
Create this table:
Let's reuse the table from the [encrypt](#encrypt) example.
Query:
``` sql
CREATE TABLE encryption_test
(
input String,
key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
key32 String DEFAULT substring(key, 1, 32),
key24 String DEFAULT substring(key, 1, 24),
key16 String DEFAULT substring(key, 1, 16)
) Engine = Memory;
```
Insert this data:
Query:
``` sql
INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
```
Query:
``` sql
SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16) FROM encryption_test;
SELECT comment, hex(secret) FROM encryption_test;
```
Result:
```text
┌─mode────────┬─decrypt('aes-128-ecb', encrypt('aes-128-ecb', input, key16), key16)─┐
│ aes-128-ecb │ │
│ aes-128-ecb │ text │
│ aes-128-ecb │ What Is ClickHouse? │
└─────────────┴─────────────────────────────────────────────────────────────────────┘
``` text
┌─comment──────────────┬─hex(secret)──────────────────────────────────┐
│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
└──────────────────────┴──────────────────────────────────────────────┘
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
```
Now let's try to decrypt this data:
Query:
``` sql
SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test;
```
Result:
``` text
┌─comment─────────────────────────────┬─plaintext─┐
│ aes-256-cfb128 no IV                │ Secret    │
│ aes-256-cfb128 no IV, different key │ �4�
�
│ aes-256-cfb128 with IV              │ ���6�~    │
│ aes-256-cbc no IV                   │ �2*4�h3c�4w��@
└─────────────────────────────────────┴───────────┘
```
Note that only a portion of the data was decrypted correctly; the rest is gibberish, since different values of `mode`, `key`, or `iv` were used during encryption.
## aes_decrypt_mysql {#aes_decrypt_mysql}
Compatible with MySQL encryption; it can decrypt data encrypted with the [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt) function.
The function supports decryption of data in the following modes:
Given the same inputs, the plaintext matches the one returned by `decrypt`. However, when `key` or `iv` are longer than they should be, `aes_decrypt_mysql` behaves the same way as `aes_decrypt` in MySQL: it folds the key and ignores the excess part of `iv`.
The function supports decryption of data in the following modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
@ -321,7 +312,7 @@ SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16)
**Syntax**
```sql
``` sql
aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
```
@ -332,51 +323,39 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
- `key` — encryption key. [String](../../sql-reference/data-types/string.md#string).
- `iv` — initialization vector. Optional. [String](../../sql-reference/data-types/string.md#string).
**Returned value**
- Decrypted string. [String](../../sql-reference/data-types/string.md#string).
**Examples**
Create this table:
Let's decrypt data that was previously encrypted with MySQL:
Query:
``` sql
CREATE TABLE encryption_test
(
input String,
key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
key32 String DEFAULT substring(key, 1, 32),
key24 String DEFAULT substring(key, 1, 24),
key16 String DEFAULT substring(key, 1, 16)
) Engine = Memory;
```
mysql> SET block_encryption_mode='aes-256-cfb128';
Query OK, 0 rows affected (0.00 sec)
Insert this data:
Query:
``` sql
INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
+------------------------+
| ciphertext |
+------------------------+
| 0x24E9E4966469 |
+------------------------+
1 row in set (0.00 sec)
```
Query:
``` sql
SELECT 'aes-128-cbc' AS mode, aes_decrypt_mysql(mode, aes_encrypt_mysql(mode, input, key), key) FROM encryption_test;
SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext;
```
Result:
``` text
┌─mode────────┬─aes_decrypt_mysql('aes-128-cbc', aes_encrypt_mysql('aes-128-cbc', input, key), key)─┐
│ aes-128-cbc │ │
│ aes-128-cbc │ text │
│ aes-128-cbc │ What Is ClickHouse? │
└─────────────┴─────────────────────────────────────────────────────────────────────────────────────┘
┌─plaintext─┐
│ Secret    │
└───────────┘
```
[Original article](https://clickhouse.tech/docs/ru/sql-reference/functions/encryption_functions/) <!--hide-->

View File

@ -0,0 +1,22 @@
---
toc_title: ALL
---
# ALL Clause {#select-all}
If there are multiple matching rows in the table, `ALL` returns all of them. `SELECT ALL` behaves exactly the same as `SELECT` without `DISTINCT`. If both `ALL` and `DISTINCT` are specified, an exception will be thrown.
`ALL` can be specified inside an aggregate function, for example, the result of the query:
```sql
SELECT sum(ALL number) FROM numbers(10);
```
is equal to the result of the query:
```sql
SELECT sum(number) FROM numbers(10);
```
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/select/all) <!--hide-->

View File

@ -66,6 +66,7 @@ namespace ErrorCodes
extern const int CANNOT_OPEN_FILE;
extern const int SYSTEM_ERROR;
extern const int NOT_ENOUGH_SPACE;
extern const int CANNOT_KILL;
}
}
@ -886,6 +887,27 @@ namespace
fmt::print("Sent kill signal.\n", pid);
else
throwFromErrno("Cannot send kill signal", ErrorCodes::SYSTEM_ERROR);
/// Wait for the process (100 seconds).
constexpr size_t num_kill_check_tries = 1000;
constexpr size_t kill_check_delay_ms = 100;
for (size_t i = 0; i < num_kill_check_tries; ++i)
{
fmt::print("Waiting for server to be killed\n");
if (!isRunning(pid_file))
{
fmt::print("Server exited\n");
break;
}
sleepForMilliseconds(kill_check_delay_ms);
}
if (isRunning(pid_file))
{
throw Exception(ErrorCodes::CANNOT_KILL,
"The server process still exists after {} tries (delay: {} ms)",
num_kill_check_tries, kill_check_delay_ms);
}
}
return 0;

View File

@ -1017,17 +1017,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created"
" (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe).");
if (has_zookeeper && config().has("distributed_ddl"))
{
/// DDL worker should be started after all tables were loaded
String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
int pool_size = config().getInt("distributed_ddl.pool_size", 1);
if (pool_size < 1)
throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
"distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
}
std::unique_ptr<DNSCacheUpdater> dns_cache_updater;
if (config().has("disable_internal_dns_cache") && config().getInt("disable_internal_dns_cache"))
{
@ -1309,6 +1298,37 @@ int Server::main(const std::vector<std::string> & /*args*/)
std::thread::hardware_concurrency());
}
/// try to load dictionaries immediately, throw on error and die
ext::scope_guard dictionaries_xmls, models_xmls;
try
{
if (!config().getBool("dictionaries_lazy_load", true))
{
global_context->tryCreateEmbeddedDictionaries();
global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true);
}
dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository(
std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "dictionaries_config"));
models_xmls = global_context->getExternalModelsLoader().addConfigRepository(
std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "models_config"));
}
catch (...)
{
LOG_ERROR(log, "Caught exception while loading dictionaries.");
throw;
}
if (has_zookeeper && config().has("distributed_ddl"))
{
/// DDL worker should be started after all tables were loaded
String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
int pool_size = config().getInt("distributed_ddl.pool_size", 1);
if (pool_size < 1)
throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
"distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
}
LOG_INFO(log, "Ready for connections.");
SCOPE_EXIT({
@ -1358,26 +1378,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
}
});
/// try to load dictionaries immediately, throw on error and die
ext::scope_guard dictionaries_xmls, models_xmls;
try
{
if (!config().getBool("dictionaries_lazy_load", true))
{
global_context->tryCreateEmbeddedDictionaries();
global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true);
}
dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository(
std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "dictionaries_config"));
models_xmls = global_context->getExternalModelsLoader().addConfigRepository(
std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "models_config"));
}
catch (...)
{
LOG_ERROR(log, "Caught exception while loading dictionaries.");
throw;
}
std::vector<std::unique_ptr<MetricsTransmitter>> metrics_transmitters;
for (const auto & graphite_key : DB::getMultipleKeysFromConfig(config(), "", "graphite"))
{

View File

@ -892,6 +892,19 @@
<!-- Controls how much ON CLUSTER queries can be run simultaneously. -->
<!-- <pool_size>1</pool_size> -->
<!--
Cleanup settings (active tasks will not be removed)
-->
<!-- Controls task TTL (default 1 week) -->
<!-- <task_max_lifetime>604800</task_max_lifetime> -->
<!-- Controls how often cleanup should be performed (in seconds) -->
<!-- <cleanup_delay_period>60</cleanup_delay_period> -->
<!-- Controls how many tasks could be in the queue -->
<!-- <max_tasks_in_queue>1000</max_tasks_in_queue> -->
</distributed_ddl>
<!-- Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h -->

View File

@ -118,6 +118,8 @@ public:
WhichDataType value_type_to_check(value_type);
/// Do not promote decimal because of implementation issues of this function design
/// Currently we cannot get the result column type; in the case of decimal we cannot get the decimal scale
/// in the method void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override.
/// If we decide to make this function more efficient, we should promote the decimal type during summation.
if (value_type_to_check.isDecimal())
result_type = value_type_without_nullable;

View File

@ -101,8 +101,8 @@ endif()
list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD})
list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON})
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp)
list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h)
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp Functions/FunctionsLogical.cpp)
list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h Functions/FunctionsLogical.h)
list (APPEND dbms_sources
AggregateFunctions/AggregateFunctionFactory.cpp

View File

@ -109,6 +109,8 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
}
in = std::make_shared<ReadBufferFromPocoSocket>(*socket);
in->setAsyncCallback(std::move(async_callback));
out = std::make_shared<WriteBufferFromPocoSocket>(*socket);
connected = true;
@ -753,15 +755,8 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
}
Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
Packet Connection::receivePacket()
{
in->setAsyncCallback(std::move(async_callback));
SCOPE_EXIT({
/// disconnect() will reset "in".
if (in)
in->setAsyncCallback({});
});
try
{
Packet res;

View File

@ -27,7 +27,6 @@
#include <atomic>
#include <optional>
namespace DB
{
@ -175,8 +174,7 @@ public:
std::optional<UInt64> checkPacket(size_t timeout_microseconds = 0);
/// Receive packet from server.
/// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it.
Packet receivePacket(std::function<void(Poco::Net::Socket &)> async_callback = {});
Packet receivePacket();
/// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception.
void forceConnected(const ConnectionTimeouts & timeouts);
@ -195,6 +193,16 @@ public:
size_t outBytesCount() const { return out ? out->count() : 0; }
size_t inBytesCount() const { return in ? in->count() : 0; }
Poco::Net::Socket * getSocket() { return socket.get(); }
/// Each time a read from the socket blocks and async_callback is set, it will be called. You can poll the socket inside it.
void setAsyncCallback(AsyncCallback async_callback_)
{
async_callback = std::move(async_callback_);
if (in)
in->setAsyncCallback(std::move(async_callback));
}
private:
String host;
UInt16 port;
@ -282,6 +290,8 @@ private:
LoggerWrapper log_wrapper;
AsyncCallback async_callback = {};
void connect(const ConnectionTimeouts & timeouts);
void sendHello();
void receiveHello();
@ -307,4 +317,20 @@ private:
[[noreturn]] void throwUnexpectedPacket(UInt64 packet_type, const char * expected) const;
};
class AsyncCallbackSetter
{
public:
AsyncCallbackSetter(Connection * connection_, AsyncCallback async_callback) : connection(connection_)
{
connection->setAsyncCallback(std::move(async_callback));
}
~AsyncCallbackSetter()
{
connection->setAsyncCallback({});
}
private:
Connection * connection;
};
}
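As a usage sketch for the RAII helper above: install a polling callback for the duration of one receive, and let the destructor clear it even if receivePacket() throws. The wrapper function is hypothetical, and the callback body is a placeholder; a real caller would wait on the descriptor (e.g. via epoll) instead of doing nothing.

/// Hypothetical wrapper: receive one packet with a temporary async callback installed.
Packet receiveWithPolling(Connection & connection)
{
    AsyncCallbackSetter setter(&connection, [](int /* fd */, const Poco::Timespan & /* timeout */, const std::string & /* description */)
    {
        /// Placeholder: wait until the fd is ready for reading instead of blocking in the socket read.
    });
    /// ~AsyncCallbackSetter resets the callback even if this throws.
    return connection.receivePacket();
}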

View File

@ -0,0 +1,239 @@
#include <Client/ConnectionEstablisher.h>
#include <Common/quoteString.h>
#include <Common/ProfileEvents.h>
namespace ProfileEvents
{
extern const Event DistributedConnectionMissingTable;
extern const Event DistributedConnectionStaleReplica;
}
namespace DB
{
namespace ErrorCodes
{
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int NETWORK_ERROR;
extern const int SOCKET_TIMEOUT;
}
ConnectionEstablisher::ConnectionEstablisher(
IConnectionPool * pool_,
const ConnectionTimeouts * timeouts_,
const Settings * settings_,
Poco::Logger * log_,
const QualifiedTableName * table_to_check_)
: pool(pool_), timeouts(timeouts_), settings(settings_), log(log_), table_to_check(table_to_check_), is_finished(false)
{
}
void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::string & fail_message)
{
is_finished = false;
SCOPE_EXIT(is_finished = true);
try
{
result.entry = pool->get(*timeouts, settings, /* force_connected = */ false);
AsyncCallbackSetter async_setter(&*result.entry, std::move(async_callback));
UInt64 server_revision = 0;
if (table_to_check)
server_revision = result.entry->getServerRevision(*timeouts);
if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS)
{
result.entry->forceConnected(*timeouts);
result.is_usable = true;
result.is_up_to_date = true;
return;
}
/// Only status of the remote table corresponding to the Distributed table is taken into account.
/// TODO: request status for joined tables also.
TablesStatusRequest status_request;
status_request.tables.emplace(*table_to_check);
TablesStatusResponse status_response = result.entry->getTablesStatus(*timeouts, status_request);
auto table_status_it = status_response.table_states_by_id.find(*table_to_check);
if (table_status_it == status_response.table_states_by_id.end())
{
const char * message_pattern = "There is no table {}.{} on server: {}";
fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription());
LOG_WARNING(log, fail_message);
ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
return;
}
result.is_usable = true;
UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
if (!max_allowed_delay)
{
result.is_up_to_date = true;
return;
}
UInt32 delay = table_status_it->second.absolute_delay;
if (delay < max_allowed_delay)
result.is_up_to_date = true;
else
{
result.is_up_to_date = false;
result.staleness = delay;
LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay);
ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
}
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
&& e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throw;
fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);
if (!result.entry.isNull())
{
result.entry->disconnect();
result.reset();
}
}
}
#if defined(OS_LINUX)
ConnectionEstablisherAsync::ConnectionEstablisherAsync(
IConnectionPool * pool_,
const ConnectionTimeouts * timeouts_,
const Settings * settings_,
Poco::Logger * log_,
const QualifiedTableName * table_to_check_)
: connection_establisher(pool_, timeouts_, settings_, log_, table_to_check_)
{
epoll.add(receive_timeout.getDescriptor());
}
void ConnectionEstablisherAsync::Routine::ReadCallback::operator()(int fd, const Poco::Timespan & timeout, const std::string &)
{
/// Check if it's the first time and we need to add socket fd to epoll.
if (connection_establisher_async.socket_fd == -1)
{
connection_establisher_async.epoll.add(fd);
connection_establisher_async.socket_fd = fd;
}
connection_establisher_async.receive_timeout.setRelative(timeout);
fiber = std::move(fiber).resume();
connection_establisher_async.receive_timeout.reset();
}
Fiber ConnectionEstablisherAsync::Routine::operator()(Fiber && sink)
{
try
{
connection_establisher_async.connection_establisher.setAsyncCallback(ReadCallback{connection_establisher_async, sink});
connection_establisher_async.connection_establisher.run(connection_establisher_async.result, connection_establisher_async.fail_message);
}
catch (const boost::context::detail::forced_unwind &)
{
/// This exception is thrown by the fiber implementation when the fiber is being deleted but hasn't exited.
/// It should not be caught or it will segfault.
/// Other exceptions must be caught.
throw;
}
catch (...)
{
connection_establisher_async.exception = std::current_exception();
}
return std::move(sink);
}
std::variant<int, ConnectionEstablisher::TryResult> ConnectionEstablisherAsync::resume()
{
if (!fiber_created)
{
reset();
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
fiber_created = true;
} else if (!checkReceiveTimeout())
return result;
fiber = std::move(fiber).resume();
if (exception)
std::rethrow_exception(std::move(exception));
if (connection_establisher.isFinished())
{
destroyFiber();
return result;
}
return epoll.getFileDescriptor();
}
bool ConnectionEstablisherAsync::checkReceiveTimeout()
{
bool is_socket_ready = false;
bool is_receive_timeout_alarmed = false;
epoll_event events[2];
events[0].data.fd = events[1].data.fd = -1;
size_t ready_count = epoll.getManyReady(2, events, false);
for (size_t i = 0; i != ready_count; ++i)
{
if (events[i].data.fd == socket_fd)
is_socket_ready = true;
if (events[i].data.fd == receive_timeout.getDescriptor())
is_receive_timeout_alarmed = true;
}
if (is_receive_timeout_alarmed && !is_socket_ready)
{
destroyFiber();
/// In the non-async case this exception would be thrown and caught in ConnectionEstablisher::run,
/// but in the async case we process the timeout outside and cannot throw an exception. So we just save the fail message.
fail_message = "Timeout exceeded while reading from socket (" + result.entry->getDescription() + ")";
epoll.remove(socket_fd);
resetResult();
return false;
}
return true;
}
void ConnectionEstablisherAsync::cancel()
{
destroyFiber();
reset();
}
void ConnectionEstablisherAsync::reset()
{
resetResult();
fail_message.clear();
socket_fd = -1;
}
void ConnectionEstablisherAsync::resetResult()
{
if (!result.entry.isNull())
{
result.entry->disconnect();
result.reset();
}
}
void ConnectionEstablisherAsync::destroyFiber()
{
Fiber to_destroy = std::move(fiber);
fiber_created = false;
}
#endif
}

View File

@ -0,0 +1,131 @@
#pragma once
#include <variant>
#include <Common/Epoll.h>
#include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
#include <Common/PoolWithFailoverBase.h>
#include <Client/ConnectionPool.h>
namespace DB
{
/// Class for establishing a connection to the replica. It supports setting up
/// an async callback that will be called when reading from the socket blocks.
class ConnectionEstablisher
{
public:
using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
ConnectionEstablisher(IConnectionPool * pool_,
const ConnectionTimeouts * timeouts_,
const Settings * settings_,
Poco::Logger * log,
const QualifiedTableName * table_to_check = nullptr);
/// Establish connection and save it in result, write possible exception message in fail_message.
void run(TryResult & result, std::string & fail_message);
/// Set an async callback that will be called when reading from the socket blocks.
void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
bool isFinished() const { return is_finished; }
private:
IConnectionPool * pool;
const ConnectionTimeouts * timeouts;
const Settings * settings;
Poco::Logger * log;
const QualifiedTableName * table_to_check;
bool is_finished;
AsyncCallback async_callback = {};
};
#if defined(OS_LINUX)
/// Class for establishing a connection to the replica without blocking.
/// It runs the connection establishment process in a fiber and sets a special
/// read callback which is called when reading from the socket blocks.
/// When the read callback is called, the socket and the receive timeout are added to epoll
/// and execution returns to the main program.
/// So, you can poll this epoll file descriptor to determine when to resume.
class ConnectionEstablisherAsync
{
public:
using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
ConnectionEstablisherAsync(IConnectionPool * pool_,
const ConnectionTimeouts * timeouts_,
const Settings * settings_,
Poco::Logger * log_,
const QualifiedTableName * table_to_check = nullptr);
/// Resume establishing the connection. If the process has not finished,
/// return a file descriptor (you can add it to epoll and poll it;
/// when this fd becomes ready, call resume again).
/// If the process failed or finished, return its result.
std::variant<int, TryResult> resume();
/// Cancel establishing the connection. The fiber will be destroyed
/// and the class will be reset to its initial state.
void cancel();
TryResult getResult() const { return result; }
const std::string & getFailMessage() const { return fail_message; }
private:
/// When the epoll file descriptor is ready, check whether it's an expired timeout.
/// Return false if the receive timeout has expired and the socket is not ready, true otherwise.
bool checkReceiveTimeout();
struct Routine
{
ConnectionEstablisherAsync & connection_establisher_async;
struct ReadCallback
{
ConnectionEstablisherAsync & connection_establisher_async;
Fiber & fiber;
void operator()(int fd, const Poco::Timespan & timeout, const std::string &);
};
Fiber operator()(Fiber && sink);
};
void reset();
void resetResult();
void destroyFiber();
ConnectionEstablisher connection_establisher;
TryResult result;
std::string fail_message;
Fiber fiber;
FiberStack fiber_stack;
/// We use a timer descriptor to check the socket receive timeout.
TimerDescriptor receive_timeout;
/// In the read callback we add the socket file descriptor and the receive-timeout timer descriptor
/// to epoll, so we can return the epoll file descriptor outside for polling.
Epoll epoll;
int socket_fd = -1;
std::string socket_description;
/// If an exception occurred during fiber resume, we save it and rethrow.
std::exception_ptr exception;
bool fiber_created = false;
};
#endif
}
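To make the resume-based contract concrete, here is a minimal hypothetical driver: wait on the returned descriptor with plain poll(2) and resume until the result is ready. Real callers (HedgedConnectionsFactory below) multiplex many establishers through a shared epoll instead of blocking on one.

#include <poll.h>

/// Hypothetical helper: drive one async establisher to completion, blocking as needed.
ConnectionEstablisher::TryResult establishBlocking(ConnectionEstablisherAsync & establisher)
{
    while (true)
    {
        auto res = establisher.resume();
        /// Finished (successfully or not): inspect getFailMessage() on an empty entry.
        if (std::holds_alternative<ConnectionEstablisher::TryResult>(res))
            return std::get<ConnectionEstablisher::TryResult>(res);

        /// Not finished: wait until the establisher's epoll descriptor becomes ready, then resume again.
        pollfd pfd{};
        pfd.fd = std::get<int>(res);
        pfd.events = POLLIN;
        ::poll(&pfd, 1, /* timeout = */ -1);
    }
}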

View File

@ -1,4 +1,5 @@
#include <Client/ConnectionPoolWithFailover.h>
#include <Client/ConnectionEstablisher.h>
#include <Poco/Net/NetException.h>
#include <Poco/Net/DNS.h>
@ -23,9 +24,6 @@ namespace DB
namespace ErrorCodes
{
extern const int ATTEMPT_TO_READ_AFTER_EOF;
extern const int NETWORK_ERROR;
extern const int SOCKET_TIMEOUT;
extern const int LOGICAL_ERROR;
}
@ -172,6 +170,43 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
return getManyImpl(settings, pool_mode, try_get_entry);
}
ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::makeGetPriorityFunc(const Settings * settings)
{
size_t offset = 0;
if (settings)
offset = settings->load_balancing_first_offset % nested_pools.size();
GetPriorityFunc get_priority;
switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
{
case LoadBalancing::NEAREST_HOSTNAME:
get_priority = [&](size_t i) { return hostname_differences[i]; };
break;
case LoadBalancing::IN_ORDER:
get_priority = [](size_t i) { return i; };
break;
case LoadBalancing::RANDOM:
break;
case LoadBalancing::FIRST_OR_RANDOM:
get_priority = [offset](size_t i) -> size_t { return i != offset; };
break;
case LoadBalancing::ROUND_ROBIN:
if (last_used >= nested_pools.size())
last_used = 0;
++last_used;
/* Consider nested_pools.size() equal to 5:
 * last_used = 1 -> get_priority: 0 1 2 3 4
 * last_used = 2 -> get_priority: 4 0 1 2 3
 * last_used = 3 -> get_priority: 4 3 0 1 2
 * ...
 * */
get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; };
break;
}
return get_priority;
}
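Since the ROUND_ROBIN branch is the least obvious, here is a tiny standalone program (illustration only) that evaluates the same priority expression as the lambda above for nested_pools.size() == 5 and reproduces the table from the comment:

#include <cstddef>
#include <cstdio>

int main()
{
    const size_t pool_count = 5;
    for (size_t last_used = 1; last_used <= 3; ++last_used)
    {
        std::printf("last_used = %zu ->", last_used);
        for (size_t i = 0; i != pool_count; ++i)
        {
            size_t j = i + 1;   /// The lambda increments i before using it.
            std::printf(" %zu", j < last_used ? pool_count - j : j - last_used);
        }
        std::printf("\n");
    }
    /// Output:
    /// last_used = 1 -> 0 1 2 3 4
    /// last_used = 2 -> 4 0 1 2 3
    /// last_used = 3 -> 4 3 0 1 2
}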
std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyImpl(
const Settings * settings,
PoolMode pool_mode,
@ -194,36 +229,7 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
else
throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR);
size_t offset = 0;
if (settings)
offset = settings->load_balancing_first_offset % nested_pools.size();
GetPriorityFunc get_priority;
switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
{
case LoadBalancing::NEAREST_HOSTNAME:
get_priority = [&](size_t i) { return hostname_differences[i]; };
break;
case LoadBalancing::IN_ORDER:
get_priority = [](size_t i) { return i; };
break;
case LoadBalancing::RANDOM:
break;
case LoadBalancing::FIRST_OR_RANDOM:
get_priority = [offset](size_t i) -> size_t { return i != offset; };
break;
case LoadBalancing::ROUND_ROBIN:
if (last_used >= nested_pools.size())
last_used = 0;
++last_used;
/* Consider nested_pools.size() equals to 5
* last_used = 1 -> get_priority: 0 1 2 3 4
* last_used = 2 -> get_priority: 5 0 1 2 3
* last_used = 3 -> get_priority: 5 4 0 1 2
* ...
* */
get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; };
break;
}
GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0;
bool fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries.value : true;
@ -241,77 +247,17 @@ ConnectionPoolWithFailover::tryGetEntry(
const Settings * settings,
const QualifiedTableName * table_to_check)
{
ConnectionEstablisher connection_establisher(&pool, &timeouts, settings, log, table_to_check);
TryResult result;
try
{
result.entry = pool.get(timeouts, settings, /* force_connected = */ false);
UInt64 server_revision = 0;
if (table_to_check)
server_revision = result.entry->getServerRevision(timeouts);
if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS)
{
result.entry->forceConnected(timeouts);
result.is_usable = true;
result.is_up_to_date = true;
return result;
}
/// Only status of the remote table corresponding to the Distributed table is taken into account.
/// TODO: request status for joined tables also.
TablesStatusRequest status_request;
status_request.tables.emplace(*table_to_check);
TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request);
auto table_status_it = status_response.table_states_by_id.find(*table_to_check);
if (table_status_it == status_response.table_states_by_id.end())
{
const char * message_pattern = "There is no table {}.{} on server: {}";
fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription());
LOG_WARNING(log, fail_message);
ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
return result;
}
result.is_usable = true;
UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
if (!max_allowed_delay)
{
result.is_up_to_date = true;
return result;
}
UInt32 delay = table_status_it->second.absolute_delay;
if (delay < max_allowed_delay)
result.is_up_to_date = true;
else
{
result.is_up_to_date = false;
result.staleness = delay;
LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay);
ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
}
}
catch (const Exception & e)
{
if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
&& e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
throw;
fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);
if (!result.entry.isNull())
{
result.entry->disconnect();
result.reset();
}
}
connection_establisher.run(result, fail_message);
return result;
}
std::vector<ConnectionPoolWithFailover::Base::ShuffledPool> ConnectionPoolWithFailover::getShuffledPools(const Settings * settings)
{
GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0;
return Base::getShuffledPools(max_ignored_errors, get_priority);
}
}

View File

@ -80,6 +80,15 @@ public:
using Status = std::vector<NestedPoolStatus>;
Status getStatus() const;
std::vector<Base::ShuffledPool> getShuffledPools(const Settings * settings);
size_t getMaxErrorCap() const { return Base::max_error_cap; }
void updateSharedError(std::vector<ShuffledPool> & shuffled_pools)
{
Base::updateSharedErrorCounts(shuffled_pools);
}
private:
/// Get the values of relevant settings and call Base::getMany()
std::vector<TryResult> getManyImpl(
@ -97,6 +106,8 @@ private:
const Settings * settings,
const QualifiedTableName * table_to_check = nullptr);
GetPriorityFunc makeGetPriorityFunc(const Settings * settings);
private:
std::vector<size_t> hostname_differences; /// Distances from name of this host to the names of hosts of pools.
size_t last_used = 0; /// Last used for round_robin policy.

View File

@ -0,0 +1,524 @@
#if defined(OS_LINUX)
#include <Client/HedgedConnections.h>
#include <Interpreters/ClientInfo.h>
namespace DB
{
namespace ErrorCodes
{
extern const int MISMATCH_REPLICAS_DATA_SOURCES;
extern const int LOGICAL_ERROR;
extern const int SOCKET_TIMEOUT;
extern const int ALL_CONNECTION_TRIES_FAILED;
}
HedgedConnections::HedgedConnections(
const ConnectionPoolWithFailoverPtr & pool_,
const Settings & settings_,
const ConnectionTimeouts & timeouts_,
const ThrottlerPtr & throttler_,
PoolMode pool_mode,
std::shared_ptr<QualifiedTableName> table_to_check_)
: hedged_connections_factory(pool_, &settings_, timeouts_, table_to_check_)
, settings(settings_)
, throttler(throttler_)
{
std::vector<Connection *> connections = hedged_connections_factory.getManyConnections(pool_mode);
if (connections.empty())
return;
offset_states.reserve(connections.size());
for (size_t i = 0; i != connections.size(); ++i)
{
offset_states.emplace_back();
offset_states[i].replicas.emplace_back(connections[i]);
offset_states[i].active_connection_count = 1;
ReplicaState & replica = offset_states[i].replicas.back();
replica.connection->setThrottler(throttler_);
epoll.add(replica.packet_receiver->getFileDescriptor());
fd_to_replica_location[replica.packet_receiver->getFileDescriptor()] = ReplicaLocation{i, 0};
epoll.add(replica.change_replica_timeout.getDescriptor());
timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{i, 0};
}
active_connection_count = connections.size();
offsets_with_disabled_changing_replica = 0;
pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); });
}
void HedgedConnections::Pipeline::add(std::function<void(ReplicaState & replica)> send_function)
{
pipeline.push_back(send_function);
}
void HedgedConnections::Pipeline::run(ReplicaState & replica)
{
for (auto & send_func : pipeline)
send_func(replica);
}
void HedgedConnections::sendScalarsData(Scalars & data)
{
std::lock_guard lock(cancel_mutex);
if (!sent_query)
throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR);
auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); };
for (auto & offset_state : offset_states)
for (auto & replica : offset_state.replicas)
if (replica.connection)
send_scalars_data(replica);
pipeline_for_new_replicas.add(send_scalars_data);
}
void HedgedConnections::sendExternalTablesData(std::vector<ExternalTablesData> & data)
{
std::lock_guard lock(cancel_mutex);
if (!sent_query)
throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR);
if (data.size() != size())
throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES);
auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); };
for (auto & offset_state : offset_states)
for (auto & replica : offset_state.replicas)
if (replica.connection)
send_external_tables_data(replica);
pipeline_for_new_replicas.add(send_external_tables_data);
}
void HedgedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuids)
{
std::lock_guard lock(cancel_mutex);
if (sent_query)
throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR);
auto send_ignored_part_uuids = [&uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); };
for (auto & offset_state : offset_states)
for (auto & replica : offset_state.replicas)
if (replica.connection)
send_ignored_part_uuids(replica);
pipeline_for_new_replicas.add(send_ignored_part_uuids);
}
void HedgedConnections::sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id,
UInt64 stage,
const ClientInfo & client_info,
bool with_pending_data)
{
std::lock_guard lock(cancel_mutex);
if (sent_query)
throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR);
for (auto & offset_state : offset_states)
{
for (auto & replica : offset_state.replicas)
{
if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD)
{
disable_two_level_aggregation = true;
break;
}
}
if (disable_two_level_aggregation)
break;
}
if (!disable_two_level_aggregation)
{
/// Tell hedged_connections_factory to skip replicas that don't support two-level aggregation.
hedged_connections_factory.skipReplicasWithTwoLevelAggregationIncompatibility();
}
auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica)
{
Settings modified_settings = settings;
if (disable_two_level_aggregation)
{
/// Disable two-level aggregation due to version incompatibility.
modified_settings.group_by_two_level_threshold = 0;
modified_settings.group_by_two_level_threshold_bytes = 0;
}
if (offset_states.size() > 1)
{
modified_settings.parallel_replicas_count = offset_states.size();
modified_settings.parallel_replica_offset = fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset;
}
replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data);
replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout);
};
for (auto & offset_status : offset_states)
for (auto & replica : offset_status.replicas)
send_query(replica);
pipeline_for_new_replicas.add(send_query);
sent_query = true;
}
void HedgedConnections::disconnect()
{
std::lock_guard lock(cancel_mutex);
for (auto & offset_status : offset_states)
for (auto & replica : offset_status.replicas)
if (replica.connection)
finishProcessReplica(replica, true);
if (hedged_connections_factory.hasEventsInProcess())
{
if (hedged_connections_factory.numberOfProcessingReplicas() > 0)
epoll.remove(hedged_connections_factory.getFileDescriptor());
hedged_connections_factory.stopChoosingReplicas();
}
}
std::string HedgedConnections::dumpAddresses() const
{
std::lock_guard lock(cancel_mutex);
std::string addresses;
bool is_first = true;
for (const auto & offset_state : offset_states)
{
for (const auto & replica : offset_state.replicas)
{
if (replica.connection)
{
addresses += (is_first ? "" : "; ") + replica.connection->getDescription();
is_first = false;
}
}
}
return addresses;
}
void HedgedConnections::sendCancel()
{
std::lock_guard lock(cancel_mutex);
if (!sent_query || cancelled)
throw Exception("Cannot cancel. Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR);
for (auto & offset_status : offset_states)
for (auto & replica : offset_status.replicas)
if (replica.connection)
replica.connection->sendCancel();
cancelled = true;
}
Packet HedgedConnections::drain()
{
std::lock_guard lock(cancel_mutex);
if (!cancelled)
throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR);
Packet res;
res.type = Protocol::Server::EndOfStream;
while (!epoll.empty())
{
ReplicaLocation location = getReadyReplicaLocation();
Packet packet = receivePacketFromReplica(location);
switch (packet.type)
{
case Protocol::Server::PartUUIDs:
case Protocol::Server::Data:
case Protocol::Server::Progress:
case Protocol::Server::ProfileInfo:
case Protocol::Server::Totals:
case Protocol::Server::Extremes:
case Protocol::Server::EndOfStream:
break;
case Protocol::Server::Exception:
default:
/// If we receive an exception or an unknown packet, we save it.
res = std::move(packet);
break;
}
}
return res;
}
Packet HedgedConnections::receivePacket()
{
std::lock_guard lock(cancel_mutex);
return receivePacketUnlocked({});
}
Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback)
{
if (!sent_query)
throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
if (!hasActiveConnections())
throw Exception("No more packets are available.", ErrorCodes::LOGICAL_ERROR);
if (epoll.empty())
throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR);
ReplicaLocation location = getReadyReplicaLocation(std::move(async_callback));
return receivePacketFromReplica(location);
}
HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback)
{
/// Firstly, resume replica with the last received packet if it has pending data.
if (replica_with_last_received_packet)
{
ReplicaLocation location = replica_with_last_received_packet.value();
replica_with_last_received_packet.reset();
if (offset_states[location.offset].replicas[location.index].connection->hasReadPendingData() && resumePacketReceiver(location))
return location;
}
int event_fd;
while (true)
{
/// Get ready file descriptor from epoll and process it.
event_fd = getReadyFileDescriptor(async_callback);
if (event_fd == hedged_connections_factory.getFileDescriptor())
checkNewReplica();
else if (fd_to_replica_location.contains(event_fd))
{
ReplicaLocation location = fd_to_replica_location[event_fd];
if (resumePacketReceiver(location))
return location;
}
else if (timeout_fd_to_replica_location.contains(event_fd))
{
ReplicaLocation location = timeout_fd_to_replica_location[event_fd];
offset_states[location.offset].replicas[location.index].change_replica_timeout.reset();
offset_states[location.offset].replicas[location.index].is_change_replica_timeout_expired = true;
offset_states[location.offset].next_replica_in_process = true;
offsets_queue.push(location.offset);
startNewReplica();
}
else
throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR);
}
};
bool HedgedConnections::resumePacketReceiver(const HedgedConnections::ReplicaLocation & location)
{
ReplicaState & replica_state = offset_states[location.offset].replicas[location.index];
auto res = replica_state.packet_receiver->resume();
if (std::holds_alternative<Packet>(res))
{
last_received_packet = std::move(std::get<Packet>(res));
return true;
}
else if (std::holds_alternative<Poco::Timespan>(res))
{
finishProcessReplica(replica_state, true);
/// Check that there are no more active connections with the same offset and no new replica in process.
if (offset_states[location.offset].active_connection_count == 0 && !offset_states[location.offset].next_replica_in_process)
throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT);
}
return false;
}
int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback)
{
epoll_event event;
event.data.fd = -1;
size_t events_count = 0;
while (events_count == 0)
{
events_count = epoll.getManyReady(1, &event, false);
if (!events_count && async_callback)
async_callback(epoll.getFileDescriptor(), 0, epoll.getDescription());
}
return event.data.fd;
}
Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location)
{
ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index];
Packet packet = std::move(last_received_packet);
switch (packet.type)
{
case Protocol::Server::Data:
/// If we received the first non-empty data packet and can still change the replica,
/// disable changing the replica for this offset.
if (offset_states[replica_location.offset].can_change_replica && packet.block.rows() > 0)
disableChangingReplica(replica_location);
replica_with_last_received_packet = replica_location;
break;
case Protocol::Server::Progress:
/// Check if we have made some progress and still can change replica.
if (offset_states[replica_location.offset].can_change_replica && packet.progress.read_bytes > 0)
{
/// If we are allowed to change the replica until the first data packet,
/// just restart the timeout (if it hasn't expired yet). Otherwise disable changing the replica for this offset.
if (settings.allow_changing_replica_until_first_data_packet && !replica.is_change_replica_timeout_expired)
replica.change_replica_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_data_timeout);
else
disableChangingReplica(replica_location);
}
replica_with_last_received_packet = replica_location;
break;
case Protocol::Server::PartUUIDs:
case Protocol::Server::ProfileInfo:
case Protocol::Server::Totals:
case Protocol::Server::Extremes:
case Protocol::Server::Log:
replica_with_last_received_packet = replica_location;
break;
case Protocol::Server::EndOfStream:
finishProcessReplica(replica, false);
break;
case Protocol::Server::Exception:
default:
finishProcessReplica(replica, true);
break;
}
return packet;
}
void HedgedConnections::disableChangingReplica(const ReplicaLocation & replica_location)
{
/// Stop working with replicas that are responsible for the same offset.
OffsetState & offset_state = offset_states[replica_location.offset];
offset_state.replicas[replica_location.index].change_replica_timeout.reset();
++offsets_with_disabled_changing_replica;
offset_state.can_change_replica = false;
for (size_t i = 0; i != offset_state.replicas.size(); ++i)
{
if (i != replica_location.index && offset_state.replicas[i].connection)
{
offset_state.replicas[i].connection->sendCancel();
finishProcessReplica(offset_state.replicas[i], true);
}
}
/// If we disabled changing the replica for all offsets, we need to stop choosing new replicas.
if (hedged_connections_factory.hasEventsInProcess() && offsets_with_disabled_changing_replica == offset_states.size())
{
if (hedged_connections_factory.numberOfProcessingReplicas() > 0)
epoll.remove(hedged_connections_factory.getFileDescriptor());
hedged_connections_factory.stopChoosingReplicas();
}
}
void HedgedConnections::startNewReplica()
{
Connection * connection = nullptr;
HedgedConnectionsFactory::State state = hedged_connections_factory.startNewConnection(connection);
/// Check if we need to add hedged_connections_factory file descriptor to epoll.
if (state == HedgedConnectionsFactory::State::NOT_READY && hedged_connections_factory.numberOfProcessingReplicas() == 1)
epoll.add(hedged_connections_factory.getFileDescriptor());
processNewReplicaState(state, connection);
}
void HedgedConnections::checkNewReplica()
{
Connection * connection = nullptr;
HedgedConnectionsFactory::State state = hedged_connections_factory.waitForReadyConnections(connection);
processNewReplicaState(state, connection);
/// Check if we no longer need to listen to the hedged_connections_factory file descriptor in epoll.
if (hedged_connections_factory.numberOfProcessingReplicas() == 0)
epoll.remove(hedged_connections_factory.getFileDescriptor());
}
void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection)
{
switch (state)
{
case HedgedConnectionsFactory::State::READY:
{
size_t offset = offsets_queue.front();
offsets_queue.pop();
offset_states[offset].replicas.emplace_back(connection);
++offset_states[offset].active_connection_count;
offset_states[offset].next_replica_in_process = false;
++active_connection_count;
ReplicaState & replica = offset_states[offset].replicas.back();
epoll.add(replica.packet_receiver->getFileDescriptor());
fd_to_replica_location[replica.packet_receiver->getFileDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1};
epoll.add(replica.change_replica_timeout.getDescriptor());
timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1};
pipeline_for_new_replicas.run(replica);
break;
}
case HedgedConnectionsFactory::State::CANNOT_CHOOSE:
{
while (!offsets_queue.empty())
{
/// Check that there is no active replica with the needed offset.
if (offset_states[offsets_queue.front()].active_connection_count == 0)
throw Exception("Cannot find enough connections to replicas", ErrorCodes::ALL_CONNECTION_TRIES_FAILED);
offset_states[offsets_queue.front()].next_replica_in_process = false;
offsets_queue.pop();
}
break;
}
case HedgedConnectionsFactory::State::NOT_READY:
break;
}
}
void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect)
{
replica.packet_receiver->cancel();
replica.change_replica_timeout.reset();
epoll.remove(replica.packet_receiver->getFileDescriptor());
--offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count;
fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor());
epoll.remove(replica.change_replica_timeout.getDescriptor());
timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor());
--active_connection_count;
if (disconnect)
replica.connection->disconnect();
replica.connection = nullptr;
}
}
#endif

View File

@ -0,0 +1,189 @@
#pragma once
#if defined(OS_LINUX)
#include <functional>
#include <queue>
#include <optional>
#include <Client/HedgedConnectionsFactory.h>
#include <Client/IConnections.h>
#include <Client/PacketReceiver.h>
#include <Common/FiberStack.h>
#include <Common/Fiber.h>
namespace DB
{
/** To receive data from multiple replicas (connections) from one shard asynchronously.
 * The principle of hedged connections is used to reduce tail latency:
 * if we don't receive data from a replica and there is no progress in query execution
 * for a long time, we try to get a new replica and send the query to it,
 * without cancelling work with the previous replica. This class
 * supports all functionality that MultiplexedConnections has.
*/
class HedgedConnections : public IConnections
{
public:
using PacketReceiverPtr = std::unique_ptr<PacketReceiver>;
struct ReplicaState
{
explicit ReplicaState(Connection * connection_) : connection(connection_), packet_receiver(std::make_unique<PacketReceiver>(connection_))
{
}
Connection * connection = nullptr;
PacketReceiverPtr packet_receiver;
TimerDescriptor change_replica_timeout;
bool is_change_replica_timeout_expired = false;
};
struct OffsetState
{
/// Replicas with the same offset.
std::vector<ReplicaState> replicas;
/// The number of active replicas. When can_change_replica is false,
/// active_connection_count is always <= 1 (because we stopped working with
/// other replicas with the same offset).
size_t active_connection_count = 0;
bool can_change_replica = true;
/// This flag is true when this offset is in the queue for
/// new replicas. It's needed to process the receive timeout
/// (throw an exception when the receive timeout has expired and there is no
/// new replica in process).
bool next_replica_in_process = false;
};
/// We process events in epoll, so we need to determine the replica by its
/// file descriptor. We store a map fd -> replica location. To determine
/// where a replica is, we need its offset
/// (the same as parallel_replica_offset) and an index, which is needed because
/// we can have many replicas with the same offset (when receive_data_timeout has expired).
struct ReplicaLocation
{
size_t offset;
size_t index;
};
HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_,
const Settings & settings_,
const ConnectionTimeouts & timeouts_,
const ThrottlerPtr & throttler,
PoolMode pool_mode,
std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr);
void sendScalarsData(Scalars & data) override;
void sendExternalTablesData(std::vector<ExternalTablesData> & data) override;
void sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id,
UInt64 stage,
const ClientInfo & client_info,
bool with_pending_data) override;
Packet receivePacket() override;
Packet receivePacketUnlocked(AsyncCallback async_callback) override;
void disconnect() override;
void sendCancel() override;
void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) override;
Packet drain() override;
std::string dumpAddresses() const override;
size_t size() const override { return offset_states.size(); }
bool hasActiveConnections() const override { return active_connection_count > 0; }
private:
/// If we don't receive data from a replica and there is no progress in query
/// execution for receive_data_timeout, we try to get a new
/// replica and send the query to it. Besides sending the query, there are some
/// additional actions like sendScalarsData or sendExternalTablesData, and we need
/// to perform these actions in the same order on the new replica. So we
/// save the actions performed on replicas in a pipeline to replay them on new replicas.
class Pipeline
{
public:
void add(std::function<void(ReplicaState &)> send_function);
void run(ReplicaState & replica);
private:
std::vector<std::function<void(ReplicaState &)>> pipeline;
};
Packet receivePacketFromReplica(const ReplicaLocation & replica_location);
ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {});
bool resumePacketReceiver(const ReplicaLocation & replica_location);
void disableChangingReplica(const ReplicaLocation & replica_location);
void startNewReplica();
void checkNewReplica();
void processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection);
void finishProcessReplica(ReplicaState & replica, bool disconnect);
int getReadyFileDescriptor(AsyncCallback async_callback = {});
HedgedConnectionsFactory hedged_connections_factory;
/// All replicas in offset_states[offset] are responsible for processing the query
/// with the setting parallel_replica_offset = offset. In common situations
/// offset_states[offset].replicas.size() = 1 (as in MultiplexedConnections).
std::vector<OffsetState> offset_states;
/// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas).
std::unordered_map<int, ReplicaLocation> fd_to_replica_location;
/// Map receive data timeout file descriptor to replica location.
std::unordered_map<int, ReplicaLocation> timeout_fd_to_replica_location;
/// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from
/// a replica, we push its offset to this queue and start trying to get
/// a new replica.
std::queue<int> offsets_queue;
/// The current number of valid connections to the replicas of this shard.
size_t active_connection_count;
/// We count offsets for which we can't change the replica anymore;
/// it's needed to stop choosing new replicas once we have
/// disabled replica changing for all offsets.
size_t offsets_with_disabled_changing_replica;
Pipeline pipeline_for_new_replicas;
/// A new replica may not support two-level aggregation due to version incompatibility.
/// If we didn't disable it, we need to skip such replicas.
bool disable_two_level_aggregation = false;
/// We save the replica with the last received packet
/// (except when the packet type is EndOfStream or Exception)
/// to resume its packet receiver when a new packet is needed.
std::optional<ReplicaLocation> replica_with_last_received_packet;
Packet last_received_packet;
Epoll epoll;
const Settings & settings;
ThrottlerPtr throttler;
bool sent_query = false;
bool cancelled = false;
mutable std::mutex cancel_mutex;
};
}
#endif
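A condensed usage sketch of this interface (the wrapper is hypothetical; the Packet handling follows the usual client protocol conventions and most error handling is omitted):

/// Hypothetical example: run one query over hedged connections and drain the results.
void runHedgedQuery(
    const ConnectionPoolWithFailoverPtr & pool,
    const Settings & settings,
    const ConnectionTimeouts & timeouts,
    const ClientInfo & client_info)
{
    HedgedConnections connections(pool, settings, timeouts, /* throttler = */ nullptr, PoolMode::GET_MANY);

    connections.sendQuery(timeouts, "SELECT 1", /* query_id = */ "",
        QueryProcessingStage::Complete, client_info, /* with_pending_data = */ false);

    /// receivePacket() transparently switches to a new replica when the current one
    /// is too slow; the loop ends when every replica has sent EndOfStream.
    while (connections.hasActiveConnections())
    {
        Packet packet = connections.receivePacket();
        if (packet.type == Protocol::Server::Exception)
            packet.exception->rethrow();    /// Assumes the usual Packet::exception member.
    }
}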

View File

@ -0,0 +1,387 @@
#if defined(OS_LINUX)
#include <Client/HedgedConnectionsFactory.h>
#include <Common/typeid_cast.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ALL_CONNECTION_TRIES_FAILED;
extern const int ALL_REPLICAS_ARE_STALE;
extern const int LOGICAL_ERROR;
}
HedgedConnectionsFactory::HedgedConnectionsFactory(
const ConnectionPoolWithFailoverPtr & pool_,
const Settings * settings_,
const ConnectionTimeouts & timeouts_,
std::shared_ptr<QualifiedTableName> table_to_check_)
: pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_), log(&Poco::Logger::get("HedgedConnectionsFactory"))
{
shuffled_pools = pool->getShuffledPools(settings);
for (size_t i = 0; i != shuffled_pools.size(); ++i)
replicas.emplace_back(ConnectionEstablisherAsync(shuffled_pools[i].pool, &timeouts, settings, log, table_to_check.get()));
max_tries
= (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES});
fallback_to_stale_replicas = settings && settings->fallback_to_stale_replicas_for_distributed_queries;
}
HedgedConnectionsFactory::~HedgedConnectionsFactory()
{
pool->updateSharedError(shuffled_pools);
}
std::vector<Connection *> HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode)
{
size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1;
size_t max_entries;
switch (pool_mode)
{
case PoolMode::GET_ALL:
{
min_entries = shuffled_pools.size();
max_entries = shuffled_pools.size();
break;
}
case PoolMode::GET_ONE:
{
max_entries = 1;
break;
}
case PoolMode::GET_MANY:
{
max_entries = settings ? size_t(settings->max_parallel_replicas) : 1;
break;
}
}
std::vector<Connection *> connections;
connections.reserve(max_entries);
Connection * connection = nullptr;
/// Try to start establishing connections with max_entries replicas.
for (size_t i = 0; i != max_entries; ++i)
{
++requested_connections_count;
State state = startNewConnectionImpl(connection);
if (state == State::READY)
connections.push_back(connection);
if (state == State::CANNOT_CHOOSE)
break;
}
/// Process connections until we get enough READY connections
/// (work asynchronously with all connections we started).
/// TODO: in GET_ALL mode we could start reading packets from a ready
/// TODO: connection as soon as we get it, without waiting for the others.
while (connections.size() < max_entries)
{
/// Set blocking = true to avoid busy-waiting here.
auto state = waitForReadyConnectionsImpl(/*blocking = */true, connection);
if (state == State::READY)
connections.push_back(connection);
else if (state == State::CANNOT_CHOOSE)
{
if (connections.size() >= min_entries)
break;
/// Determine the reason why there are not enough replicas.
if (!fallback_to_stale_replicas && up_to_date_count < min_entries)
throw Exception(
"Could not find enough connections to up-to-date replicas. Got: " + std::to_string(connections.size())
+ ", needed: " + std::to_string(min_entries),
DB::ErrorCodes::ALL_REPLICAS_ARE_STALE);
if (usable_count < min_entries)
throw NetException(
"All connection tries failed. Log: \n\n" + fail_messages + "\n",
DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED);
throw Exception("Unknown reason of not enough replicas.", ErrorCodes::LOGICAL_ERROR);
}
}
return connections;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnection(Connection *& connection_out)
{
++requested_connections_count;
State state = startNewConnectionImpl(connection_out);
/// If we cannot start new connection but there are connections in epoll, return NOT_READY.
if (state == State::CANNOT_CHOOSE && !epoll.empty())
state = State::NOT_READY;
return state;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnections(Connection *& connection_out)
{
return waitForReadyConnectionsImpl(false, connection_out);
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out)
{
State state = processEpollEvents(blocking, connection_out);
if (state != State::CANNOT_CHOOSE)
return state;
/// We reach this point only if there was no free up-to-date replica.
/// We will try to use a usable replica.
/// Check if we are not allowed to use usable replicas, or if there is not even a free usable replica.
if (!fallback_to_stale_replicas)
return State::CANNOT_CHOOSE;
return setBestUsableReplica(connection_out);
}
int HedgedConnectionsFactory::getNextIndex()
{
/// Check if there is no free replica.
if (entries_count + replicas_in_process_count + failed_pools_count >= shuffled_pools.size())
return -1;
/// Check if it's the first time.
if (last_used_index == -1)
{
last_used_index = 0;
return 0;
}
bool finish = false;
int next_index = last_used_index;
while (!finish)
{
next_index = (next_index + 1) % shuffled_pools.size();
/// Check if we can try this replica.
if (replicas[next_index].connection_establisher.getResult().entry.isNull()
&& (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries))
finish = true;
/// If we have made a complete round, there is no replica to connect to.
else if (next_index == last_used_index)
return -1;
}
last_used_index = next_index;
return next_index;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnectionImpl(Connection *& connection_out)
{
int index;
State state;
do
{
index = getNextIndex();
if (index == -1)
return State::CANNOT_CHOOSE;
state = resumeConnectionEstablisher(index, connection_out);
}
while (state == State::CANNOT_CHOOSE);
return state;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out)
{
int event_fd;
while (!epoll.empty())
{
event_fd = getReadyFileDescriptor(blocking);
if (event_fd == -1)
return State::NOT_READY;
if (fd_to_replica_index.contains(event_fd))
{
int index = fd_to_replica_index[event_fd];
State state = resumeConnectionEstablisher(index, connection_out);
if (state == State::NOT_READY)
continue;
/// Connection establishment is not in process now; remove all
/// information about it from epoll.
removeReplicaFromEpoll(index, event_fd);
if (state == State::READY)
return state;
}
else if (timeout_fd_to_replica_index.contains(event_fd))
replicas[timeout_fd_to_replica_index[event_fd]].change_replica_timeout.reset();
else
throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR);
/// We reach this point only if we need to start a new connection
/// (a special timeout expired or one of the previous connections failed).
/// Return only if a replica is ready.
if (startNewConnectionImpl(connection_out) == State::READY)
return State::READY;
}
return State::CANNOT_CHOOSE;
}
int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking)
{
epoll_event event;
event.data.fd = -1;
epoll.getManyReady(1, &event, blocking);
return event.data.fd;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::resumeConnectionEstablisher(int index, Connection *& connection_out)
{
auto res = replicas[index].connection_establisher.resume();
if (std::holds_alternative<TryResult>(res))
return processFinishedConnection(index, std::get<TryResult>(res), connection_out);
int fd = std::get<int>(res);
if (!fd_to_replica_index.contains(fd))
addNewReplicaToEpoll(index, fd);
return State::NOT_READY;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::processFinishedConnection(int index, TryResult result, Connection *& connection_out)
{
const std::string & fail_message = replicas[index].connection_establisher.getFailMessage();
if (!fail_message.empty())
fail_messages += fail_message + "\n";
if (!result.entry.isNull())
{
++entries_count;
if (result.is_usable)
{
++usable_count;
if (result.is_up_to_date)
{
++up_to_date_count;
if (!skip_replicas_with_two_level_aggregation_incompatibility || !isTwoLevelAggregationIncompatible(&*result.entry))
{
replicas[index].is_ready = true;
++ready_replicas_count;
connection_out = &*result.entry;
return State::READY;
}
}
}
}
else
{
ShuffledPool & shuffled_pool = shuffled_pools[index];
LOG_WARNING(
log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message);
ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry);
shuffled_pool.error_count = std::min(pool->getMaxErrorCap(), shuffled_pool.error_count + 1);
if (shuffled_pool.error_count >= max_tries)
{
++failed_pools_count;
ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll);
}
}
return State::CANNOT_CHOOSE;
}
void HedgedConnectionsFactory::stopChoosingReplicas()
{
for (auto & [fd, index] : fd_to_replica_index)
{
--replicas_in_process_count;
epoll.remove(fd);
replicas[index].connection_establisher.cancel();
}
for (auto & [timeout_fd, index] : timeout_fd_to_replica_index)
{
replicas[index].change_replica_timeout.reset();
epoll.remove(timeout_fd);
}
fd_to_replica_index.clear();
timeout_fd_to_replica_index.clear();
}
void HedgedConnectionsFactory::addNewReplicaToEpoll(int index, int fd)
{
++replicas_in_process_count;
epoll.add(fd);
fd_to_replica_index[fd] = index;
/// Add timeout for changing replica.
replicas[index].change_replica_timeout.setRelative(timeouts.hedged_connection_timeout);
epoll.add(replicas[index].change_replica_timeout.getDescriptor());
timeout_fd_to_replica_index[replicas[index].change_replica_timeout.getDescriptor()] = index;
}
void HedgedConnectionsFactory::removeReplicaFromEpoll(int index, int fd)
{
--replicas_in_process_count;
epoll.remove(fd);
fd_to_replica_index.erase(fd);
replicas[index].change_replica_timeout.reset();
epoll.remove(replicas[index].change_replica_timeout.getDescriptor());
timeout_fd_to_replica_index.erase(replicas[index].change_replica_timeout.getDescriptor());
}
int HedgedConnectionsFactory::numberOfProcessingReplicas() const
{
if (epoll.empty())
return 0;
return requested_connections_count - ready_replicas_count;
}
HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out)
{
std::vector<int> indexes;
for (size_t i = 0; i != replicas.size(); ++i)
{
/// Don't add unusable or failed replicas, or replicas that are ready or in process.
TryResult result = replicas[i].connection_establisher.getResult();
if (!result.entry.isNull()
&& result.is_usable
&& !replicas[i].is_ready
&& (!skip_replicas_with_two_level_aggregation_incompatibility || !isTwoLevelAggregationIncompatible(&*result.entry)))
indexes.push_back(i);
}
if (indexes.empty())
return State::CANNOT_CHOOSE;
/// Sort replicas by staleness.
std::stable_sort(
indexes.begin(),
indexes.end(),
[&](size_t lhs, size_t rhs)
{
return replicas[lhs].connection_establisher.getResult().staleness < replicas[rhs].connection_establisher.getResult().staleness;
});
replicas[indexes[0]].is_ready = true;
TryResult result = replicas[indexes[0]].connection_establisher.getResult();
connection_out = &*result.entry;
return State::READY;
}
bool HedgedConnectionsFactory::isTwoLevelAggregationIncompatible(Connection * connection)
{
return connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD;
}
}
#endif

View File

@ -0,0 +1,158 @@
#pragma once
#if defined(OS_LINUX)
#include <Common/TimerDescriptor.h>
#include <Common/Epoll.h>
#include <Common/FiberStack.h>
#include <Common/Fiber.h>
#include <Client/ConnectionEstablisher.h>
#include <Client/ConnectionPoolWithFailover.h>
#include <Core/Settings.h>
#include <unordered_map>
#include <memory>
namespace DB
{
/** Class for establishing hedged connections with replicas.
 * The process of establishing a connection is divided into stages; at each stage, if
 * the replica doesn't respond for a long time, we start establishing a connection with
 * the next replica, without cancelling work with the previous one.
 * It works with multiple replicas simultaneously without blocking, using epoll.
*/
class HedgedConnectionsFactory
{
public:
using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool;
using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
enum class State
{
READY,
NOT_READY,
CANNOT_CHOOSE,
};
struct ReplicaStatus
{
explicit ReplicaStatus(ConnectionEstablisherAsync connection_establisher_) : connection_establisher(std::move(connection_establisher_))
{
}
ConnectionEstablisherAsync connection_establisher;
TimerDescriptor change_replica_timeout;
bool is_ready = false;
};
HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_,
const Settings * settings_,
const ConnectionTimeouts & timeouts_,
std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr);
/// Create and return active connections according to pool_mode.
std::vector<Connection *> getManyConnections(PoolMode pool_mode);
/// Try to get a connection to a new replica without blocking. Process all current events in epoll (connections, timeouts).
/// The returned state can be READY (connection established successfully),
/// NOT_READY (there are no ready events now) or CANNOT_CHOOSE (cannot produce a new connection anymore).
/// If the state is READY, the replica connection will be written to connection_out.
State waitForReadyConnections(Connection *& connection_out);
State startNewConnection(Connection *& connection_out);
/// Stop working with all replicas that are not READY.
void stopChoosingReplicas();
bool hasEventsInProcess() const { return !epoll.empty(); }
int getFileDescriptor() const { return epoll.getFileDescriptor(); }
const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; }
int numberOfProcessingReplicas() const;
/// Tell Factory to not return connections with two level aggregation incompatibility.
void skipReplicasWithTwoLevelAggregationIncompatibility() { skip_replicas_with_two_level_aggregation_incompatibility = true; }
~HedgedConnectionsFactory();
private:
State waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out);
/// Try to start establishing a connection to a new replica and return the resulting state.
State startNewConnectionImpl(Connection *& connection_out);
/// Find the index of the next free replica to start a connection with.
/// Return -1 if there is no free replica.
int getNextIndex();
int getReadyFileDescriptor(bool blocking);
void processFailedConnection(int index, const std::string & fail_message);
State resumeConnectionEstablisher(int index, Connection *& connection_out);
State processFinishedConnection(int index, TryResult result, Connection *& connection_out);
void removeReplicaFromEpoll(int index, int fd);
void addNewReplicaToEpoll(int index, int fd);
/// Return NOT_READY if there are no ready events, READY if a replica is ready,
/// and CANNOT_CHOOSE if there are no more events in epoll.
State processEpollEvents(bool blocking, Connection *& connection_out);
State setBestUsableReplica(Connection *& connection_out);
bool isTwoLevelAggregationIncompatible(Connection * connection);
const ConnectionPoolWithFailoverPtr pool;
const Settings * settings;
const ConnectionTimeouts timeouts;
std::vector<ShuffledPool> shuffled_pools;
std::vector<ReplicaStatus> replicas;
/// Map socket file descriptor to replica index.
std::unordered_map<int, int> fd_to_replica_index;
/// Map timeout for changing replica to replica index.
std::unordered_map<int, int> timeout_fd_to_replica_index;
/// If this flag is true, don't return connections that are
/// incompatible with two-level aggregation.
bool skip_replicas_with_two_level_aggregation_incompatibility = false;
std::shared_ptr<QualifiedTableName> table_to_check;
int last_used_index = -1;
bool fallback_to_stale_replicas;
Epoll epoll;
Poco::Logger * log;
std::string fail_messages;
/// The maximum number of attempts to connect to replicas.
size_t max_tries;
/// Total number of established connections.
size_t entries_count = 0;
/// The number of established connections that are usable.
size_t usable_count = 0;
/// The number of established connections that are up to date.
size_t up_to_date_count = 0;
/// The number of failed connections (replica is considered failed after max_tries attempts to connect).
size_t failed_pools_count = 0;
/// The number of replicas that are in process of connection.
size_t replicas_in_process_count = 0;
/// The number of ready replicas (a replica is considered ready when its
/// connection has been returned to the caller).
size_t ready_replicas_count = 0;
/// The number of replicas requested via startNewConnection (needed to
/// track how many requested replicas are still in process).
size_t requested_connections_count = 0;
};
}
#endif

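For orientation, a minimal sketch of how a caller might drive this state machine; waitForEvents is a hypothetical helper (not part of this commit) that polls the factory's epoll descriptor:

    Connection * connection = nullptr;
    auto state = factory.startNewConnection(connection);
    while (state == HedgedConnectionsFactory::State::NOT_READY)
    {
        waitForEvents(factory.getFileDescriptor());  /// hypothetical: poll()/epoll_wait() on the factory fd
        state = factory.waitForReadyConnections(connection);
    }
    /// READY: connection points to an established replica; CANNOT_CHOOSE: no replica could be connected.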
60
src/Client/IConnections.h Normal file
View File

@ -0,0 +1,60 @@
#pragma once
#include <Client/Connection.h>
namespace DB
{
/// Base class for working with multiple replicas (connections)
/// from one shard within a single thread.
class IConnections : boost::noncopyable
{
public:
/// Send all scalars to replicas.
virtual void sendScalarsData(Scalars & data) = 0;
/// Send all content of external tables to replicas.
virtual void sendExternalTablesData(std::vector<ExternalTablesData> & data) = 0;
/// Send request to replicas.
virtual void sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id,
UInt64 stage,
const ClientInfo & client_info,
bool with_pending_data) = 0;
/// Get packet from any replica.
virtual Packet receivePacket() = 0;
/// Version of `receivePacket` function without locking.
virtual Packet receivePacketUnlocked(AsyncCallback async_callback) = 0;
/// Break all active connections.
virtual void disconnect() = 0;
/// Send a request to the replicas to cancel the query.
virtual void sendCancel() = 0;
/// Send parts' uuids to replicas to exclude them from query processing
virtual void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) = 0;
/** On each replica, read and skip all packets to EndOfStream or Exception.
* Returns EndOfStream if no exception has been received. Otherwise
* returns the last received packet of type Exception.
*/
virtual Packet drain() = 0;
/// Get the replica addresses as a string.
virtual std::string dumpAddresses() const = 0;
/// Returns the number of replicas.
virtual size_t size() const = 0;
/// Check if there are any valid replicas.
virtual bool hasActiveConnections() const = 0;
virtual ~IConnections() = default;
};
}

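As a rough illustration of the intended call sequence against this interface (error handling and full packet dispatch elided; the Protocol constant mirrors its use in MultiplexedConnections below):

    connections->sendQuery(timeouts, query, query_id, QueryProcessingStage::Complete, client_info, /* with_pending_data = */ true);
    while (connections->hasActiveConnections())
    {
        Packet packet = connections->receivePacket();
        if (packet.type == Protocol::Server::EndOfStream)
            break;
        /// ... handle Data / Progress / Exception packets ...
    }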
View File

@ -158,7 +158,7 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuid
Packet MultiplexedConnections::receivePacket()
{
std::lock_guard lock(cancel_mutex);
Packet packet = receivePacketUnlocked();
Packet packet = receivePacketUnlocked({});
return packet;
}
@ -206,7 +206,7 @@ Packet MultiplexedConnections::drain()
while (hasActiveConnections())
{
Packet packet = receivePacketUnlocked();
Packet packet = receivePacketUnlocked({});
switch (packet.type)
{
@ -253,7 +253,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const
return buf.str();
}
Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback)
Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callback)
{
if (!sent_query)
throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
@ -265,7 +265,11 @@ Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Ne
if (current_connection == nullptr)
throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA);
Packet packet = current_connection->receivePacket(std::move(async_callback));
Packet packet;
{
AsyncCallbackSetter async_setter(current_connection, std::move(async_callback));
packet = current_connection->receivePacket();
}
switch (packet.type)
{

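AsyncCallbackSetter acts as an RAII guard here. Its definition is not shown in this diff, so the following is only a sketch of the assumed behavior: install the callback for the duration of receivePacket and clear it on scope exit.

    struct AsyncCallbackSetter
    {
        AsyncCallbackSetter(Connection * connection_, AsyncCallback callback) : connection(connection_)
        {
            connection->setAsyncCallback(std::move(callback));  /// assumed setter on Connection
        }
        ~AsyncCallbackSetter() { connection->setAsyncCallback({}); }  /// clear on scope exit
        Connection * connection;
    };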
View File

@ -5,6 +5,7 @@
#include <Client/Connection.h>
#include <Client/ConnectionPoolWithFailover.h>
#include <IO/ConnectionTimeouts.h>
#include <Client/IConnections.h>
namespace DB
{
@ -16,7 +17,7 @@ namespace DB
*
* The interface is almost the same as Connection.
*/
class MultiplexedConnections final : private boost::noncopyable
class MultiplexedConnections final : public IConnections
{
public:
/// Accepts ready connection.
@ -27,52 +28,38 @@ public:
std::vector<IConnectionPool::Entry> && connections,
const Settings & settings_, const ThrottlerPtr & throttler_);
/// Send all scalars to replicas.
void sendScalarsData(Scalars & data);
/// Send all content of external tables to replicas.
void sendExternalTablesData(std::vector<ExternalTablesData> & data);
void sendScalarsData(Scalars & data) override;
void sendExternalTablesData(std::vector<ExternalTablesData> & data) override;
/// Send request to replicas.
void sendQuery(
const ConnectionTimeouts & timeouts,
const String & query,
const String & query_id,
UInt64 stage,
const ClientInfo & client_info,
bool with_pending_data);
bool with_pending_data) override;
/// Get packet from any replica.
Packet receivePacket();
Packet receivePacket() override;
/// Break all active connections.
void disconnect();
void disconnect() override;
/// Send a request to the replica to cancel the request
void sendCancel();
void sendCancel() override;
/// Send parts' uuids to replicas to exclude them from query processing
void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids);
void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) override;
/** On each replica, read and skip all packets to EndOfStream or Exception.
* Returns EndOfStream if no exception has been received. Otherwise
* returns the last received packet of type Exception.
*/
Packet drain();
Packet drain() override;
/// Get the replica addresses as a string.
std::string dumpAddresses() const;
std::string dumpAddresses() const override;
/// Returns the number of replicas.
/// Without locking, because sendCancel() does not change this number.
size_t size() const { return replica_states.size(); }
size_t size() const override { return replica_states.size(); }
/// Check if there are any valid replicas.
/// Without locking, because sendCancel() does not change the state of the replicas.
bool hasActiveConnections() const { return active_connection_count > 0; }
bool hasActiveConnections() const override { return active_connection_count > 0; }
private:
/// Internal version of `receivePacket` function without locking.
Packet receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback = {});
Packet receivePacketUnlocked(AsyncCallback async_callback) override;
/// Internal version of `dumpAddresses` function without locking.
std::string dumpAddressesUnlocked() const;

161
src/Client/PacketReceiver.h Normal file
View File

@ -0,0 +1,161 @@
#pragma once
#if defined(OS_LINUX)
#include <variant>
#include <Client/IConnections.h>
#include <Common/FiberStack.h>
#include <Common/Fiber.h>
#include <Common/Epoll.h>
#include <Common/TimerDescriptor.h>
namespace DB
{
/// Class for nonblocking packet receiving. It runs connection->receivePacket
/// in a fiber and sets a special read callback that is called when
/// reading from the socket would block. When the read callback is called,
/// the socket and the receive timeout are added to epoll and execution returns to the main program.
/// So, you can poll this epoll file descriptor to determine when to resume
/// packet receiving.
class PacketReceiver
{
public:
explicit PacketReceiver(Connection * connection_) : connection(connection_)
{
epoll.add(receive_timeout.getDescriptor());
epoll.add(connection->getSocket()->impl()->sockfd());
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
}
/// Resume packet receiving.
std::variant<int, Packet, Poco::Timespan> resume()
{
/// If there is no pending data, check receive timeout.
if (!connection->hasReadPendingData() && !checkReceiveTimeout())
{
/// Receive timeout expired.
return Poco::Timespan();
}
/// Resume fiber.
fiber = std::move(fiber).resume();
if (exception)
std::rethrow_exception(std::move(exception));
if (is_read_in_process)
return epoll.getFileDescriptor();
/// Receiving the packet has finished.
return std::move(packet);
}
void cancel()
{
Fiber to_destroy = std::move(fiber);
connection = nullptr;
}
int getFileDescriptor() const { return epoll.getFileDescriptor(); }
private:
/// When the epoll file descriptor is ready, check whether it's an expired timeout.
/// Return false if the receive timeout expired and the socket is not ready, true otherwise.
bool checkReceiveTimeout()
{
bool is_socket_ready = false;
bool is_receive_timeout_expired = false;
epoll_event events[2];
events[0].data.fd = events[1].data.fd = -1;
size_t ready_count = epoll.getManyReady(2, events, true);
for (size_t i = 0; i != ready_count; ++i)
{
if (events[i].data.fd == connection->getSocket()->impl()->sockfd())
is_socket_ready = true;
if (events[i].data.fd == receive_timeout.getDescriptor())
is_receive_timeout_expired = true;
}
if (is_receive_timeout_expired && !is_socket_ready)
{
receive_timeout.reset();
return false;
}
return true;
}
struct Routine
{
PacketReceiver & receiver;
struct ReadCallback
{
PacketReceiver & receiver;
Fiber & sink;
void operator()(int, const Poco::Timespan & timeout, const std::string &)
{
receiver.receive_timeout.setRelative(timeout);
receiver.is_read_in_process = true;
sink = std::move(sink).resume();
receiver.is_read_in_process = false;
receiver.receive_timeout.reset();
}
};
Fiber operator()(Fiber && sink)
{
try
{
while (true)
{
{
AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink});
receiver.packet = receiver.connection->receivePacket();
}
sink = std::move(sink).resume();
}
}
catch (const boost::context::detail::forced_unwind &)
{
/// This exception is thrown by the fiber implementation when the fiber is being deleted but hasn't exited.
/// It must not be caught, or it will segfault.
/// All other exceptions must be caught.
throw;
}
catch (...)
{
receiver.exception = std::current_exception();
}
return std::move(sink);
}
};
Connection * connection;
Packet packet;
Fiber fiber;
FiberStack fiber_stack;
/// We use a timer descriptor for checking the socket receive timeout.
TimerDescriptor receive_timeout;
/// In the read callback we add the socket file descriptor and the timer descriptor with the receive timeout
/// to epoll, so we can return the epoll file descriptor outside for polling.
Epoll epoll;
/// If an exception occurs during fiber resume, we save it and rethrow it.
std::exception_ptr exception;
bool is_read_in_process = false;
};
}
#endif

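The caller of resume() dispatches on the three alternatives of the returned variant; a minimal sketch (processPacket, pollFd and onReceiveTimeout are hypothetical):

    auto result = receiver.resume();
    if (auto * packet = std::get_if<Packet>(&result))
        processPacket(std::move(*packet));   /// packet fully received
    else if (auto * fd = std::get_if<int>(&result))
        pollFd(*fd);                         /// epoll fd: wait for readiness, then call resume() again
    else
        onReceiveTimeout();                  /// Poco::Timespan: the receive timeout expired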
View File

@ -11,7 +11,10 @@ PEERDIR(
SRCS(
Connection.cpp
ConnectionEstablisher.cpp
ConnectionPoolWithFailover.cpp
HedgedConnections.cpp
HedgedConnectionsFactory.cpp
MultiplexedConnections.cpp
TimeoutSetter.cpp

View File

@ -198,6 +198,11 @@ public:
throw Exception("Method compareColumn is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
bool hasEqualValues() const override
{
throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
}
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;

View File

@ -370,6 +370,10 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
bool ColumnArray::hasEqualValues() const
{
return hasEqualValuesImpl<ColumnArray>();
}
namespace
{

View File

@ -78,6 +78,7 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator & collator) const override;
bool hasEqualValues() const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;

View File

@ -96,6 +96,10 @@ public:
{
throwMustBeDecompressed();
}
bool hasEqualValues() const override
{
throwMustBeDecompressed();
}
void getPermutation(bool, size_t, int, Permutation &) const override { throwMustBeDecompressed(); }
void updatePermutation(bool, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeDecompressed(); }
ColumnPtr replicate(const Offsets &) const override { throwMustBeDecompressed(); }

View File

@ -206,6 +206,8 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
bool hasEqualValues() const override { return true; }
MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
void gather(ColumnGathererStream &) override

View File

@ -58,6 +58,12 @@ void ColumnDecimal<T>::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
template <typename T>
bool ColumnDecimal<T>::hasEqualValues() const
{
return this->template hasEqualValuesImpl<ColumnDecimal<T>>();
}
template <typename T>
StringRef ColumnDecimal<T>::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
{

View File

@ -136,6 +136,7 @@ public:
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
bool hasEqualValues() const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges& equal_range) const override;

View File

@ -132,6 +132,11 @@ public:
compare_results, direction, nan_direction_hint);
}
bool hasEqualValues() const override
{
return hasEqualValuesImpl<ColumnFixedString>();
}
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override;

View File

@ -128,6 +128,11 @@ public:
throw Exception("compareColumn is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
bool hasEqualValues() const override
{
throw Exception("hasEqualValues is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
void getPermutation(bool, size_t, int, Permutation &) const override
{
throw Exception("getPermutation is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);

View File

@ -311,6 +311,13 @@ void ColumnLowCardinality::compareColumn(const IColumn & rhs, size_t rhs_row_num
compare_results, direction, nan_direction_hint);
}
bool ColumnLowCardinality::hasEqualValues() const
{
if (getDictionary().size() <= 1)
return true;
return getIndexes().hasEqualValues();
}
void ColumnLowCardinality::getPermutationImpl(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator) const
{
if (limit == 0)

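A note on hasEqualValues above: checking the dictionary alone would not be enough, because a dictionary may hold several values while every row references the same one; hence the fallback to getIndexes().hasEqualValues(). A dictionary of size <= 1 can only produce a single value, so that case short-circuits to true.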
View File

@ -126,6 +126,8 @@ public:
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator &) const override;
bool hasEqualValues() const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_range) const override;

View File

@ -187,6 +187,11 @@ void ColumnMap::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
bool ColumnMap::hasEqualValues() const
{
return hasEqualValuesImpl<ColumnMap>();
}
void ColumnMap::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
{
nested->getPermutation(reverse, limit, nan_direction_hint, res);

View File

@ -72,6 +72,7 @@ public:
void compareColumn(const IColumn & rhs, size_t rhs_row_num,
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
bool hasEqualValues() const override;
void getExtremes(Field & min, Field & max) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const override;

View File

@ -271,6 +271,11 @@ void ColumnNullable::compareColumn(const IColumn & rhs, size_t rhs_row_num,
compare_results, direction, nan_direction_hint);
}
bool ColumnNullable::hasEqualValues() const
{
return hasEqualValuesImpl<ColumnNullable>();
}
void ColumnNullable::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
{
/// Cannot pass limit because of unknown amount of NULLs.

View File

@ -94,6 +94,7 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator &) const override;
bool hasEqualValues() const override;
void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;

View File

@ -287,6 +287,11 @@ void ColumnString::compareColumn(
compare_results, direction, nan_direction_hint);
}
bool ColumnString::hasEqualValues() const
{
return hasEqualValuesImpl<ColumnString>();
}
template <bool positive>
struct ColumnString::Cmp
{

View File

@ -240,6 +240,8 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
bool hasEqualValues() const override;
/// Variant of compareAt for string comparison with respect of collation.
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const override;

View File

@ -312,6 +312,11 @@ int ColumnTuple::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs,
return compareAtImpl(n, m, rhs, nan_direction_hint, &collator);
}
bool ColumnTuple::hasEqualValues() const
{
return hasEqualValuesImpl<ColumnTuple>();
}
template <bool positive>
struct ColumnTuple::Less
{

View File

@ -76,6 +76,7 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const override;
int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const override;
bool hasEqualValues() const override;
void getExtremes(Field & min, Field & max) const override;
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;

View File

@ -205,6 +205,11 @@ public:
compare_results, direction, nan_direction_hint);
}
bool hasEqualValues() const override
{
return this->template hasEqualValuesImpl<Self>();
}
void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_range) const override;

View File

@ -266,6 +266,9 @@ public:
PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const = 0;
/// Check if all elements in the column have equal values. Return true if column is empty.
virtual bool hasEqualValues() const = 0;
/** Returns a permutation that sorts elements of this column,
* i.e. perm[i]-th element of source column should be i-th element of sorted column.
* reverse - reverse ordering (ascending).
@ -467,6 +470,9 @@ protected:
PaddedPODArray<UInt64> * row_indexes,
PaddedPODArray<Int8> & compare_results,
int direction, int nan_direction_hint) const;
template <typename Derived>
bool hasEqualValuesImpl() const;
};
using ColumnPtr = IColumn::Ptr;

View File

@ -40,6 +40,8 @@ public:
{
}
bool hasEqualValues() const override { return true; }
Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); }

View File

@ -127,4 +127,16 @@ void IColumn::doCompareColumn(const Derived & rhs, size_t rhs_row_num,
}
}
template <typename Derived>
bool IColumn::hasEqualValuesImpl() const
{
size_t num_rows = size();
for (size_t i = 1; i < num_rows; ++i)
{
if (compareAt(i, 0, static_cast<const Derived &>(*this), false) != 0)
return false;
}
return true;
}
}

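The shared fallback above is linear: every row is compared against row 0 through the derived column's compareAt. Columns with cheaper invariants therefore override hasEqualValues directly, e.g. ColumnConst (always true) and ColumnLowCardinality (dictionary plus indexes check), instead of instantiating this template.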
View File

@ -172,6 +172,11 @@ public:
{
throw Exception("Method compareColumn is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
bool hasEqualValues() const override
{
throw Exception("Method hasEqualValues is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
}
};
using ColumnUniquePtr = IColumnUnique::ColumnUniquePtr;

86
src/Common/Epoll.cpp Normal file
View File

@ -0,0 +1,86 @@
#if defined(OS_LINUX)
#include "Epoll.h"
#include <Common/Exception.h>
#include <unistd.h>
#include <common/logger_useful.h>
namespace DB
{
namespace ErrorCodes
{
extern const int EPOLL_ERROR;
extern const int LOGICAL_ERROR;
}
Epoll::Epoll() : events_count(0)
{
epoll_fd = epoll_create1(0);
if (epoll_fd == -1)
throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR);
}
Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count.load())
{
other.epoll_fd = -1;
}
Epoll & Epoll::operator=(Epoll && other)
{
    if (this == &other)
        return *this;
    /// Close our own descriptor first, otherwise it would leak.
    if (epoll_fd != -1)
        close(epoll_fd);
    epoll_fd = other.epoll_fd;
    other.epoll_fd = -1;
    events_count.store(other.events_count.load());
    return *this;
}
void Epoll::add(int fd, void * ptr)
{
epoll_event event;
event.events = EPOLLIN | EPOLLPRI;
if (ptr)
event.data.ptr = ptr;
else
event.data.fd = fd;
++events_count;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1)
throwFromErrno("Cannot add new descriptor to epoll", DB::ErrorCodes::EPOLL_ERROR);
}
void Epoll::remove(int fd)
{
--events_count;
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, nullptr) == -1)
throwFromErrno("Cannot remove descriptor from epoll", DB::ErrorCodes::EPOLL_ERROR);
}
size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking) const
{
if (events_count == 0)
throw Exception("There is no events in epoll", ErrorCodes::LOGICAL_ERROR);
int ready_size;
int timeout = blocking ? -1 : 0;
do
{
ready_size = epoll_wait(epoll_fd, events_out, max_events, timeout);
if (ready_size == -1 && errno != EINTR)
throwFromErrno("Error in epoll_wait", DB::ErrorCodes::EPOLL_ERROR);
}
while (ready_size <= 0 && (ready_size != 0 || blocking));
return ready_size;
}
Epoll::~Epoll()
{
if (epoll_fd != -1)
close(epoll_fd);
}
}
#endif

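A minimal usage sketch of this wrapper, assuming fd is some readable descriptor (illustrative only; onReadable is hypothetical):

    DB::Epoll epoll;
    epoll.add(fd);                           /// event.data.fd == fd, since ptr defaults to nullptr
    epoll_event events[4];
    size_t ready = epoll.getManyReady(4, events, /* blocking = */ true);
    for (size_t i = 0; i != ready; ++i)
        if (events[i].data.fd == fd)
            onReadable(fd);                  /// hypothetical handler
    epoll.remove(fd);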
54
src/Common/Epoll.h Normal file
View File

@ -0,0 +1,54 @@
#pragma once
#if defined(OS_LINUX)
#include <sys/epoll.h>
#include <vector>
#include <boost/noncopyable.hpp>
#include <Poco/Logger.h>
namespace DB
{
using AsyncCallback = std::function<void(int, const Poco::Timespan &, const std::string &)>;
class Epoll
{
public:
Epoll();
Epoll(const Epoll &) = delete;
Epoll & operator=(const Epoll &) = delete;
Epoll & operator=(Epoll && other);
Epoll(Epoll && other);
/// Add a new file descriptor to epoll. If ptr is nullptr, epoll_event.data.fd = fd,
/// otherwise epoll_event.data.ptr = ptr.
void add(int fd, void * ptr = nullptr);
/// Remove a file descriptor from epoll.
void remove(int fd);
/// Get events from epoll. Events are written to events_out; the function returns the number of ready events.
/// If blocking is false and there are no ready events, return 0;
/// otherwise wait for ready events.
size_t getManyReady(int max_events, epoll_event * events_out, bool blocking) const;
int getFileDescriptor() const { return epoll_fd; }
int size() const { return events_count; }
bool empty() const { return events_count == 0; }
const std::string & getDescription() const { return fd_description; }
~Epoll();
private:
int epoll_fd;
std::atomic<int> events_count;
const std::string fd_description = "epoll";
};
}
#endif

View File

@ -538,12 +538,14 @@
M(569, MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD) \
M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \
M(571, DATABASE_REPLICATION_FAILED) \
M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \
M(573, EPOLL_ERROR) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
M(1001, STD_EXCEPTION) \
M(1002, UNKNOWN_EXCEPTION) \
M(1003, INVALID_SHARD_ID)
M(1003, INVALID_SHARD_ID) \
/* See END */

View File

@ -93,6 +93,18 @@ public:
double staleness = 0.0; /// Helps choosing the "least stale" option when all replicas are stale.
};
struct PoolState;
using PoolStates = std::vector<PoolState>;
struct ShuffledPool
{
NestedPool * pool{};
const PoolState * state{};
size_t index = 0;
size_t error_count = 0;
};
/// This functor must be provided by a client. It must perform a single try that takes a connection
/// from the provided pool and checks that it is good.
using TryGetEntryFunc = std::function<TryResult(NestedPool & pool, std::string & fail_message)>;
@ -113,9 +125,6 @@ public:
const GetPriorityFunc & get_priority = GetPriorityFunc());
protected:
struct PoolState;
using PoolStates = std::vector<PoolState>;
/// Returns a single connection.
Entry get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
@ -124,6 +133,10 @@ protected:
/// This function returns a copy of pool states to avoid race conditions when modifying shared pool states.
PoolStates updatePoolStates(size_t max_ignored_errors);
std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority);
inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools);
auto getPoolExtendedStates() const
{
std::lock_guard lock(pool_states_mutex);
@ -143,6 +156,46 @@ protected:
Poco::Logger * log;
};
template <typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool>
PoolWithFailoverBase<TNestedPool>::getShuffledPools(
size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors);
if (get_priority)
{
for (size_t i = 0; i < pool_states.size(); ++i)
pool_states[i].priority = get_priority(i);
}
/// Sort the pools into order in which they will be tried (based on respective PoolStates).
std::vector<ShuffledPool> shuffled_pools;
shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i)
shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0});
std::sort(
shuffled_pools.begin(), shuffled_pools.end(),
[](const ShuffledPool & lhs, const ShuffledPool & rhs)
{
return PoolState::compare(*lhs.state, *rhs.state);
});
return shuffled_pools;
}
template <typename TNestedPool>
inline void PoolWithFailoverBase<TNestedPool>::updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools)
{
std::lock_guard lock(pool_states_mutex);
for (const ShuffledPool & pool: shuffled_pools)
{
auto & pool_state = shared_pool_states[pool.index];
pool_state.error_count = std::min<UInt64>(max_error_cap, pool_state.error_count + pool.error_count);
}
}
template <typename TNestedPool>
typename TNestedPool::Entry
PoolWithFailoverBase<TNestedPool>::get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
@ -168,33 +221,7 @@ PoolWithFailoverBase<TNestedPool>::getMany(
const TryGetEntryFunc & try_get_entry,
const GetPriorityFunc & get_priority)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors);
if (get_priority)
{
for (size_t i = 0; i < pool_states.size(); ++i)
pool_states[i].priority = get_priority(i);
}
struct ShuffledPool
{
NestedPool * pool{};
const PoolState * state{};
size_t index = 0;
size_t error_count = 0;
};
/// Sort the pools into order in which they will be tried (based on respective PoolStates).
std::vector<ShuffledPool> shuffled_pools;
shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i)
shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0});
std::sort(
shuffled_pools.begin(), shuffled_pools.end(),
[](const ShuffledPool & lhs, const ShuffledPool & rhs)
{
return PoolState::compare(*lhs.state, *rhs.state);
});
std::vector<ShuffledPool> shuffled_pools = getShuffledPools(max_ignored_errors, get_priority);
/// We will try to get a connection from each pool until a connection is produced or max_tries is reached.
std::vector<TryResult> try_results(shuffled_pools.size());
@ -206,12 +233,7 @@ PoolWithFailoverBase<TNestedPool>::getMany(
/// At exit update shared error counts with error counts occurred during this call.
SCOPE_EXIT(
{
std::lock_guard lock(pool_states_mutex);
for (const ShuffledPool & pool: shuffled_pools)
{
auto & pool_state = shared_pool_states[pool.index];
pool_state.error_count = std::min<UInt64>(max_error_cap, pool_state.error_count + pool.error_count);
}
updateSharedErrorCounts(shuffled_pools);
});
std::string fail_messages;

View File

@ -27,10 +27,16 @@ TimerDescriptor::TimerDescriptor(int clockid, int flags)
throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL);
}
TimerDescriptor::TimerDescriptor(TimerDescriptor && other) : timer_fd(other.timer_fd)
{
other.timer_fd = -1;
}
TimerDescriptor::~TimerDescriptor()
{
/// Do not check the result because a destructor cannot throw an exception.
close(timer_fd);
if (timer_fd != -1)
close(timer_fd);
}
void TimerDescriptor::reset() const
@ -74,7 +80,7 @@ void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const
spec.it_interval.tv_nsec = 0;
spec.it_interval.tv_sec = 0;
spec.it_value.tv_sec = timespan.totalSeconds();
spec.it_value.tv_nsec = timespan.useconds();
spec.it_value.tv_nsec = timespan.useconds() * 1000;
if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);

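The tv_nsec change above is a genuine unit fix: Poco::Timespan::useconds() returns the fractional part in microseconds, while struct itimerspec expects tv_nsec in nanoseconds, hence the multiplication by 1000. Before the fix, the fractional part of a timeout was armed 1000 times too short, e.g. 500000 us became 500000 ns = 0.5 ms.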
View File

@ -12,12 +12,12 @@ private:
int timer_fd;
public:
explicit TimerDescriptor(int clockid, int flags);
explicit TimerDescriptor(int clockid = CLOCK_MONOTONIC, int flags = 0);
~TimerDescriptor();
TimerDescriptor(const TimerDescriptor &) = delete;
TimerDescriptor & operator=(const TimerDescriptor &) = delete;
TimerDescriptor(TimerDescriptor &&) = default;
TimerDescriptor(TimerDescriptor && other);
TimerDescriptor & operator=(TimerDescriptor &&) = default;
int getDescriptor() const { return timer_fd; }

View File

@ -39,6 +39,7 @@ SRCS(
DNSResolver.cpp
Dwarf.cpp
Elf.cpp
Epoll.cpp
ErrorCodes.cpp
Exception.cpp
ExternalLoaderStatus.cpp

View File

@ -80,8 +80,9 @@ void NuKeeperServer::shutdown()
{
state_machine->shutdownStorage();
state_manager->flushLogStore();
if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds()))
LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5);
auto timeout = coordination_settings->shutdown_timeout.totalSeconds();
if (!launcher.shutdown(timeout))
LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", timeout);
}
namespace

View File

@ -195,124 +195,6 @@ TEST(CoordinationTest, TestSummingRaft1)
s1.launcher.shutdown(5);
}
TEST(CoordinationTest, TestSummingRaft3)
{
ChangelogDirTest test1("./logs1");
SummingRaftServer s1(1, "localhost", 44444, "./logs1");
ChangelogDirTest test2("./logs2");
SummingRaftServer s2(2, "localhost", 44445, "./logs2");
ChangelogDirTest test3("./logs3");
SummingRaftServer s3(3, "localhost", 44446, "./logs3");
nuraft::srv_config first_config(1, 0, "localhost:44444", "", false, 0);
auto ret1 = s2.raft_instance->add_srv(first_config);
while (!ret1->get_accepted())
{
std::cout << "failed to add server: "
<< ret1->get_result_str() << std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
ret1 = s2.raft_instance->add_srv(first_config);
}
while (s1.raft_instance->get_leader() != 2)
{
std::cout << "Waiting s1 to join to s2 quorum\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
nuraft::srv_config third_config(3, 0, "localhost:44446", "", false, 0);
auto ret3 = s2.raft_instance->add_srv(third_config);
if (!ret3->get_accepted())
{
std::cout << "failed to add server: "
<< ret3->get_result_str() << std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(100));
ret3 = s2.raft_instance->add_srv(third_config);
}
while (s3.raft_instance->get_leader() != 2)
{
std::cout << "Waiting s3 to join to s2 quorum\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
/// S2 is leader
EXPECT_EQ(s1.raft_instance->get_leader(), 2);
EXPECT_EQ(s2.raft_instance->get_leader(), 2);
EXPECT_EQ(s3.raft_instance->get_leader(), 2);
std::cerr << "Starting to add entries\n";
auto entry = getBuffer(1);
auto ret = s2.raft_instance->append_entries({entry});
while (!ret->get_accepted() || ret->get_result_code() != nuraft::cmd_result_code::OK)
{
std::cerr << ret->get_accepted() << "failed to replicate: entry 1" << ret->get_result_code() << std::endl;
ret = s2.raft_instance->append_entries({entry});
}
while (s1.state_machine->getValue() != 1)
{
std::cout << "Waiting s1 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s2.state_machine->getValue() != 1)
{
std::cout << "Waiting s2 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s3.state_machine->getValue() != 1)
{
std::cout << "Waiting s3 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
EXPECT_EQ(s1.state_machine->getValue(), 1);
EXPECT_EQ(s2.state_machine->getValue(), 1);
EXPECT_EQ(s3.state_machine->getValue(), 1);
auto non_leader_entry = getBuffer(3);
auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry});
EXPECT_FALSE(ret_non_leader1->get_accepted());
auto ret_non_leader3 = s3.raft_instance->append_entries({non_leader_entry});
EXPECT_FALSE(ret_non_leader3->get_accepted());
auto leader_entry = getBuffer(77);
auto ret_leader = s2.raft_instance->append_entries({leader_entry});
while (!ret_leader->get_accepted() || ret_leader->get_result_code() != nuraft::cmd_result_code::OK)
{
std::cerr << "failed to replicate: entry 78" << ret_leader->get_result_code() << std::endl;
ret_leader = s2.raft_instance->append_entries({leader_entry});
}
while (s1.state_machine->getValue() != 78)
{
std::cout << "Waiting s1 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
while (s3.state_machine->getValue() != 78)
{
std::cout << "Waiting s3 to apply entry\n";
std::this_thread::sleep_for(std::chrono::milliseconds(100));
}
EXPECT_EQ(s1.state_machine->getValue(), 78);
EXPECT_EQ(s2.state_machine->getValue(), 78);
EXPECT_EQ(s3.state_machine->getValue(), 78);
s1.launcher.shutdown(5);
s2.launcher.shutdown(5);
s3.launcher.shutdown(5);
}
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
{
DB::WriteBufferFromNuraftBuffer buf;

View File

@ -11,6 +11,9 @@
#define DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_SECURE_MS 100
#define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300
#define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300
/// Timeouts for hedged requests.
#define DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS 100
#define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 2
/// Timeout for synchronous request-result protocol call (like Ping or TablesStatus).
#define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5
#define DBMS_DEFAULT_POLL_INTERVAL 10

View File

@ -55,6 +55,10 @@ class IColumn;
M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \
M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \
M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \
M(Milliseconds, hedged_connection_timeout, DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS, "Connection timeout for establishing connection with replica for Hedged requests", 0) \
M(Seconds, receive_data_timeout, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \
M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \
M(Bool, allow_changing_replica_until_first_data_packet, false, "Allow HedgedConnections to change replica until receiving first data packet", 0) \
M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \
M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \
M(Milliseconds, replace_running_query_max_wait_ms, 5000, "The wait time for running query with the same query_id to finish when setting 'replace_running_query' is active.", 0) \
@ -215,6 +219,10 @@ class IColumn;
M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \
M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \
\
/** Settings for testing hedged requests */ \
M(Int64, sleep_in_send_tables_status, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \
M(Int64, sleep_in_send_data, 0, "Time to sleep in sending data in TCPHandler", 0) \
\
M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \
M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \
M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \
@ -437,6 +445,7 @@ class IColumn;
M(UnionMode, union_default_mode, UnionMode::Unspecified, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \
M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \
M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \
M(UInt64, query_plan_max_optimizations_to_apply, 10000, "Limit the total number of optimizations applied to query plan. If zero, ignored. If limit reached, throw exception", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS below.

View File

@ -13,6 +13,8 @@
#include <Interpreters/InternalTextLogsQueue.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <Common/FiberStack.h>
#include <Client/MultiplexedConnections.h>
#include <Client/HedgedConnections.h>
#include <Storages/MergeTree/MergeTreeDataPartUUID.h>
namespace DB
@ -31,23 +33,23 @@ RemoteQueryExecutor::RemoteQueryExecutor(
: header(header_), query(query_), context(context_)
, scalars(scalars_), external_tables(external_tables_), stage(stage_)
{
create_multiplexed_connections = [this, &connection, throttler]()
create_connections = [this, &connection, throttler]()
{
return std::make_unique<MultiplexedConnections>(connection, context.getSettingsRef(), throttler);
};
}
RemoteQueryExecutor::RemoteQueryExecutor(
std::vector<IConnectionPool::Entry> && connections,
std::vector<IConnectionPool::Entry> && connections_,
const String & query_, const Block & header_, const Context & context_,
const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_)
: header(header_), query(query_), context(context_)
, scalars(scalars_), external_tables(external_tables_), stage(stage_)
{
create_multiplexed_connections = [this, connections, throttler]() mutable
create_connections = [this, connections_, throttler]() mutable
{
return std::make_unique<MultiplexedConnections>(
std::move(connections), context.getSettingsRef(), throttler);
std::move(connections_), context.getSettingsRef(), throttler);
};
}
@ -58,23 +60,34 @@ RemoteQueryExecutor::RemoteQueryExecutor(
: header(header_), query(query_), context(context_)
, scalars(scalars_), external_tables(external_tables_), stage(stage_)
{
create_multiplexed_connections = [this, pool, throttler]()
create_connections = [this, pool, throttler]() -> std::unique_ptr<IConnections>
{
const Settings & current_settings = context.getSettingsRef();
auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings);
std::vector<IConnectionPool::Entry> connections;
#if defined(OS_LINUX)
if (current_settings.use_hedged_requests)
{
std::shared_ptr<QualifiedTableName> table_to_check = nullptr;
if (main_table)
table_to_check = std::make_shared<QualifiedTableName>(main_table.getQualifiedName());
return std::make_unique<HedgedConnections>(pool, current_settings, timeouts, throttler, pool_mode, table_to_check);
}
#endif
std::vector<IConnectionPool::Entry> connection_entries;
if (main_table)
{
auto try_results = pool->getManyChecked(timeouts, &current_settings, pool_mode, main_table.getQualifiedName());
connections.reserve(try_results.size());
connection_entries.reserve(try_results.size());
for (auto & try_result : try_results)
connections.emplace_back(std::move(try_result.entry));
connection_entries.emplace_back(std::move(try_result.entry));
}
else
connections = pool->getMany(timeouts, &current_settings, pool_mode);
connection_entries = pool->getMany(timeouts, &current_settings, pool_mode);
return std::make_unique<MultiplexedConnections>(
std::move(connections), current_settings, throttler);
return std::make_unique<MultiplexedConnections>(std::move(connection_entries), current_settings, throttler);
};
}
@ -85,7 +98,7 @@ RemoteQueryExecutor::~RemoteQueryExecutor()
* these connections did not remain hanging in the out-of-sync state.
*/
if (established || isQueryPending())
multiplexed_connections->disconnect();
connections->disconnect();
}
/** If we receive a block with slightly different column types, or with excessive columns,
@ -142,10 +155,10 @@ void RemoteQueryExecutor::sendQuery()
if (sent_query)
return;
multiplexed_connections = create_multiplexed_connections();
connections = create_connections();
const auto & settings = context.getSettingsRef();
if (settings.skip_unavailable_shards && 0 == multiplexed_connections->size())
if (settings.skip_unavailable_shards && 0 == connections->size())
return;
/// Query cannot be canceled in the middle of the send query,
@ -173,10 +186,10 @@ void RemoteQueryExecutor::sendQuery()
{
std::lock_guard lock(duplicated_part_uuids_mutex);
if (!duplicated_part_uuids.empty())
multiplexed_connections->sendIgnoredPartUUIDs(duplicated_part_uuids);
connections->sendIgnoredPartUUIDs(duplicated_part_uuids);
}
multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true);
connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true);
established = false;
sent_query = true;
@ -192,7 +205,7 @@ Block RemoteQueryExecutor::read()
{
sendQuery();
if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size()))
return {};
}
@ -201,7 +214,7 @@ Block RemoteQueryExecutor::read()
if (was_cancelled)
return Block();
Packet packet = multiplexed_connections->receivePacket();
Packet packet = connections->receivePacket();
if (auto block = processPacket(std::move(packet)))
return *block;
@ -218,7 +231,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
{
sendQuery();
if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size()))
return Block();
}
@ -228,7 +241,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
if (was_cancelled)
return Block();
read_context = std::make_unique<ReadContext>(*multiplexed_connections);
read_context = std::make_unique<ReadContext>(*connections);
}
do
@ -239,7 +252,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
if (read_context->is_read_in_progress.load(std::memory_order_relaxed))
{
read_context->setTimer();
return read_context->epoll_fd;
return read_context->epoll.getFileDescriptor();
}
else
{
@ -260,7 +273,7 @@ std::variant<Block, int> RemoteQueryExecutor::restartQueryWithoutDuplicatedUUIDs
{
/// Cancel previous query and disconnect before retry.
cancel(read_context);
multiplexed_connections->disconnect();
connections->disconnect();
/// Only resend once, otherwise throw an exception
if (!resent_query)
@ -300,7 +313,7 @@ std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
break;
case Protocol::Server::EndOfStream:
if (!multiplexed_connections->hasActiveConnections())
if (!connections->hasActiveConnections())
{
finished = true;
return Block();
@ -342,7 +355,7 @@ std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
got_unknown_packet_from_replica = true;
throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
toString(packet.type),
multiplexed_connections->dumpAddresses());
connections->dumpAddresses());
}
return {};
@ -382,7 +395,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
tryCancel("Cancelling query because enough data has been read", read_context);
/// Get the remaining packets so that there is no out of sync in the connections to the replicas.
Packet packet = multiplexed_connections->drain();
Packet packet = connections->drain();
switch (packet.type)
{
case Protocol::Server::EndOfStream:
@ -404,7 +417,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
got_unknown_packet_from_replica = true;
throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
toString(packet.type),
multiplexed_connections->dumpAddresses());
connections->dumpAddresses());
}
}
@ -427,14 +440,14 @@ void RemoteQueryExecutor::cancel(std::unique_ptr<ReadContext> * read_context)
void RemoteQueryExecutor::sendScalars()
{
multiplexed_connections->sendScalarsData(scalars);
connections->sendScalarsData(scalars);
}
void RemoteQueryExecutor::sendExternalTables()
{
SelectQueryInfo query_info;
size_t count = multiplexed_connections->size();
size_t count = connections->size();
{
std::lock_guard lock(external_tables_mutex);
@ -472,7 +485,7 @@ void RemoteQueryExecutor::sendExternalTables()
}
}
multiplexed_connections->sendExternalTablesData(external_tables_data);
connections->sendExternalTablesData(external_tables_data);
}
void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context)
@ -489,11 +502,11 @@ void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadCon
if (read_context && *read_context)
(*read_context)->cancel();
multiplexed_connections->sendCancel();
connections->sendCancel();
}
if (log)
LOG_TRACE(log, "({}) {}", multiplexed_connections->dumpAddresses(), reason);
LOG_TRACE(log, "({}) {}", connections->dumpAddresses(), reason);
}
bool RemoteQueryExecutor::isQueryPending() const

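With this change, the asynchronous read() can hand back a file descriptor instead of a Block; a caller sketch, assuming the ReadContext alias from the signatures above (pollFd and onBlock are hypothetical):

    std::unique_ptr<RemoteQueryExecutor::ReadContext> read_context;
    auto result = executor.read(read_context);
    if (auto * fd = std::get_if<int>(&result))
        pollFd(*fd);                         /// wait for readiness, then call read() again to resume the fiber
    else
        onBlock(std::get<Block>(result));    /// a block (possibly empty) is ready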
View File

@ -1,7 +1,8 @@
#pragma once
#include <Client/ConnectionPool.h>
#include <Client/MultiplexedConnections.h>
#include <Client/IConnections.h>
#include <Client/ConnectionPoolWithFailover.h>
#include <Storages/IStorage_fwd.h>
#include <Interpreters/Context.h>
#include <Interpreters/StorageID.h>
@ -40,7 +41,7 @@ public:
/// Accepts several connections already taken from pool.
RemoteQueryExecutor(
std::vector<IConnectionPool::Entry> && connections,
std::vector<IConnectionPool::Entry> && connections_,
const String & query_, const Block & header_, const Context & context_,
const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(),
QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete);
@ -103,8 +104,8 @@ private:
Block totals;
Block extremes;
std::function<std::unique_ptr<MultiplexedConnections>()> create_multiplexed_connections;
std::unique_ptr<MultiplexedConnections> multiplexed_connections;
std::function<std::unique_ptr<IConnections>()> create_connections;
std::unique_ptr<IConnections> connections;
const String query;
String query_id = "";

View File

@ -3,7 +3,7 @@
#include <DataStreams/RemoteQueryExecutorReadContext.h>
#include <Common/Exception.h>
#include <Common/NetException.h>
#include <Client/MultiplexedConnections.h>
#include <Client/IConnections.h>
#include <sys/epoll.h>
namespace DB
@ -11,7 +11,7 @@ namespace DB
struct RemoteQueryExecutorRoutine
{
MultiplexedConnections & connections;
IConnections & connections;
RemoteQueryExecutorReadContext & read_context;
struct ReadCallback
@ -19,15 +19,15 @@ struct RemoteQueryExecutorRoutine
RemoteQueryExecutorReadContext & read_context;
Fiber & fiber;
void operator()(Poco::Net::Socket & socket)
void operator()(int fd, const Poco::Timespan & timeout = 0, const std::string & fd_description = "")
{
try
{
read_context.setSocket(socket);
read_context.setConnectionFD(fd, timeout, fd_description);
}
catch (DB::Exception & e)
{
e.addMessage(" while reading from socket ({})", socket.peerAddress().toString());
e.addMessage(" while reading from {}", fd_description);
throw;
}
@ -70,60 +70,38 @@ namespace ErrorCodes
extern const int SOCKET_TIMEOUT;
}
RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(MultiplexedConnections & connections_)
RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(IConnections & connections_)
: connections(connections_)
{
epoll_fd = epoll_create(2);
if (-1 == epoll_fd)
throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE);
if (-1 == pipe2(pipe_fd, O_NONBLOCK))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE);
{
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.fd = pipe_fd[0];
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event))
throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
epoll.add(pipe_fd[0]);
}
{
epoll_event timer_event;
timer_event.events = EPOLLIN | EPOLLPRI;
timer_event.data.fd = timer.getDescriptor();
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event))
throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
epoll.add(timer.getDescriptor());
}
auto routine = RemoteQueryExecutorRoutine{connections, *this};
fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine));
}
void RemoteQueryExecutorReadContext::setSocket(Poco::Net::Socket & socket)
void RemoteQueryExecutorReadContext::setConnectionFD(int fd, const Poco::Timespan & timeout, const std::string & fd_description)
{
int fd = socket.impl()->sockfd();
if (fd == socket_fd)
if (fd == connection_fd)
return;
epoll_event socket_event;
socket_event.events = EPOLLIN | EPOLLPRI;
socket_event.data.fd = fd;
if (connection_fd != -1)
epoll.remove(connection_fd);
if (socket_fd != -1)
{
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event))
throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
}
connection_fd = fd;
epoll.add(connection_fd);
socket_fd = fd;
if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event))
throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE);
receive_timeout = socket.impl()->getReceiveTimeout();
receive_timeout = timeout;
connection_fd_description = fd_description;
}
bool RemoteQueryExecutorReadContext::checkTimeout() const
@ -142,17 +120,11 @@ bool RemoteQueryExecutorReadContext::checkTimeout() const
bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const
{
/// Wait for epoll will not block if it was polled externally.
epoll_event events[3];
events[0].data.fd = events[1].data.fd = events[2].data.fd = -1;
/// Waiting on epoll_fd will not block if it was already polled externally.
int num_events = 0;
while (num_events <= 0)
{
num_events = epoll_wait(epoll_fd, events, 3, -1);
if (num_events == -1 && errno != EINTR)
throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET);
}
int num_events = epoll.getManyReady(3, events, /* blocking = */ false);
bool is_socket_ready = false;
bool is_pipe_alarmed = false;
@ -160,7 +132,7 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const
for (int i = 0; i < num_events; ++i)
{
if (events[i].data.fd == socket_fd)
if (events[i].data.fd == connection_fd)
is_socket_ready = true;
if (events[i].data.fd == timer.getDescriptor())
has_timer_alarm = true;
@ -229,9 +201,7 @@ void RemoteQueryExecutorReadContext::cancel()
RemoteQueryExecutorReadContext::~RemoteQueryExecutorReadContext()
{
/// socket_fd is closed by Poco::Net::Socket
if (epoll_fd != -1)
close(epoll_fd);
/// connection_fd is closed by Poco::Net::Socket or Epoll
if (pipe_fd[0] != -1)
close(pipe_fd[0]);
if (pipe_fd[1] != -1)

View File

@ -7,7 +7,9 @@
#include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h>
#include <Common/Epoll.h>
#include <Client/Connection.h>
#include <Client/IConnections.h>
#include <Poco/Timespan.h>
namespace Poco::Net
@ -33,26 +35,29 @@ public:
std::mutex fiber_lock;
Poco::Timespan receive_timeout;
MultiplexedConnections & connections;
IConnections & connections;
Poco::Net::Socket * last_used_socket = nullptr;
/// Here we have three descriptors we are going to wait on:
/// * socket_fd is a descriptor of connection. It may be changed in case of reading from several replicas.
/// * connection_fd is a descriptor of connection. It may be changed in case of reading from several replicas.
/// * timer is a timerfd descriptor to manually check socket timeout
/// * pipe_fd is a pipe we use to cancel query and socket polling by executor.
/// We put those descriptors into our own epoll_fd which is used by external executor.
/// We put those descriptors into our own epoll which is used by external executor.
TimerDescriptor timer{CLOCK_MONOTONIC, 0};
int socket_fd = -1;
int epoll_fd = -1;
int connection_fd = -1;
int pipe_fd[2] = { -1, -1 };
explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_);
Epoll epoll;
std::string connection_fd_description;
explicit RemoteQueryExecutorReadContext(IConnections & connections_);
~RemoteQueryExecutorReadContext();
bool checkTimeout() const;
bool checkTimeoutImpl() const;
void setSocket(Poco::Net::Socket & socket);
void setConnectionFD(int fd, const Poco::Timespan & timeout = 0, const std::string & fd_description = "");
void setTimer() const;
bool resumeRoutine();
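
As context for the descriptor scheme described in the comment above, here is a minimal standalone sketch (an illustration under assumed names, not ClickHouse's Epoll wrapper; the 5-second timeout and the use of stdin as a stand-in connection are assumptions) of waiting on a connection descriptor, a timerfd and a cancellation pipe through a single epoll instance:

```cpp
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    int epfd = epoll_create1(0);
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, 0);
    int pipe_fd[2];
    if (pipe(pipe_fd) == -1)
        return 1;

    /// Arm the timer to fire once after 5 seconds (stands in for the socket timeout).
    itimerspec timeout{};
    timeout.it_value.tv_sec = 5;
    timerfd_settime(timer_fd, 0, &timeout, nullptr);

    int connection_fd = STDIN_FILENO; /// stdin stands in for the connection socket.

    int fds[] = {connection_fd, timer_fd, pipe_fd[0]};
    for (int fd : fds)
    {
        epoll_event event{};
        event.events = EPOLLIN;
        event.data.fd = fd;
        epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &event);
    }

    /// Wake up on whichever comes first: data, timeout, or cancellation.
    epoll_event events[3];
    int num_events = epoll_wait(epfd, events, 3, -1);
    for (int i = 0; i < num_events; ++i)
    {
        if (events[i].data.fd == connection_fd)
            puts("connection is ready");
        else if (events[i].data.fd == timer_fd)
            puts("timed out");
        else
            puts("cancelled via pipe");
    }

    close(epfd);
    close(timer_fd);
    close(pipe_fd[0]);
    close(pipe_fd[1]);
}
```

To cancel such a wait, another thread simply writes one byte to pipe_fd[1], which is exactly the role pipe_fd plays for the executor here.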


@ -22,7 +22,7 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db
/// We also need a similar graph to load tables in topological order on server startup.
}
void DatabaseReplicatedDDLWorker::initializeMainThread()
bool DatabaseReplicatedDDLWorker::initializeMainThread()
{
while (!stop_flag)
{
@ -33,7 +33,7 @@ void DatabaseReplicatedDDLWorker::initializeMainThread()
database->tryConnectToZooKeeperAndInitDatabase(false);
initializeReplication();
initialized = true;
return;
return true;
}
catch (...)
{
@ -41,6 +41,8 @@ void DatabaseReplicatedDDLWorker::initializeMainThread()
sleepForSeconds(5);
}
}
return false;
}
void DatabaseReplicatedDDLWorker::shutdown()
@ -61,7 +63,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication()
if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr)
database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr);
else
last_skipped_entry_name.emplace(log_ptr_str);
last_skipped_entry_name.emplace(DDLTaskBase::getLogEntryName(our_log_ptr));
}
String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)


@ -30,7 +30,7 @@ public:
void shutdown() override;
private:
void initializeMainThread() override;
bool initializeMainThread() override;
void initializeReplication();
DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override;


@ -0,0 +1,41 @@
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunctionImpl.h>
#include <Interpreters/Context.h>
namespace DB
{
/// Get the connection ID. It is used by the MySQL handler only.
class FunctionConnectionID : public IFunction
{
public:
static constexpr auto name = "connectionID";
explicit FunctionConnectionID(const Context & context_) : context(context_) {}
static FunctionPtr create(const Context & context) { return std::make_shared<FunctionConnectionID>(context); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 0; }
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override { return std::make_shared<DataTypeUInt64>(); }
ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr & result_type, size_t input_rows_count) const override
{
return result_type->createColumnConst(input_rows_count, context.getClientInfo().connection_id);
}
private:
const Context & context;
};
void registerFunctionConnectionID(FunctionFactory & factory)
{
factory.registerFunction<FunctionConnectionID>();
factory.registerAlias("connection_id", "connectionID");
}
}
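
A hedged usage note: with the registration above, queries arriving over ClickHouse's MySQL wire protocol can evaluate connectionID(), and the connection_id alias mirrors the snake_case spelling MySQL clients tend to use. Over other interfaces the function simply returns 0, since connection_id in ClientInfo defaults to 0 and only the MySQL handler fills it in.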


@ -77,8 +77,11 @@ private:
DataTypePtr type;
Field value;
};
std::unordered_map<String, TypeAndValue> global_variable_map = {
{"max_allowed_packet", {std::make_shared<DataTypeInt32>(), 67108864}}, {"version", {std::make_shared<DataTypeString>(), "5.7.30"}}};
std::unordered_map<String, TypeAndValue> global_variable_map
= {{"max_allowed_packet", {std::make_shared<DataTypeInt32>(), 67108864}},
{"version", {std::make_shared<DataTypeString>(), "5.7.30"}},
{"version_comment", {std::make_shared<DataTypeString>(), ""}},
{"transaction_isolation", {std::make_shared<DataTypeString>(), "READ-UNCOMMITTED"}}};
};
}


@ -24,9 +24,25 @@ template <typename A, typename B>
struct DivideIntegralByConstantImpl
: BinaryOperation<A, B, DivideIntegralImpl<A, B>>
{
using ResultType = typename DivideIntegralImpl<A, B>::ResultType;
using Op = DivideIntegralImpl<A, B>;
using ResultType = typename Op::ResultType;
static const constexpr bool allow_fixed_string = false;
template <OpCase op_case>
static void NO_INLINE process(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t size)
{
if constexpr (op_case == OpCase::Vector)
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<ResultType>(a[i], b[i]);
else if constexpr (op_case == OpCase::LeftConstant)
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<ResultType>(*a, b[i]);
else
vectorConstant(a, *b, c, size);
}
static ResultType process(A a, B b) { return Op::template apply<ResultType>(a, b); }
static NO_INLINE void vectorConstant(const A * __restrict a_pos, B b, ResultType * __restrict c_pos, size_t size)
{
#pragma GCC diagnostic push


@ -24,10 +24,26 @@ template <typename A, typename B>
struct ModuloByConstantImpl
: BinaryOperation<A, B, ModuloImpl<A, B>>
{
using ResultType = typename ModuloImpl<A, B>::ResultType;
using Op = ModuloImpl<A, B>;
using ResultType = typename Op::ResultType;
static const constexpr bool allow_fixed_string = false;
static NO_INLINE void vectorConstant(const A * __restrict src, B b, ResultType * __restrict dst, size_t size)
template <OpCase op_case>
static void NO_INLINE process(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t size)
{
if constexpr (op_case == OpCase::Vector)
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<ResultType>(a[i], b[i]);
else if constexpr (op_case == OpCase::LeftConstant)
for (size_t i = 0; i < size; ++i)
c[i] = Op::template apply<ResultType>(*a, b[i]);
else
vectorConstant(a, *b, c, size);
}
static ResultType process(A a, B b) { return Op::template apply<ResultType>(a, b); }
static void NO_INLINE vectorConstant(const A * __restrict src, B b, ResultType * __restrict dst, size_t size)
{
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wsign-compare"
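
For context, both ByConstant specializations above implement the same OpCase dispatch. A compact consolidated sketch of the pattern (assumed names; not the actual header):

```cpp
#include <cstddef>

enum class OpCase { Vector, LeftConstant, RightConstant };

template <OpCase op_case, typename Op, typename A, typename B, typename ResultType>
void process(const A * __restrict a, const B * __restrict b, ResultType * __restrict c, size_t size)
{
    for (size_t i = 0; i < size; ++i)
    {
        if constexpr (op_case == OpCase::Vector)            /// c[i] = a[i] op b[i]
            c[i] = Op::template apply<ResultType>(a[i], b[i]);
        else if constexpr (op_case == OpCase::LeftConstant) /// c[i] = *a op b[i]
            c[i] = Op::template apply<ResultType>(*a, b[i]);
        else                                                /// c[i] = a[i] op *b
            c[i] = Op::template apply<ResultType>(a[i], *b);
    }
}
```

The real RightConstant case does not loop like this: as the diff shows, it forwards to the optimized vectorConstant, which can replace division or modulo by a constant with cheaper multiplications (e.g. via libdivide).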


@ -69,6 +69,7 @@ void registerFunctionErrorCodeToName(FunctionFactory &);
void registerFunctionTcpPort(FunctionFactory &);
void registerFunctionByteSize(FunctionFactory &);
void registerFunctionFile(FunctionFactory & factory);
void registerFunctionConnectionID(FunctionFactory & factory);
#if USE_ICU
void registerFunctionConvertCharset(FunctionFactory &);
@ -138,6 +139,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionTcpPort(factory);
registerFunctionByteSize(factory);
registerFunctionFile(factory);
registerFunctionConnectionID(factory);
#if USE_ICU
registerFunctionConvertCharset(factory);


@ -210,6 +210,7 @@ SRCS(
cbrt.cpp
coalesce.cpp
concat.cpp
connectionID.cpp
convertCharset.cpp
cos.cpp
cosh.cpp


@ -17,6 +17,10 @@ struct ConnectionTimeouts
Poco::Timespan http_keep_alive_timeout;
Poco::Timespan secure_connection_timeout;
/// Timeouts for HedgedConnections
Poco::Timespan hedged_connection_timeout;
Poco::Timespan receive_data_timeout;
ConnectionTimeouts() = default;
ConnectionTimeouts(const Poco::Timespan & connection_timeout_,
@ -27,7 +31,9 @@ struct ConnectionTimeouts
receive_timeout(receive_timeout_),
tcp_keep_alive_timeout(0),
http_keep_alive_timeout(0),
secure_connection_timeout(connection_timeout)
secure_connection_timeout(connection_timeout),
hedged_connection_timeout(receive_timeout_),
receive_data_timeout(receive_timeout_)
{
}
@ -40,7 +46,9 @@ struct ConnectionTimeouts
receive_timeout(receive_timeout_),
tcp_keep_alive_timeout(tcp_keep_alive_timeout_),
http_keep_alive_timeout(0),
secure_connection_timeout(connection_timeout)
secure_connection_timeout(connection_timeout),
hedged_connection_timeout(receive_timeout_),
receive_data_timeout(receive_timeout_)
{
}
ConnectionTimeouts(const Poco::Timespan & connection_timeout_,
@ -53,7 +61,9 @@ struct ConnectionTimeouts
receive_timeout(receive_timeout_),
tcp_keep_alive_timeout(tcp_keep_alive_timeout_),
http_keep_alive_timeout(http_keep_alive_timeout_),
secure_connection_timeout(connection_timeout)
secure_connection_timeout(connection_timeout),
hedged_connection_timeout(receive_timeout_),
receive_data_timeout(receive_timeout_)
{
}
@ -62,13 +72,17 @@ struct ConnectionTimeouts
const Poco::Timespan & receive_timeout_,
const Poco::Timespan & tcp_keep_alive_timeout_,
const Poco::Timespan & http_keep_alive_timeout_,
const Poco::Timespan & secure_connection_timeout_)
: connection_timeout(connection_timeout_),
send_timeout(send_timeout_),
receive_timeout(receive_timeout_),
tcp_keep_alive_timeout(tcp_keep_alive_timeout_),
http_keep_alive_timeout(http_keep_alive_timeout_),
secure_connection_timeout(secure_connection_timeout_)
const Poco::Timespan & secure_connection_timeout_,
const Poco::Timespan & receive_hello_timeout_,
const Poco::Timespan & receive_data_timeout_)
: connection_timeout(connection_timeout_),
send_timeout(send_timeout_),
receive_timeout(receive_timeout_),
tcp_keep_alive_timeout(tcp_keep_alive_timeout_),
http_keep_alive_timeout(http_keep_alive_timeout_),
secure_connection_timeout(secure_connection_timeout_),
hedged_connection_timeout(receive_hello_timeout_),
receive_data_timeout(receive_data_timeout_)
{
}
@ -87,7 +101,9 @@ struct ConnectionTimeouts
saturate(receive_timeout, limit),
saturate(tcp_keep_alive_timeout, limit),
saturate(http_keep_alive_timeout, limit),
saturate(secure_connection_timeout, limit));
saturate(secure_connection_timeout, limit),
saturate(hedged_connection_timeout, limit),
saturate(receive_data_timeout, limit));
}
/// Timeouts for the case when we have just a single attempt to connect.


@ -16,7 +16,15 @@ inline ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(cons
/// Timeouts for the case when we will try many addresses in a loop.
inline ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithFailover(const Settings & settings)
{
return ConnectionTimeouts(settings.connect_timeout_with_failover_ms, settings.send_timeout, settings.receive_timeout, settings.tcp_keep_alive_timeout, 0, settings.connect_timeout_with_failover_secure_ms);
return ConnectionTimeouts(
settings.connect_timeout_with_failover_ms,
settings.send_timeout,
settings.receive_timeout,
settings.tcp_keep_alive_timeout,
0,
settings.connect_timeout_with_failover_secure_ms,
settings.hedged_connection_timeout,
settings.receive_data_timeout);
}
inline ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Context & context)


@ -14,7 +14,6 @@ namespace ProfileEvents
namespace DB
{
namespace ErrorCodes
{
extern const int NETWORK_ERROR;
@ -42,7 +41,7 @@ bool ReadBufferFromPocoSocket::nextImpl()
/// Note that receive timeout is not checked here. External code should check it while polling.
while (bytes_read < 0 && async_callback && errno == EAGAIN)
{
async_callback(socket);
async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description);
bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
}
}
@ -74,7 +73,10 @@ bool ReadBufferFromPocoSocket::nextImpl()
}
ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size)
: BufferWithOwnMemory<ReadBuffer>(buf_size), socket(socket_), peer_address(socket.peerAddress())
: BufferWithOwnMemory<ReadBuffer>(buf_size)
, socket(socket_)
, peer_address(socket.peerAddress())
, socket_description("socket (" + peer_address.toString() + ")")
{
}


@ -8,6 +8,8 @@
namespace DB
{
using AsyncCallback = std::function<void(int, const Poco::Timespan &, const std::string &)>;
/// Works with the ready Poco::Net::Socket. Blocking operations.
class ReadBufferFromPocoSocket : public BufferWithOwnMemory<ReadBuffer>
{
@ -27,10 +29,11 @@ public:
bool poll(size_t timeout_microseconds) const;
void setAsyncCallback(std::function<void(Poco::Net::Socket &)> async_callback_) { async_callback = std::move(async_callback_); }
void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
private:
std::function<void(Poco::Net::Socket &)> async_callback;
AsyncCallback async_callback;
std::string socket_description;
};
}
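
A plausible usage sketch of the new callback shape (hedged: `in` is an assumed ReadBufferFromPocoSocket instance and the lambda body is illustrative):

```cpp
in.setAsyncCallback([](int fd, const Poco::Timespan & timeout, const std::string & description)
{
    /// Called whenever a read would block (EAGAIN). Typically the executor
    /// suspends the current fiber here and lets an external epoll wait on `fd`,
    /// using `timeout` and `description` for timeout handling and diagnostics.
});
```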


@ -6,6 +6,7 @@
#include <Functions/IFunctionAdaptors.h>
#include <Functions/FunctionsConversion.h>
#include <Functions/materialize.h>
#include <Functions/FunctionsLogical.h>
#include <Interpreters/Context.h>
#include <Interpreters/ExpressionJIT.h>
#include <IO/WriteBufferFromString.h>
@ -80,14 +81,14 @@ ActionsDAG::Node & ActionsDAG::getNode(const std::string & name)
return **it;
}
const ActionsDAG::Node & ActionsDAG::addInput(std::string name, DataTypePtr type, bool can_replace)
const ActionsDAG::Node & ActionsDAG::addInput(std::string name, DataTypePtr type, bool can_replace, bool add_to_index)
{
Node node;
node.type = ActionType::INPUT;
node.result_type = std::move(type);
node.result_name = std::move(name);
return addNode(std::move(node), can_replace);
return addNode(std::move(node), can_replace, add_to_index);
}
const ActionsDAG::Node & ActionsDAG::addInput(ColumnWithTypeAndName column, bool can_replace)
@ -364,7 +365,7 @@ void ActionsDAG::removeUnusedActions(const std::vector<Node *> & required_nodes)
removeUnusedActions();
}
void ActionsDAG::removeUnusedActions()
void ActionsDAG::removeUnusedActions(bool allow_remove_inputs)
{
std::unordered_set<const Node *> visited_nodes;
std::stack<Node *> stack;
@ -388,6 +389,9 @@ void ActionsDAG::removeUnusedActions()
visited_nodes.insert(&node);
stack.push(&node);
}
if (node.type == ActionType::INPUT && !allow_remove_inputs)
visited_nodes.insert(&node);
}
while (!stack.empty())
@ -516,6 +520,11 @@ bool ActionsDAG::removeUnusedResult(const std::string & column_name)
if (col == child)
return false;
/// Do not remove the input if it is mentioned in the index several times.
for (const auto * node : index)
if (col == node)
return false;
/// Remove from nodes and inputs.
for (auto jt = nodes.begin(); jt != nodes.end(); ++jt)
{
@ -1203,4 +1212,340 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & co
return split(split_nodes);
}
namespace
{
struct ConjunctionNodes
{
std::vector<ActionsDAG::Node *> allowed;
std::vector<ActionsDAG::Node *> rejected;
};
/// Take a node whose result is a predicate.
/// Assume the predicate is a conjunction (possibly a trivial one).
/// Find the separate conjunct nodes and split them into allowed and rejected sets.
/// An allowed predicate is one that can be calculated using only nodes from the allowed_nodes set.
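/// Worked example (names assumed): for the predicate
///     and(equals(a, 1), like(b, '%pattern%'))
/// with allowed_nodes containing only the input `a`, the split is
///     allowed  = [equals(a, 1)]         -- computable from the allowed inputs
///     rejected = [like(b, '%pattern%')] -- must stay in the original filter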
ConjunctionNodes getConjunctionNodes(ActionsDAG::Node * predicate, std::unordered_set<const ActionsDAG::Node *> allowed_nodes)
{
ConjunctionNodes conjunction;
std::unordered_set<ActionsDAG::Node *> allowed;
std::unordered_set<ActionsDAG::Node *> rejected;
struct Frame
{
ActionsDAG::Node * node;
bool is_predicate = false;
size_t next_child_to_visit = 0;
size_t num_allowed_children = 0;
};
std::stack<Frame> stack;
std::unordered_set<ActionsDAG::Node *> visited_nodes;
stack.push(Frame{.node = predicate, .is_predicate = true});
visited_nodes.insert(predicate);
while (!stack.empty())
{
auto & cur = stack.top();
bool is_conjunction = cur.is_predicate
&& cur.node->type == ActionsDAG::ActionType::FUNCTION
&& cur.node->function_base->getName() == "and";
/// First, visit all the children.
while (cur.next_child_to_visit < cur.node->children.size())
{
auto * child = cur.node->children[cur.next_child_to_visit];
if (visited_nodes.count(child) == 0)
{
visited_nodes.insert(child);
stack.push({.node = child, .is_predicate = is_conjunction});
break;
}
if (allowed_nodes.contains(child))
++cur.num_allowed_children;
++cur.next_child_to_visit;
}
if (cur.next_child_to_visit == cur.node->children.size())
{
if (cur.num_allowed_children == cur.node->children.size())
{
if (cur.node->type != ActionsDAG::ActionType::ARRAY_JOIN && cur.node->type != ActionsDAG::ActionType::INPUT)
allowed_nodes.emplace(cur.node);
}
else if (is_conjunction)
{
for (auto * child : cur.node->children)
{
if (allowed_nodes.count(child))
{
if (allowed.insert(child).second)
conjunction.allowed.push_back(child);
}
}
}
else if (cur.is_predicate)
{
if (rejected.insert(cur.node).second)
conjunction.rejected.push_back(cur.node);
}
stack.pop();
}
}
if (conjunction.allowed.empty())
{
/// If nothing was added to the conjunction, check whether the predicate itself is trivially allowed.
if (allowed_nodes.count(predicate))
conjunction.allowed.push_back(predicate);
}
return conjunction;
}
ColumnsWithTypeAndName prepareFunctionArguments(const std::vector<ActionsDAG::Node *> nodes)
{
ColumnsWithTypeAndName arguments;
arguments.reserve(nodes.size());
for (const auto * child : nodes)
{
ColumnWithTypeAndName argument;
argument.column = child->column;
argument.type = child->result_type;
argument.name = child->result_name;
arguments.emplace_back(std::move(argument));
}
return arguments;
}
}
/// Create actions which calculate the conjunction of the selected nodes.
/// Assume the conjunction nodes are predicates (and may be used as arguments of the AND function).
///
/// The resulting actions add a single column with the conjunction result (it is always last in the index).
/// No other columns are added or removed.
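/// Continuing the example above (names assumed): cloneActionsForConjunction([equals(a, 1)])
/// yields a DAG with input `a` whose index ends with the equals(a, 1) column;
/// for two or more conjuncts, the cloned nodes are combined with a final AND.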
ActionsDAGPtr ActionsDAG::cloneActionsForConjunction(std::vector<Node *> conjunction)
{
if (conjunction.empty())
return nullptr;
auto actions = cloneEmpty();
actions->settings.project_input = false;
FunctionOverloadResolverPtr func_builder_and =
std::make_shared<FunctionOverloadResolverAdaptor>(
std::make_unique<DefaultOverloadResolver>(
std::make_shared<FunctionAnd>()));
std::unordered_map<const ActionsDAG::Node *, ActionsDAG::Node *> nodes_mapping;
struct Frame
{
const ActionsDAG::Node * node;
size_t next_child_to_visit = 0;
};
std::stack<Frame> stack;
/// DFS. Clone actions.
for (const auto * predicate : conjunction)
{
if (nodes_mapping.count(predicate))
continue;
stack.push({.node = predicate});
while (!stack.empty())
{
auto & cur = stack.top();
/// First, visit all the children.
while (cur.next_child_to_visit < cur.node->children.size())
{
auto * child = cur.node->children[cur.next_child_to_visit];
if (nodes_mapping.count(child) == 0)
{
stack.push({.node = child});
break;
}
++cur.next_child_to_visit;
}
if (cur.next_child_to_visit == cur.node->children.size())
{
auto & node = actions->nodes.emplace_back(*cur.node);
nodes_mapping[cur.node] = &node;
for (auto & child : node.children)
child = nodes_mapping[child];
if (node.type == ActionType::INPUT)
{
actions->inputs.emplace_back(&node);
actions->index.insert(&node);
}
stack.pop();
}
}
}
Node * result_predicate = nodes_mapping[*conjunction.begin()];
if (conjunction.size() > 1)
{
std::vector<Node *> args;
args.reserve(conjunction.size());
for (const auto * predicate : conjunction)
args.emplace_back(nodes_mapping[predicate]);
result_predicate = &actions->addFunction(func_builder_and, args, {}, true, false);
}
actions->index.insert(result_predicate);
return actions;
}
ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs)
{
Node * predicate;
{
auto it = index.begin();
for (; it != index.end(); ++it)
if ((*it)->result_name == filter_name)
break;
if (it == index.end())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Index for ActionsDAG does not contain filter column name {}. DAG:\n{}",
filter_name, dumpDAG());
predicate = *it;
}
std::unordered_set<const Node *> allowed_nodes;
/// Get input nodes from available_inputs names.
{
std::unordered_map<std::string_view, std::list<const Node *>> inputs_map;
for (const auto & input : inputs)
inputs_map[input->result_name].emplace_back(input);
for (const auto & name : available_inputs)
{
auto & inputs_list = inputs_map[name];
if (inputs_list.empty())
continue;
allowed_nodes.emplace(inputs_list.front());
inputs_list.pop_front();
}
}
auto conjunction = getConjunctionNodes(predicate, allowed_nodes);
auto actions = cloneActionsForConjunction(conjunction.allowed);
if (!actions)
return nullptr;
/// Now that the actions are created, update the current DAG.
if (conjunction.rejected.empty())
{
/// The whole predicate was split.
if (can_remove_filter)
{
/// If the filter column is not needed, remove it from the index.
for (auto i = index.begin(); i != index.end(); ++i)
{
if (*i == predicate)
{
index.remove(i);
break;
}
}
}
else
{
/// Replace the predicate result with the constant 1.
Node node;
node.type = ActionType::COLUMN;
node.result_name = std::move(predicate->result_name);
node.result_type = std::move(predicate->result_type);
node.column = node.result_type->createColumnConst(0, 1);
*predicate = std::move(node);
}
removeUnusedActions(false);
}
else
{
/// The predicate is a conjunction where both the allowed and rejected sets are non-empty.
/// Replace this node with the conjunction of the rejected predicates.
std::vector<Node *> new_children(conjunction.rejected.begin(), conjunction.rejected.end());
if (new_children.size() == 1)
{
/// The rejected set has only one predicate.
if (new_children.front()->result_type->equals(*predicate->result_type))
{
/// If its type is the same, just add an alias.
Node node;
node.type = ActionType::ALIAS;
node.result_name = predicate->result_name;
node.result_type = predicate->result_type;
node.children.swap(new_children);
*predicate = std::move(node);
}
else
{
/// If the type differs, cast the column.
/// This case is possible because AND can accept any numeric type as an argument.
Node node;
node.type = ActionType::COLUMN;
node.result_name = predicate->result_type->getName();
node.column = DataTypeString().createColumnConst(0, node.result_name);
node.result_type = std::make_shared<DataTypeString>();
auto * right_arg = &nodes.emplace_back(std::move(node));
auto * left_arg = new_children.front();
predicate->children = {left_arg, right_arg};
auto arguments = prepareFunctionArguments(predicate->children);
FunctionOverloadResolverPtr func_builder_cast =
std::make_shared<FunctionOverloadResolverAdaptor>(
CastOverloadResolver<CastType::nonAccurate>::createImpl(false));
predicate->function_builder = func_builder_cast;
predicate->function_base = predicate->function_builder->build(arguments);
predicate->function = predicate->function_base->prepare(arguments);
}
}
else
{
/// The predicate is the AND function, which still has more than one argument.
/// Just update the children and rebuild it.
predicate->children.swap(new_children);
auto arguments = prepareFunctionArguments(predicate->children);
predicate->function_base = predicate->function_builder->build(arguments);
predicate->function = predicate->function_base->prepare(arguments);
}
removeUnusedActions(false);
}
return actions;
}
}


@ -152,6 +152,9 @@ public:
}
};
/// NOTE: std::list is an implementation detail.
/// It allows adding and removing nodes in place, without reallocation.
/// Raw pointers to nodes remain valid (demonstrated in the sketch below).
using Nodes = std::list<Node>;
using Inputs = std::vector<Node *>;
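
The pointer-stability property called out in the note above is easy to demonstrate in isolation (a standalone sketch, not ClickHouse code):

```cpp
#include <cassert>
#include <list>
#include <vector>

int main()
{
    std::list<int> stable{1, 2, 3};
    const int * p = &stable.front();
    for (int i = 0; i < 1000; ++i)
        stable.push_back(i); /// std::list never relocates existing elements,
    assert(*p == 1);         /// so raw pointers into it stay valid.

    std::vector<int> unstable{1, 2, 3};
    const int * q = &unstable.front();
    unstable.reserve(unstable.capacity() + 1); /// Forces a reallocation.
    (void)q; /// q is now dangling; dereferencing it would be undefined behavior.
}
```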
@ -196,7 +199,7 @@ public:
std::string dumpNames() const;
std::string dumpDAG() const;
const Node & addInput(std::string name, DataTypePtr type, bool can_replace = false);
const Node & addInput(std::string name, DataTypePtr type, bool can_replace = false, bool add_to_index = true);
const Node & addInput(ColumnWithTypeAndName column, bool can_replace = false);
const Node & addColumn(ColumnWithTypeAndName column, bool can_replace = false, bool materialize = false);
const Node & addAlias(const std::string & name, std::string alias, bool can_replace = false);
@ -208,6 +211,8 @@ public:
const Context & context,
bool can_replace = false);
void addNodeToIndex(const Node * node) { index.insert(const_cast<Node *>(node)); }
/// Call addAlias several times.
void addAliases(const NamesWithAliases & aliases);
/// Add alias actions and remove unused columns from index. Also specify result columns order in index.
@ -220,7 +225,7 @@ public:
/// Return true if column was removed from inputs.
bool removeUnusedResult(const std::string & column_name);
void projectInput() { settings.project_input = true; }
void projectInput(bool project = true) { settings.project_input = project; }
void removeUnusedActions(const Names & required_names);
bool hasArrayJoin() const;
@ -278,6 +283,13 @@ public:
/// Index of initial actions must contain column_name.
SplitResult splitActionsForFilter(const std::string & column_name) const;
/// Create actions which may calculate part of the filter using only available_inputs.
/// If nothing can be calculated, returns nullptr.
/// Otherwise, returns actions whose inputs are taken from available_inputs.
/// The returned actions add a single column which may be used as a filter.
/// Also, may replace the fully pushed-down predicate in this DAG with the constant 1.
ActionsDAGPtr splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs);
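/// Hypothetical usage (names assumed), e.g. from a filter push-down optimization:
///     auto pushed = dag->splitActionsForFilter("filter_column", /* can_remove_filter = */ true, storage_column_names);
///     if (pushed)
///         /* apply `pushed` earlier in the pipeline; this DAG keeps only the rest */;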
private:
Node & addNode(Node node, bool can_replace = false, bool add_to_index = true);
Node & getNode(const std::string & name);
@ -302,10 +314,12 @@ private:
}
void removeUnusedActions(const std::vector<Node *> & required_nodes);
void removeUnusedActions();
void removeUnusedActions(bool allow_remove_inputs = true);
void addAliases(const NamesWithAliases & aliases, std::vector<Node *> & result_nodes);
void compileFunctions();
ActionsDAGPtr cloneActionsForConjunction(std::vector<Node *> conjunction);
};


@ -84,6 +84,9 @@ public:
String http_user_agent;
String http_referer;
/// For MySQL
UInt64 connection_id = 0;
/// Comma-separated list of forwarded IP addresses (from X-Forwarded-For for the HTTP interface).
/// It is expected that the proxy appends the forwarded address to the end of the list.
/// The element can be trusted only if you trust the corresponding proxy.


@ -284,7 +284,7 @@ void SelectStreamFactory::createForShard(
if (try_results.empty() || local_delay < max_remote_delay)
{
auto plan = createLocalPlan(modified_query_ast, header, context, stage);
return QueryPipeline::getPipe(std::move(*plan->buildQueryPipeline()));
return QueryPipeline::getPipe(std::move(*plan->buildQueryPipeline(QueryPlanOptimizationSettings(context.getSettingsRef()))));
}
else
{


@ -189,7 +189,7 @@ public:
void commit();
~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exception()); }
~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exceptions()); }
};
}


@ -305,20 +305,26 @@ static void filterAndSortQueueNodes(Strings & all_nodes)
std::sort(all_nodes.begin(), all_nodes.end());
}
void DDLWorker::scheduleTasks()
void DDLWorker::scheduleTasks(bool reinitialized)
{
LOG_DEBUG(log, "Scheduling tasks");
auto zookeeper = tryGetZooKeeper();
for (auto & task : current_tasks)
/// The main thread of DDLWorker was restarted, probably due to a lost ZooKeeper connection.
/// We have some unfinished tasks. To avoid duplicating queries, try to write their execution status.
if (reinitialized)
{
/// The main thread of DDLWorker was restarted, probably due to a lost ZooKeeper connection.
/// We have some unfinished tasks. To avoid duplicating queries, try to write their execution status.
bool task_still_exists = zookeeper->exists(task->entry_path);
bool status_written = zookeeper->exists(task->getFinishedNodePath());
if (task->was_executed && !status_written && task_still_exists)
for (auto & task : current_tasks)
{
processTask(*task, zookeeper);
if (task->was_executed)
{
bool task_still_exists = zookeeper->exists(task->entry_path);
bool status_written = zookeeper->exists(task->getFinishedNodePath());
if (!status_written && task_still_exists)
{
processTask(*task, zookeeper);
}
}
}
}
@ -332,19 +338,23 @@ void DDLWorker::scheduleTasks()
else if (max_tasks_in_queue < queue_nodes.size())
cleanup_event->set();
bool server_startup = current_tasks.empty();
/// Detect the queue start, using:
/// - skipped tasks
/// - in-memory tasks (that are currently active)
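/// Worked example (entry names assumed): with queue entries query-0000000005 .. query-0000000009,
/// a last in-memory task of query-0000000007 and last_skipped_entry_name = query-0000000008,
/// we get last_task_id = 7 < 8, so scheduling resumes from query-0000000009.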
auto begin_node = queue_nodes.begin();
if (!server_startup)
UInt64 last_task_id = 0;
if (!current_tasks.empty())
{
/// We will recheck the status of the last executed tasks. This is useful if the main thread was just restarted.
auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end());
String min_entry_name = last_skipped_entry_name ? std::min(min_task->entry_name, *last_skipped_entry_name) : min_task->entry_name;
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_entry_name);
current_tasks.clear();
auto & last_task = current_tasks.back();
last_task_id = DDLTaskBase::getLogEntryNumber(last_task->entry_name);
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_task->entry_name);
}
if (last_skipped_entry_name)
{
UInt64 last_skipped_entry_id = DDLTaskBase::getLogEntryNumber(*last_skipped_entry_name);
if (last_skipped_entry_id > last_task_id)
begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_skipped_entry_name);
}
assert(current_tasks.empty());
for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it)
{
@ -365,7 +375,7 @@ void DDLWorker::scheduleTasks()
if (worker_pool)
{
worker_pool->scheduleOrThrowOnError([this, &saved_task, &zookeeper]()
worker_pool->scheduleOrThrowOnError([this, &saved_task, zookeeper]()
{
setThreadName("DDLWorkerExec");
processTask(saved_task, zookeeper);
@ -930,11 +940,11 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry)
}
void DDLWorker::initializeMainThread()
bool DDLWorker::initializeMainThread()
{
assert(!initialized);
setThreadName("DDLWorker");
LOG_DEBUG(log, "Started DDLWorker thread");
LOG_DEBUG(log, "Initializing DDLWorker thread");
while (!stop_flag)
{
@ -943,7 +953,7 @@ void DDLWorker::initializeMainThread()
auto zookeeper = getAndSetZooKeeper();
zookeeper->createAncestors(fs::path(queue_dir) / "");
initialized = true;
return;
return true;
}
catch (const Coordination::Exception & e)
{
@ -964,6 +974,8 @@ void DDLWorker::initializeMainThread()
/// Avoid busy loop when ZooKeeper is not available.
sleepForSeconds(5);
}
return false;
}
void DDLWorker::runMainThread()
@ -989,15 +1001,19 @@ void DDLWorker::runMainThread()
{
try
{
bool reinitialized = !initialized;
/// Reinitialize DDLWorker state (including ZooKeeper connection) if required
if (!initialized)
{
initializeMainThread();
/// Stopped
if (!initializeMainThread())
break;
LOG_DEBUG(log, "Initialized DDLWorker thread");
}
cleanup_event->set();
scheduleTasks();
scheduleTasks(reinitialized);
LOG_DEBUG(log, "Waiting for queue updates");
queue_updated_event->wait();
@ -1007,6 +1023,9 @@ void DDLWorker::runMainThread()
if (Coordination::isHardwareError(e.code))
{
initialized = false;
/// Recreate the worker pool; destroying the old one waits for pending async tasks.
if (1 < pool_size)
worker_pool = std::make_unique<ThreadPool>(pool_size);
LOG_INFO(log, "Lost ZooKeeper connection, will try to connect again: {}", getCurrentExceptionMessage(true));
}
else


@ -69,7 +69,7 @@ protected:
ZooKeeperPtr getAndSetZooKeeper();
/// Iterates through the queue tasks in ZooKeeper and runs execution of new tasks.
void scheduleTasks();
void scheduleTasks(bool reinitialized);
DDLTaskBase & saveTask(DDLTaskPtr && task);
@ -104,7 +104,8 @@ protected:
/// Init task node
void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper);
virtual void initializeMainThread();
/// Return false if the worker was stopped (stop_flag = true)
virtual bool initializeMainThread();
void runMainThread();
void runCleanupThread();


@ -21,7 +21,7 @@
#include <IO/WriteHelpers.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/Executors/PullingAsyncPipelineExecutor.h>
namespace DB
{
@ -122,8 +122,10 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
try
{
PullingPipelineExecutor executor(io.pipeline);
if (!executor.pull(block))
PullingAsyncPipelineExecutor executor(io.pipeline);
while (block.rows() == 0 && executor.pull(block));
if (block.rows() == 0)
{
/// Interpret a subquery with an empty result as a Null literal.
auto ast_new = std::make_unique<ASTLiteral>(Null());
@ -132,7 +134,13 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr
return;
}
if (block.rows() != 1 || executor.pull(block))
if (block.rows() != 1)
throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY);
Block tmp_block;
while (tmp_block.rows() == 0 && executor.pull(tmp_block));
if (tmp_block.rows() != 0)
throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY);
}
catch (const Exception & e)

Some files were not shown because too many files have changed in this diff.