Merge master

2024-09-20 16:50:48 +00:00 · 2021-03-04 13:26:40 +03:00 · 2021-03-04 13:26:40 +03:00 · e69124a0a6
commit e69124a0a6
parent 98065ec56e b985e33294
222 changed files with 6294 additions and 1205 deletions
--- a/contrib/boost
+++ b/contrib/boost
@ -1 +1 @@
-Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79
+Subproject commit ee24fa55bc46e4d2ce7d0d052cc5a0d9b1be8c36
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -358,6 +358,8 @@ mkdir analyze analyze/tmp ||:
 build_log_column_definitions
 # Split the raw test output into files suitable for analysis.
 # To debug calculations only for a particular test, substitute a suitable
 # wildcard here, e.g. `for test_file in modulo-raw.tsv`.
 for test_file in *-raw.tsv
 do
    test_name=$(basename "$test_file" "-raw.tsv")
@ -467,7 +469,13 @@ create view broken_queries as
 create table query_run_metrics_for_stats engine File(
        TSV, -- do not add header -- will parse with grep
        'analyze/query-run-metrics-for-stats.tsv')
-    as select test, query_index, 0 run, version, metric_values
+    as select test, query_index, 0 run, version,
        -- For debugging, add a filter for a particular metric like this:
        -- arrayFilter(m, n -> n = 'client_time', metric_values, metric_names)
        --     metric_values
        -- Note that further reporting may break, because the metric names are
        -- not filtered.
        metric_values
    from query_run_metric_arrays
    where (test, query_index) not in broken_queries
    order by test, query_index, run, version
@ -585,8 +593,19 @@ create view query_metric_stats as
 -- Main statistics for queries -- query time as reported in query log.
 create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
-        abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
+        -- It is important to have a non-strict inequality with stat_threshold
-        abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
+        -- here. The randomization distribution is actually discrete, and when
        -- the number of runs is small, the quantile we need (e.g. 0.99) turns
        -- out to be the maximum value of the distribution. We can also hit this
        -- maximum possible value with our test run, and this obviously means
        -- that we have observed the difference to the best precision possible
        -- for the given number of runs. If we use a strict inequality here, we
        -- will miss such cases. This happened in the wild and led to some
        -- uncaught regressions, because for the default 7 runs we do for PRs,
        -- the randomization distribution has only 16 values, so the max quantile
        -- is actually 0.9375.
        abs(diff) > report_threshold        and abs(diff) >= stat_threshold as changed_fail,
        abs(diff) > report_threshold - 0.05 and abs(diff) >= stat_threshold as changed_show,
        not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
        not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
--- a/docker/test/performance-comparison/config/config.d/user_files.xml
+++ b/docker/test/performance-comparison/config/config.d/user_files.xml
@ -0,0 +1,7 @@
 <yandex>
    <!-- Directory with user provided files that are accessible by 'file' table function. -->
    <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
    <!-- Path to configuration file with users, access rights, profiles of settings, quotas. -->
    <users_config>users.xml</users_config>
 </yandex>
--- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
+++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
@ -19,4 +19,9 @@
            <max_threads>12</max_threads>
        </default>
    </profiles>
    <users>
        <default>
            <access_management>1</access_management>
        </default>
    </users>
 </yandex>
--- a/docker/test/performance-comparison/eqmed.sql
+++ b/docker/test/performance-comparison/eqmed.sql
@ -1,4 +1,6 @@
-- input is table(test text, query text, run UInt32, version int, metrics Array(float))
+-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
 -- Run like this:
 -- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
 select
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
@ -8,14 +10,19 @@ select
 from
   (
      -- quantiles of randomization distributions
      -- note that for small number of runs, the exact quantile might not make
      -- sense, because the last possible value of randomization distribution
      -- might take a larger percentage of distribution (i.e. the distribution
      -- actually has discrete values, and the last step can be large).
      select quantileExactForEach(0.99)(
        arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
      ) threshold
-      ---- uncomment to see what the distribution is really like
+      ---- Uncomment to see what the distribution is really like. This debug
-      --, uniqExact(d.1) u
+      ---- code only works for single (the first) metric.
      --, uniqExact(d[1]) u
      --, arraySort(x->x.1,
      --      arrayZip(
-      --          (sumMap([d.1], [1]) as f).1,
+      --          (sumMap([d[1]], [1]) as f).1,
      --          f.2)) full_histogram
      from
         (
--- a/docs/_description_templates/template-data-type.md
+++ b/docs/_description_templates/template-data-type.md
@ -26,4 +26,4 @@ The name of an additional section can be any, for example, **Usage**.
 -   [link](#)
-[Original article](https://clickhouse.tech/docs/en/data_types/<data-type-name>/) <!--hide-->
+[Original article](https://clickhouse.tech/docs/en/data-types/<data-type-name>/) <!--hide-->
--- a/docs/en/engines/table-engines/integrations/kafka.md
+++ b/docs/en/engines/table-engines/integrations/kafka.md
@ -38,20 +38,20 @@ SETTINGS
 Required parameters:
-   `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`).
+-   `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`).
-   `kafka_topic_list` – A list of Kafka topics.
+-   `kafka_topic_list` — A list of Kafka topics.
-   `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere.
+-   `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere.
-   `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
+-   `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section.
 Optional parameters:
-   `kafka_row_delimiter` – Delimiter character, which ends the message.
+-   `kafka_row_delimiter` — Delimiter character, which ends the message.
-   `kafka_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
+-   `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object.
-   `kafka_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition.
+-   `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition.
-   `kafka_max_block_size` - The maximum batch size (in messages) for poll (default: `max_block_size`).
+-   `kafka_max_block_size` — The maximum batch size (in messages) for poll (default: `max_block_size`).
-   `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data).
+-   `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data).
-   `kafka_commit_every_batch` - Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`).
+-   `kafka_commit_every_batch` — Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`).
-   `kafka_thread_per_consumer` - Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise - rows from several consumers squashed to form one block).
+-   `kafka_thread_per_consumer` — Provide an independent thread for each consumer (default: `0`). When enabled, every consumer flushes the data independently, in parallel (otherwise — rows from several consumers are squashed to form one block).
 Examples:
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1956,8 +1956,8 @@ Default value: 16.
 **See Also**
-   [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine
+-   [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine.
-   [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine
+-   [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine.
 ## validate_polygons {#validate_polygons}
@ -2658,8 +2658,6 @@ Result:
 Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour.
 [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
 ## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists}
 Allows to select data from a file engine table without file.
@ -2679,3 +2677,16 @@ Possible values:
 - 1 — Enabled.
 Default value: `0`.
 ## allow_experimental_geo_types {#allow-experimental-geo-types}
 Allows working with experimental [geo data types](../../sql-reference/data-types/geo.md).
 Possible values:
 -   0 — Working with geo data types is disabled.
 -   1 — Working with geo data types is enabled.
 Default value: `0`.
 [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
--- a/docs/en/operations/system-tables/index.md
+++ b/docs/en/operations/system-tables/index.md
@ -20,7 +20,7 @@ System tables:
 Most of system tables store their data in RAM. A ClickHouse server creates such system tables at the start.
-Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), crash_log and [text_log](../../operations/system-tables/text_log.md) are served by [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a storage filesystem by default. If you remove a table from a filesystem, the ClickHouse server creates the empty one again at the time of the next data writing. If system table schema changed in a new release, then ClickHouse renames the current table and creates a new one.
+Unlike other system tables, the system log tables [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) and [text_log](../../operations/system-tables/text_log.md) are served by [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table engine and store their data in a filesystem by default. If you remove a table from a filesystem, the ClickHouse server creates the empty one again at the time of the next data writing. If system table schema changed in a new release, then ClickHouse renames the current table and creates a new one.
 System log tables can be customized by creating a config file with the same name as the table under `/etc/clickhouse-server/config.d/`, or setting corresponding elements in `/etc/clickhouse-server/config.xml`. Elements that can be customized are:
@ -33,7 +33,7 @@ System log tables can be customized by creating a config file with the same name
 An example:
-```
+```xml
 <yandex>
    <query_log>
        <database>system</database>
--- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md
+++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md
@ -253,8 +253,8 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
 **Parameters**
-   `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
+-   `window` — Length of the sliding window. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
-   `mode` - It is an optional parameter.
+-   `mode` - It is an optional argument.
    -   `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values.
 **Returned value**
--- a/docs/en/sql-reference/data-types/geo.md
+++ b/docs/en/sql-reference/data-types/geo.md
@ -0,0 +1,106 @@
 ---
 toc_priority: 62
 toc_title: Geo
 ---
 # Geo Data Types {#geo-data-types}
 ClickHouse supports data types for representing geographical objects — locations, lands, etc.
 !!! warning "Warning"
    Currently geo data types are an experimental feature. To work with them you must set `allow_experimental_geo_types = 1`.
 **See Also**
 - [Representing simple geographical features](https://en.wikipedia.org/wiki/GeoJSON).
 - [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types) setting.
 ## Point {#point-data-type}
 `Point` is represented by its X and Y coordinates, stored as a [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)).
 **Example**
 Query:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_point (p Point) ENGINE = Memory();
 INSERT INTO geo_point VALUES((10, 10));
 SELECT p, toTypeName(p) FROM geo_point;
 ```
 Result: 
 ``` text
 ┌─p───────┬─toTypeName(p)─┐
 │ (10,10) │ Point         │
 └─────────┴───────────────┘
 ```
 ## Ring {#ring-data-type}
 `Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)).
 **Example**
 Query:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_ring (r Ring) ENGINE = Memory();
 INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]);
 SELECT r, toTypeName(r) FROM geo_ring;
 ```
 Result: 
 ``` text
 ┌─r─────────────────────────────┬─toTypeName(r)─┐
 │ [(0,0),(10,0),(10,10),(0,10)] │ Ring          │
 └───────────────────────────────┴───────────────┘
 ```
 ## Polygon {#polygon-data-type}
 `Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). The first element of the outer array is the outer shape of the polygon, and all the following elements are holes.
 **Example**
 This is a polygon with one hole:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory();
 INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]);
 SELECT pg, toTypeName(pg) FROM geo_polygon;
 ```
 Result: 
 ``` text
 ┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐
 │ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon        │
 └───────────────────────────────────────────────────────────────┴────────────────┘
 ```
 ## MultiPolygon {#multipolygon-data-type}
 `MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)). 
 **Example**
 This multipolygon consists of two separate polygons — the first one without holes, and the second with one hole:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory();
 INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]);
 SELECT mpg, toTypeName(mpg) FROM geo_multipolygon;
 ```
 Result: 
 ``` text
 ┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐
 │ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon    │
 └─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
 ```
 [Original article](https://clickhouse.tech/docs/en/data-types/geo/) <!--hide-->
--- a/docs/en/sql-reference/functions/encryption-functions.md
+++ b/docs/en/sql-reference/functions/encryption-functions.md
@ -55,7 +55,7 @@ CREATE TABLE encryption_test
    `comment` String,
    `secret` String
 )
-ENGINE = Memory
+ENGINE = Memory;
 ```
 Insert some data (please avoid storing the keys/ivs in the database as this undermines the whole concept of encryption); storing 'hints' is also unsafe and is used here only for illustrative purposes:
@ -110,7 +110,7 @@ Result:
 Compatible with MySQL encryption; the resulting ciphertext can be decrypted with the [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt) function.
-Will produce same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `IV`.
+Will produce the same ciphertext as `encrypt` on equal inputs. But when `key` or `iv` are longer than they should normally be, `aes_encrypt_mysql` will stick to what MySQL's `aes_encrypt` does: 'fold' `key` and ignore excess bits of `iv`.
 Supported encryption modes:
@ -138,7 +138,6 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
 - Ciphertext binary string. [String](../../sql-reference/data-types/string.md#string).
 **Examples**
 Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext:
@ -157,7 +156,6 @@ Result:
 └───────────────────┘
 ```
 But `encrypt` fails when `key` or `iv` is longer than expected:
 Query:
@ -252,7 +250,7 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad])
 **Examples**
-Re-using table from [encrypt](./encryption-functions.md#encrypt).
+Re-using table from [encrypt](#encrypt).
 Query:
@ -284,6 +282,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920
 ```
 Result:
 ``` text
 ┌─comment─────────────────────────────┬─plaintext─┐
 │ aes-256-cfb128 no IV                │ Secret    │
@ -294,7 +293,7 @@ Result:
 └─────────────────────────────────────┴───────────┘
 ```
-Notice how only portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
+Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
 ## aes_decrypt_mysql {#aes_decrypt_mysql}
@ -331,6 +330,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
 **Examples**
 Let's decrypt data we've previously encrypted with MySQL:
 ``` sql
 mysql> SET  block_encryption_mode='aes-256-cfb128';
 Query OK, 0 rows affected (0.00 sec)
@ -345,11 +345,13 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv
 ```
 Query:
 ``` sql
 SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext
 ```
 Result:
 ``` text
 ┌─plaintext─┐
 │ Secret    │
--- a/docs/en/sql-reference/statements/select/all.md
+++ b/docs/en/sql-reference/statements/select/all.md
@ -4,10 +4,8 @@ toc_title: ALL
 # ALL Clause {#select-all}
-`SELECT ALL` is identical to `SELECT` without `DISTINCT`.
+If there are multiple matching rows in the table, then `ALL` returns all of them. `SELECT ALL` is identical to `SELECT` without `DISTINCT`. If both `ALL` and `DISTINCT` are specified, an exception will be thrown.
 - If `ALL` specified, ignore it.
 - If both `ALL` and `DISTINCT` specified, exception will be thrown.
 `ALL` can also be specified inside an aggregate function with the same effect (no-op), for instance:
@ -19,3 +17,5 @@ equals to
 ```sql
 SELECT sum(number) FROM numbers(10);
 ```
 [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/select/all) <!--hide-->
--- a/docs/ru/engines/table-engines/integrations/kafka.md
+++ b/docs/ru/engines/table-engines/integrations/kafka.md
@ -31,21 +31,26 @@ SETTINGS
    [kafka_schema = '',]
    [kafka_num_consumers = N,]
    [kafka_skip_broken_messages = N]
    [kafka_commit_every_batch = 0,]
    [kafka_thread_per_consumer = 0]
 ```
 Обязательные параметры:
-   `kafka_broker_list` – перечень брокеров, разделенный запятыми (`localhost:9092`).
+-   `kafka_broker_list` — перечень брокеров, разделенный запятыми (`localhost:9092`).
-   `kafka_topic_list` – перечень необходимых топиков Kafka.
+-   `kafka_topic_list` — перечень необходимых топиков Kafka.
-   `kafka_group_name` – группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы.
+-   `kafka_group_name` — группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы.
-   `kafka_format` – формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md).
+-   `kafka_format` — формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md).
 Опциональные параметры:
-   `kafka_row_delimiter` – символ-разделитель записей (строк), которым завершается сообщение.
+-   `kafka_row_delimiter` — символ-разделитель записей (строк), которым завершается сообщение.
-   `kafka_schema` – опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`.
+-   `kafka_schema` — опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`.
-   `kafka_num_consumers` – количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя.
+-   `kafka_num_consumers` — количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя.
-   `kafka_skip_broken_messages` – максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0.
+-   `kafka_max_block_size` — максимальный размер пачек (в сообщениях) для poll (по умолчанию `max_block_size`).
 -   `kafka_skip_broken_messages` — максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0.
 -   `kafka_commit_every_batch` — включает или отключает режим записи каждой принятой и обработанной пачки по отдельности вместо единой записи целого блока (по умолчанию `0`).
 -   `kafka_thread_per_consumer` — включает или отключает предоставление отдельного потока каждому потребителю (по умолчанию `0`). При включенном режиме каждый потребитель сбрасывает данные независимо и параллельно, при отключённом — строки с данными от нескольких потребителей собираются в один блок.
 Примеры
--- a/docs/ru/faq/general/columnar-database.md
+++ b/docs/ru/faq/general/columnar-database.md
@ -0,0 +1,25 @@
 ---
 title: Что такое столбцовая база данных?
 toc_hidden: true
 toc_priority: 101
 ---
 # Что такое столбцовая (колоночная) база данных? {#what-is-a-columnar-database}
 В столбцовой БД данные каждого столбца хранятся отдельно (независимо) от других столбцов. Такой принцип хранения позволяет при выполнении запроса считывать с диска данные только тех столбцов, которые непосредственно участвуют в этом запросе. Обратная сторона такого принципа хранения заключается в том, что выполнение операций над строками становится более затратным. ClickHouse — типичный пример столбцовой СУБД.
 Ключевые преимущества столбцовой СУБД: 
 - выполнение запросов над отдельными столбцами таблицы, а не над всей таблицей сразу;
 - агрегация запросов на больших объемах данных;
 - сжатие данных в столбцах.
 Ниже — иллюстрация того, как извлекаются данные для отчетов при использовании обычной строковой СУБД и столбцовой СУБД:
 **Стандартная строковая СУБД**
 ![Стандартная строковая СУБД](https://clickhouse.tech/docs/en/images/row-oriented.gif#)
 **Столбцовая СУБД**
 ![Столбцовая СУБД](https://clickhouse.tech/docs/en/images/column-oriented.gif#)
 Для аналитических приложений столбцовые СУБД предпочтительнее, так как в них можно хранить много столбцов в таблице просто на всякий случай, и это не будет сказываться на скорости чтения данных. Столбцовые СУБД предназначены для обработки и хранения больших данных. Они прекрасно масштабируются при помощи распределенных кластеров на относительно недорогих серверах — для увеличения производительности. В ClickHouse для этого используются [распределенные](../../engines/table-engines/special/distributed.md) и [реплицированные](../../engines/table-engines/mergetree-family/replication.md) таблицы.
--- a/docs/ru/faq/general/dbms-naming.md
+++ b/docs/ru/faq/general/dbms-naming.md
@ -0,0 +1,17 @@
 ---
 title: "Что означает название ClickHouse?"
 toc_hidden: true
 toc_priority: 10
 ---
 # Что означает название ClickHouse? {#what-does-clickhouse-mean}
 Это комбинация терминов **Click**stream и Data ware**House**. Название пришло из Яндекс.Метрики, для которой первоначально был разработан ClickHouse — там он использовался для хранения истории визитов пользователей на сайты и всех пользовательских действий — "кликов". Кстати, ClickHouse по-прежнему выполняет эту функцию. Узнать об этом больше можно на странице [истории ClickHouse](../../introduction/history.md).
 Поскольку название составное, использовать его нужно следующим образом:
 -   единственно правильный способ написания — Click**H**ouse — с заглавной буквой H;
 -   если нужно сокращенное название, используйте **CH**. Исторически сложилось, что в Китае также популярно сокращение CK — в основном, из-за того, что это название использовалось в одном из первых обсуждений ClickHouse на китайском языке.
 !!! info "Забавный факт"
    Спустя годы после того, как ClickHouse получил свое название, принцип комбинирования двух слов, каждое из которых имеет подходящий смысл, был признан лучшим способом назвать базу данных в [исследовании Andy Pavlo](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html), Associate Professor of Databases в Carnegie Mellon University. ClickHouse разделил награду "за лучшее название СУБД" с Postgres.
--- a/docs/ru/faq/general/index.md
+++ b/docs/ru/faq/general/index.md
@ -1,25 +1,24 @@
 ---
-title: General questions about ClickHouse
+title: Общие вопросы о ClickHouse
 toc_hidden_folder: true
 toc_priority: 1
 toc_title: Общие вопросы
 ---
-# Общие вопросы о ClickHouse {#obshchie-voprosy}
+# Общие вопросы о ClickHouse {#general-questions}
 Вопросы:
-   Что такое ClickHouse?
+-   [Что такое ClickHouse?](../../index.md#what-is-clickhouse)
-   Почему ClickHouse такой быстрый?
+-   [Почему ClickHouse такой быстрый?](why-clickhouse-is-so-fast.md)
-   Кто пользуется ClickHouse?
+-   [Кто пользуется ClickHouse?](who-is-using-clickhouse.md)
-   Что обозначает название “ClickHouse”?
+-   [Что обозначает название ClickHouse?](dbms-naming.md)
-   Что значит “Не тормозит”?
+-   [Как фраза “Не тормозит” осталась на всех футболках?](ne-tormozit.md)
-   Что такое OLAP?
+-   [Что такое OLAP?](olap.md)
-   Что такое колоночная база данных?
+-   [Что такое столбцовая база данных?](columnar-database.md)
 -   [Почему бы не использовать системы типа MapReduce?](mapreduce.md)
 !!! info "Если вы не нашли то, что искали:"
-    Загляните в другие категории F.A.Q. или поищите в других разделах документации, ориентируйтесь по оглавлению слева.
+    Загляните в другие категории F.A.Q. или поищите в остальных разделах документации, ориентируясь по оглавлению слева.
 {## [Original article](https://clickhouse.tech/docs/ru/faq/general/) ##}
 [Original article](https://clickhouse.tech/docs/ru/faq/general/) <!--hide-->
--- a/docs/ru/faq/general/mapreduce.md
+++ b/docs/ru/faq/general/mapreduce.md
@ -4,13 +4,10 @@ toc_hidden: true
 toc_priority: 110
 ---
-## Почему бы не использовать системы типа MapReduce? {#pochemu-by-ne-ispolzovat-sistemy-tipa-mapreduce}
+# Почему бы не использовать системы типа MapReduce? {#why-not-use-something-like-mapreduce}
-Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция reduce сделана на основе распределённой сортировки. Наиболее распространённым opensource решением данного класса является [Apache Hadoop](http://hadoop.apache.org). Яндекс использует собственное решение — YT. 
+Системами типа MapReduce будем называть системы распределённых вычислений, в которых операция свёртки реализована на основе распределённой сортировки. Наиболее распространённое решение с открытым кодом в данном классе — [Apache Hadoop](http://hadoop.apache.org). Яндекс пользуется собственным решением — YT. 
-Такие системы не подходят для онлайн запросов в силу слишком большой latency. То есть, не могут быть использованы в качестве бэкенда для веб-интерфейса.
+Такие системы не подходят для онлайн-запросов в силу слишком большой задержки. То есть не могут быть использованы в качестве бэкенда для веб-интерфейса. Также эти системы не подходят для обновления данных в реальном времени. Распределённая сортировка не является оптимальным способом выполнения операции свёртки в случае запросов, выполняющихся в режиме онлайн, потому что результат выполнения операции и все промежуточные результаты (если такие есть) помещаются в оперативную память на одном сервере. В таком случае оптимальным способом выполнения операции свёртки является хеш-таблица. Частым способом оптимизации "map-reduce" задач является предагрегация (частичная свёртка) с использованием хеш-таблицы в оперативной памяти. Пользователь делает эту оптимизацию в ручном режиме. Распределённая сортировка — основная причина тормозов при выполнении несложных задач типа "map-reduce".
 Такие системы не подходят для обновления данных в реальном времени.
 Распределённая сортировка не является оптимальным способом выполнения операции reduce, если результат выполнения операции и все промежуточные результаты, при их наличии, помещаются в оперативку на одном сервере, как обычно бывает в запросах, выполняющихся в режиме онлайн. В таком случае, оптимальным способом выполнения операции reduce является хэш-таблица. Частым способом оптимизации map-reduce задач является предагрегация (частичный reduce) с использованием хэш-таблицы в оперативной памяти. Эта оптимизация делается пользователем в ручном режиме.
 Распределённая сортировка является основной причиной тормозов при выполнении несложных map-reduce задач.
-Большинство реализаций MapReduce позволяют выполнять произвольный код на кластере. Но для OLAP задач лучше подходит декларативный язык запросов, который позволяет быстро проводить исследования. Для примера, для Hadoop существует Hive и Pig. Также смотрите Cloudera Impala, Shark (устаревший) для Spark, а также Spark SQL, Presto, Apache Drill. Впрочем, производительность при выполнении таких задач является сильно неоптимальной по сравнению со специализированными системами, а сравнительно высокая latency не позволяет использовать эти системы в качестве бэкенда для веб-интерфейса.
+Большинство реализаций MapReduce позволяют выполнять произвольный код на кластере. Но для OLAP-задач лучше подходит декларативный язык запросов, который позволяет быстро проводить исследования. Например, для Hadoop существуют Hive и Pig. Также посмотрите на Cloudera Impala, Shark (устаревший) для Spark, а также Spark SQL, Presto, Apache Drill. Впрочем, производительность при выполнении таких задач очень неоптимальная, если сравнивать со специализированными системами, а относительно высокая задержка не позволяет использовать эти системы в качестве бэкенда для веб-интерфейса.
--- a/docs/ru/faq/general/ne-tormozit.md
+++ b/docs/ru/faq/general/ne-tormozit.md
@ -0,0 +1,26 @@
 ---
 title: "Что значит “Не тормозит”?"
 toc_hidden: true
 toc_priority: 11
 ---
 # Что значит “Не тормозит”? {#what-does-ne-tormozit-mean}
 Обычно этот вопрос возникает, когда люди видят официальные футболки ClickHouse. На них большими буквами написано **“ClickHouse не тормозит”**.
 До того, как код ClickHouse стал открытым, его разрабатывали как собственную систему хранения данных в крупнейшей российской ИТ-компании [Яндекс](https://yandex.com/company/). Поэтому оригинальный слоган написан по-русски. После выхода версии с открытым исходным кодом мы впервые выпустили некоторое количество таких футболок для мероприятий в России, и просто оставили прежний слоган. 
 Когда мы решили отправить партию этих футболок на мероприятия вне России, мы пробовали подобрать подходящий английский слоган. К сожалению, мы так и не смогли придумать достаточно точный и выразительный перевод, ведь на русском этот слоган звучит очень ёмко и при этом довольно элегантно. К тому же существовало ограничение по количеству символов на футболках. В итоге мы решили оставить русский вариант даже для международных событий. И это стало прекрасным решением, потому что люди по всему миру приятно удивлялись, когда видели фразу, и интересовались, что же там написано.
 Итак, как же объяснить эту фразу на английском? Вот несколько вариантов: 
 -   Если переводить буквально, то получится что-то подобное: *“ClickHouse doesn’t press the brake pedal”*.
 -   Если же вы хотите максимально сохранить тот смысл, который вкладывает в эту фразу человек из ИТ-сферы, то будет примерно следующее: *“If your larger system lags, it’s not because it uses ClickHouse”*. 
 -   Более короткие, но не такие точные версии: *“ClickHouse is not slow”*, *“ClickHouse doesn’t lag”* или просто *“ClickHouse is fast”*.
 Если вы не видели наших футболок, посмотрите видео о ClickHouse. Например, вот это:
 ![iframe](https://www.youtube.com/embed/bSyQahMVZ7w)
 P.S. Эти футболки не продаются, а распространяются бесплатно на большинстве митапов [ClickHouse](https://clickhouse.tech/#meet), обычно в награду за самые интересные вопросы или другие виды активного участия.
--- a/docs/ru/faq/general/olap.md
+++ b/docs/ru/faq/general/olap.md
@ -0,0 +1,39 @@
 ---
 title: Что такое OLAP?
 toc_hidden: true
 toc_priority: 100
 ---
 # Что такое OLAP? {#what-is-olap}
 [OLAP](https://ru.wikipedia.org/wiki/OLAP) (OnLine Analytical Processing) переводится как аналитическая обработка данных в реальном времени. Это широкий термин, который можно рассмотреть с двух сторон: с технической и с точки зрения бизнеса. Для самого общего понимания можно просто прочитать его с конца:
 **Processing**
 :   Обрабатываются некие исходные данные…
 **Analytical**
 :   … чтобы получить какие-то аналитические отчеты или новые знания…
 **OnLine**
 :   … в реальном времени, практически без задержек на обработку.
 ## OLAP с точки зрения бизнеса {#olap-from-the-business-perspective}
 В последние годы бизнес-сообщество стало осознавать ценность данных. Компании, которые принимают решения вслепую, чаще всего отстают от конкурентов. Управление бизнесом на основе данных, которое применяется успешными компаниями, побуждает собирать все данные, которые могут быть полезны в будущем для принятия бизнес-решений, а также подбирать механизмы, чтобы своевременно эти данные анализировать. Именно для этого и нужны СУБД с OLAP. 
 С точки зрения бизнеса, OLAP позволяет компаниям постоянно планировать, анализировать и оценивать операционную деятельность, чтобы повышать её эффективность, уменьшать затраты и как следствие — увеличивать долю рынка. Это можно делать как в собственной системе, так и в облачной (SaaS), в веб или мобильных аналитических приложениях, CRM-системах и т.д. Технология OLAP используется во многих приложениях BI (Business Intelligence — бизнес-аналитика).
 ClickHouse — это СУБД с OLAP, которая часто используется для поддержки SaaS-решений для анализа данных в различных предметных областях. Но поскольку некоторые компании все еще не слишком охотно размещают свои данные в облаке (у сторонних провайдеров), ClickHouse может быть развернут и на собственных серверах заказчика.
 ## OLAP с технической точки зрения {#olap-from-the-technical-perspective}
 Все СУБД можно разделить на две группы: OLAP (**аналитическая** обработка в реальном времени) и OLTP (обработка **транзакций** в реальном времени). OLAP используются для построения отчетов на основе больших объемов накопленных исторических данных, но эти отчеты обновляются не слишком часто. OLTP обычно применяются для обработки непрерывных потоков операций (транзакций), каждая из которых изменяет состояние данных.
 На практике OLAP и OLTP — это не строго разделённые категории, а скорее спектр возможностей. Большинство СУБД специализируются на каком-то одном виде обработки данных, но имеют инструменты и для выполнения других операций, когда это необходимо. Из-за такой специализации часто приходится использовать несколько СУБД и интегрировать их между собой. Это вполне реальная и решаемая задача, но, как известно, чем больше систем, тем выше расходы на их содержание. Поэтому в последние годы становятся популярны гибридные СУБД — HTAP (**Hybrid Transactional/Analytical Processing**), которые одинаково эффективно выполняют оба вида операций по обработке данных.
 Даже если СУБД сначала развивались исключительно как OLAP или как OLTP, разработчики постепенно двигаются в сторону HTAP, чтобы сохранять конкурентоспособность. И ClickHouse не исключение. Изначально он создавался как [OLAP СУБД с максимальной производительностью](../../faq/general/why-clickhouse-is-so-fast.md), и на сегодняшний день в нем нет полноценной поддержки обработки транзакций, но уже реализованы некоторые возможности, такие как постоянная скорость чтения/записи данных и мутации при изменении/удалении данных.
 Принципиальное "разделение труда" между OLAP и OLTP СУБД сохраняется:
 -   Чтобы эффективно строить аналитические отчеты, нужно уметь обрабатывать колонки по отдельности, поэтому большинство OLAP СУБД — [столбцовые](../../faq/general/columnar-database.md).
 -   Хранение данных по столбцам снижает скорость выполнения операций над строками (таких как добавление или изменение данных) пропорционально числу столбцов, а это число может быть огромным для систем, ориентированных на сбор разнообразных детальных данных о событиях. Поэтому большинство OLTP систем используют строковые СУБД.
--- a/docs/ru/faq/general/who-is-using-clickhouse.md
+++ b/docs/ru/faq/general/who-is-using-clickhouse.md
@ -0,0 +1,19 @@
 ---
 title: Кто пользуется ClickHouse?
 toc_hidden: true
 toc_priority: 9
 ---
 # Кто пользуется ClickHouse? {#who-is-using-clickhouse}
 Так как ClickHouse является продуктом с открытым исходным кодом, на этот вопрос не так просто ответить. Вам не нужно никому сообщать о том, что вы начали пользоваться ClickHouse, достаточно взять исходный код или предкомпилированный установочный пакет. Не нужно подписывать контракт, а [лицензия Apache 2.0](https://github.com/ClickHouse/ClickHouse/blob/master/LICENSE) позволяет распространять ПО без ограничений.
 Кроме того, стек используемых технологий часто не раскрывается из-за NDA. Некоторые компании рассматривают технологии, которыми пользуются, как своё конкурентное преимущество, даже если это продукты с открытым исходным кодом. Такие компании не позволяют сотрудникам рассказывать о том, с каким ПО они работают, или требуют согласовывать это с PR-отделом.
 Итак, как же узнать, кто пользуется ClickHouse?
 Один из способов — **поспрашивать в своем окружении**. В разговорах люди более охотно делятся тем, какие технологии внедрены в их компаниях, какие задачи решаются с их помощью, могут назвать характеристики аппаратного обеспечения, объемы данных и т.д. Мы регулярно разговариваем с пользователями во время [митапов ClickHouse](https://www.youtube.com/channel/UChtmrD-dsdpspr42P_PyRAw/playlists) по всему миру и слышали о более чем 1000 компаний, которые пользуются ClickHouse. К сожалению, мы не можем раскрывать подробности, потому что по умолчанию считаем такие истории защищенными NDA, чтобы избежать любых возможных проблем. Вы можете прийти на любой из наших будущих митапов и самостоятельно поговорить с другими пользователями. Мы анонсируем события по разным каналам, например, вы можете подписаться на [наш Twitter](http://twitter.com/ClickHouseDB/).
 Второй способ узнать — посмотреть, что компании **говорят публично** о том, как именно они пользуются ClickHouse. Это более существенная информация, потому что ее можно найти в публикациях в блогах, видеозаписях разговоров, презентациях и т.д. Мы собираем ссылки на такие материалы на своей странице **[Пользователи ClickHouse](../../introduction/adopters.md)**. Будем рады, если вы поделитесь историей вашей компании или ссылками по теме (но всегда помните о том, что не стоит нарушать NDA).
 В числе пользователей есть множество очень крупных компаний, знакомых вам, таких как Bloomberg, Cisco, China Telecom, Tencent или Uber, но на самом деле это далеко не полный перечень. К примеру, если вы возьмете [список Forbes крупнейших ИТ-компаний в 2020](https://www.forbes.com/sites/hanktucker/2020/05/13/worlds-largest-technology-companies-2020-apple-stays-on-top-zoom-and-uber-debut/), то увидите, что более половины из этих компаний так или иначе пользуются ClickHouse. Также необходимо упомянуть Яндекс — компанию, которая открыла исходный код ClickHouse в 2016 году и является одной из самых крупных ИТ-компаний в Европе.
--- a/docs/ru/faq/general/why-clickhouse-is-so-fast.md
+++ b/docs/ru/faq/general/why-clickhouse-is-so-fast.md
@ -0,0 +1,63 @@
 ---
 title: Почему ClickHouse так быстро работает?
 toc_hidden: true
 toc_priority: 8
 ---
 #  Почему ClickHouse так быстро работает? {#why-clickhouse-is-so-fast}
 Производительность изначально заложена в архитектуре ClickHouse. Высокая скорость выполнения запросов была и остается самым важным критерием, который учитывается при разработке. Но мы обращаем внимание и на другие характеристики, такие как удобство использования, масштабируемость, безопасность. Всё это делает ClickHouse настоящей промышленной разработкой. 
 Сначала ClickHouse создавался как прототип, который должен был отлично справляться с одной единственной задачей — отбирать и агрегировать данные с максимальной скоростью. Это необходимо, чтобы создать обычный аналитический отчет, и именно это делает стандартный запрос [GROUP BY](../../sql-reference/statements/select/group-by.md). Для решения такой задачи команда разработки ClickHouse приняла несколько архитектурных решений: 
 Столбцовое хранение данных
 :   Исходные данные часто содержат сотни или даже тысячи столбцов, в то время как для конкретного отчета нужны только несколько из них. Система не должна читать ненужные столбцы, поскольку операции чтения данных с диска — самые дорогостоящие.
 Индексы
 :   ClickHouse хранит структуры данных в оперативной памяти, что позволяет считывать не только нужные столбцы, но и нужные диапазоны строк для этих столбцов.
 Сжатие данных
 :   Различные способы хранения смежных значений в столбце позволяют достигать более высокой степени сжатия данных (по сравнению с обычными строковыми СУБД), т.к. в смежных строках значения часто бывают одинаковыми или близкими. В дополнение к универсальному сжатию ClickHouse поддерживает [специализированные кодеки](../../sql-reference/statements/create/table.md#create-query-specialized-codecs), которые позволяют еще больше уменьшить объемы хранимых данных. 
 Векторные запросы
 :   ClickHouse не только хранит, но и обрабатывает данные в столбцах. Это приводит к лучшей утилизации кеша процессора и позволяет использовать инструкции [SIMD](https://en.wikipedia.org/wiki/SIMD).
 Масштабируемость
 :   ClickHouse может задействовать все доступные мощности процессоров и объемы дисков, чтобы выполнить даже одиночный запрос. Не только на отдельном сервере, но и в целом кластере.
 Похожие техники используют и многие другие СУБД. **Внимание к мельчайшим деталям** — вот что на самом деле выделяет ClickHouse. Большинство языков программирования поддерживают большинство распространенных алгоритмов и структур данных, но как правило, они бывают слишком универсальными, чтобы быть по-настоящему эффективными. Мы рассматриваем каждую задачу как тонкий инструмент со множеством настроек, вместо того чтобы просто взять какую-то случайную реализацию. Например, если вам нужна хеш-таблица, вот несколько ключевых вопросов, которые нужно продумать:
 -   Какую хеш-функцию выбрать?
 -   Каким способом разрешать коллизии: [открытая адресация](https://en.wikipedia.org/wiki/Open_addressing) или [метод цепочек](https://en.wikipedia.org/wiki/Hash_table#Separate_chaining)?
 -   Как хранить данные в памяти: ключи и значения в одном массиве или в отдельных? Будут ли там храниться маленькие или большие значения?
 -   Фактор заполнения: когда и как менять размер таблицы? Как перемещать значения при изменении размера?
 -   Будут ли значения удаляться и если да, то какой алгоритм сработает лучше?
 -   Понадобится ли быстрое зондирование с использованием битовых масок, встроенное хранение строковых ключей, поддержка неперемещаемых значений, предварительная выборка и пакетная обработка?
 Хеш-таблица — ключевая структура данных для реализации `GROUP BY`, и ClickHouse автоматически выбирает одну из [более 30 вариаций](https://github.com/ClickHouse/ClickHouse/blob/master/src/Interpreters/Aggregator.h) для каждого специфического запроса.
 Для алгоритмов сортировки, например, следует продумать следующие вопросы:
 -   Что будет сортироваться: массив чисел, кортежей, строк или структур?
 -   Доступны ли все данные в оперативной памяти?
 -   Нужна ли стабильная сортировка?
 -   Нужна ли полная сортировка? Может быть, будет достаточно частичной или выборочной сортировки?
 -   Как сравнивать данные?
 -   Не являются ли данные частично отсортированными?
 Алгоритмы, основанные на характеристиках рабочих данных, обычно дают лучшие результаты, чем их более универсальные аналоги. Если заранее неизвестно, с какими данными придется работать, ClickHouse будет в процессе выполнения пробовать различные реализации и в итоге выберет оптимальный вариант. Например, рекомендуем прочитать [статью о том, как в ClickHouse реализуется распаковка LZ4](https://habr.com/en/company/yandex/blog/457612/). 
 Ну и последнее, но тем не менее важное условие: команда ClickHouse постоянно отслеживает в интернете сообщения пользователей о найденных ими удачных реализациях, алгоритмах или структурах данных, анализирует и пробует новые идеи. Иногда в этом потоке сообщений попадаются действительно ценные предложения.
 !!! info "Советы о том, как создать собственную высокопроизводительную систему"
    -   При проектировании системы обращайте внимание на мельчайшие детали реализации.
    -   Учитывайте возможности аппаратного обеспечения.
    -   Выбирайте структуры и представления данных исходя из требований конкретной задачи.
    -   Для особых случаев разрабатывайте специализированные решения.
    -   Пробуйте новые алгоритмы, о которых вы вчера прочитали в интернете. Ищите возможности для совершенствования.
    -   Выбирайте алгоритмы динамически, в процессе выполнения, на основе статистики.
    -   Ориентируйтесь на показатели, собранные при работе с реальными данными.
    -   Проверяйте производительность в процессе CI.
    -   Измеряйте и анализируйте всё, что только возможно.
--- a/docs/ru/faq/index.md
+++ b/docs/ru/faq/index.md
@ -4,14 +4,42 @@ toc_hidden: true
 toc_priority: 76
 ---
-# Содержание F.A.Q. {#soderzhanie}
+# ClickHouse F.A.Q. {#clickhouse-f-a-q}
-В этом разделе документации собрали вопросы о ClickHouse, которые задают чаще всего.
+В этом разделе документации собраны ответы на вопросы о ClickHouse, которые задают чаще всего.
 Категории:
-   **[Общие вопросы](../faq/general/index.md)** 
+-   **[Общие вопросы](general/index.md)** 
-   **[Применение](../faq/use-cases/index.md)**
+    -   [Что такое ClickHouse?](../index.md#what-is-clickhouse)
-   **[Операции](../faq/operations/index.md)**  
+    -   [Почему ClickHouse такой быстрый?](general/why-clickhouse-is-so-fast.md)
-   **[Интеграция](../faq/integration/index.md)**
+    -   [Кто пользуется ClickHouse?](general/who-is-using-clickhouse.md)
    -   [Что обозначает название ClickHouse?](general/dbms-naming.md)
    -   [Как фраза “Не тормозит” осталась на всех футболках?](general/ne-tormozit.md)
    -   [Что такое OLAP?](general/olap.md)
    -   [Что такое столбцовая база данных?](general/columnar-database.md)
    -   [Почему бы не использовать системы типа MapReduce?](general/mapreduce.md)
 -   **[Применение](use-cases/index.md)**
    -   [Можно ли использовать ClickHouse как БД временных рядов?](use-cases/time-series.md)
    -   [Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"?](use-cases/key-value.md)
 -   **[Операции](operations/index.md)**  
    -   [Какую версию ClickHouse использовать?](operations/production.md)
    -   [Возможно ли удалить старые записи из таблицы ClickHouse?](operations/delete-old-data.md)
 -   **[Интеграция](integration/index.md)**
    -   [Как экспортировать данные из ClickHouse в файл?](integration/file-export.md)
    -   [Как импортировать JSON в ClickHouse?](integration/json-import.md)
    -   [Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC?](integration/oracle-odbc.md)
 {## TODO
 Question candidates:
 - How to choose a primary key?
 - How to add a column in ClickHouse?
 - Too many parts
 - How to filter ClickHouse table by an array column contents?
 - How to insert all rows from one table to another of identical structure?
 - How to kill a process (query) in ClickHouse?
 - How to implement pivot (like in pandas)?
 - How to remove the default ClickHouse user through users.d?
 - Importing MySQL dump to Clickhouse
 - Window function workarounds (row\_number, lag/lead, running diff/sum/average)
 ##}
--- a/docs/ru/faq/integration/file-export.md
+++ b/docs/ru/faq/integration/file-export.md
@ -1,27 +1,27 @@
 ---
-title: How do I export data from ClickHouse to a file?
+title: Как экспортировать данные из ClickHouse в файл?
 toc_hidden: true
 toc_priority: 10
 ---
-## Как экспортировать данные из ClickHouse в файл? {#how-to-export-to-file-rus}
+# Как экспортировать данные из ClickHouse в файл? {#how-to-export-to-file-rus}
-### Секция INTO OUTFILE {#sektsiia-into-outfile-rus}
+## Секция INTO OUTFILE {#using-into-outfile-clause}
-Добавьте секцию [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause) к своему запросу.
+Добавьте к своему запросу секцию [INTO OUTFILE](../../sql-reference/statements/select/into-outfile.md#into-outfile-clause).
 Например:
 ``` sql
-SELECT * FROM table INTO OUTFILE 'file'
+SELECT * FROM table INTO OUTFILE 'file';
 ```
-По умолчанию, для выдачи данных ClickHouse использует формат [TabSeparated](../../interfaces/formats.md#tabseparated). Чтобы выбрать [формат данных](../../interfaces/formats.md), используйте секцию [FORMAT](../../sql-reference/statements/select/format.md#format-clause).
+По умолчанию при выдаче данных ClickHouse использует формат [TabSeparated](../../interfaces/formats.md#tabseparated). Чтобы выбрать другой [формат данных](../../interfaces/formats.md), используйте секцию [FORMAT](../../sql-reference/statements/select/format.md#format-clause).
 Например:
 ``` sql
-SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV
+SELECT * FROM table INTO OUTFILE 'file' FORMAT CSV;
 ```
 ## Таблица с движком File {#using-a-file-engine-table}
--- a/docs/ru/faq/integration/index.md
+++ b/docs/ru/faq/integration/index.md
@ -1,19 +1,19 @@
 ---
-title: Questions about integrating ClickHouse and other systems
+title: Интеграция ClickHouse с другими системами
 toc_hidden_folder: true
 toc_priority: 4
 toc_title: Интеграция
 ---
-# Вопросы об интеграции ClickHouse с другими системами {#question-about-integrating-clickhouse-and-other-systems-rus}
+# Интеграция ClickHouse с другими системами {#question-about-integrating-clickhouse-and-other-systems-rus}
 Вопросы:
 -   [Как экспортировать данные из ClickHouse в файл?](file-export.md)
-   Как импортировать JSON в ClickHouse?
+-   [Как импортировать JSON в ClickHouse?](json-import.md)
 -   [Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC?](oracle-odbc.md)
 !!! info "Если вы не нашли то, что искали"
    Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируясь по оглавлению слева.
-{## [Original article](https://clickhouse.tech/docs/ru/faq/integration/) ##}
+[Original article](https://clickhouse.tech/docs/ru/faq/integration/)
--- a/docs/ru/faq/integration/json-import.md
+++ b/docs/ru/faq/integration/json-import.md
@ -0,0 +1,33 @@
 ---
 title: Как импортировать JSON в ClickHouse?
 toc_hidden: true
 toc_priority: 11
 ---
 # Как импортировать JSON в ClickHouse? {#how-to-import-json-into-clickhouse}
 ClickHouse поддерживает широкий спектр [входных и выходных форматов данных](../../interfaces/formats.md). Среди них есть множество вариаций JSON, но чаще всего для импорта данных используют [JSONEachRow](../../interfaces/formats.md#jsoneachrow): один JSON-объект в строке, каждый объект с новой строки.
 ## Примеры {#examples}
 С помощью [HTTP-интерфейса](../../interfaces/http.md):
 ``` bash
 $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test%20FORMAT%20JSONEachRow' --data-binary @-
 ```
 При помощи [интерфейса CLI](../../interfaces/cli.md):
 ``` bash
 $ echo '{"foo":"bar"}'  | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow"
 ```
 Чтобы не вставлять данные вручную, используйте одну из [готовых библиотек](../../interfaces/index.md).
 ## Полезные настройки {#useful-settings}
 -   `input_format_skip_unknown_fields` позволяет импортировать JSON, даже если он содержит дополнительные поля, которых нет в таблице (отбрасывая лишние поля).
 -   `input_format_import_nested_json` позволяет импортировать вложенные JSON-объекты в столбцы типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md).
 !!! note "Примечание"
    В HTTP-интерфейсе настройки передаются через параметры `GET` запроса, в интерфейсе `CLI` — как дополнительные аргументы командной строки, начинающиеся с `--`.
--- a/docs/ru/faq/integration/oracle-odbc.md
+++ b/docs/ru/faq/integration/oracle-odbc.md
@ -1,10 +1,10 @@
 ---
-title: What if I have a problem with encodings when using Oracle via ODBC?
+title: Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC?
 toc_hidden: true
 toc_priority: 20
 ---
-## Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? {#oracle-odbc-encodings-rus}
+# Что делать, если у меня проблема с кодировками при использовании Oracle через ODBC? {#oracle-odbc-encodings}
 Если вы используете Oracle через драйвер ODBC в качестве источника внешних словарей, необходимо задать правильное значение для переменной окружения `NLS_LANG` в `/etc/default/clickhouse`. Подробнее читайте в [Oracle NLS_LANG FAQ](https://www.oracle.com/technetwork/products/globalization/nls-lang-099431.html).
--- a/docs/ru/faq/operations/delete-old-data.md
+++ b/docs/ru/faq/operations/delete-old-data.md
@ -0,0 +1,42 @@
 ---
 title: Возможно ли удалить старые записи из таблицы ClickHouse?
 toc_hidden: true
 toc_priority: 20
 ---
 # Возможно ли удалить старые записи из таблицы ClickHouse? {#is-it-possible-to-delete-old-records-from-a-clickhouse-table}
 Если отвечать коротко, то да. В ClickHouse есть множество механизмов, которые позволяют освобождать место на диске, удаляя старые данные. Каждый механизм подходит для разных сценариев.
 ## TTL {#ttl}
 ClickHouse позволяет автоматически удалять данные при выполнении некоторых условий. Эти условия задаются как выражение, вычисляемое на основе значений любых столбцов, обычно это просто разница между текущим моментом времени и значением какого-то столбца, содержащего дату и время.
 Ключевое преимущество такого подхода в том, что не нужно использовать внешнюю систему, чтобы запустить процесс — когда заданы условия TTL, удаление данных выполняется автоматически в фоновом режиме.
 !!! note "Примечание"
    TTL можно использовать не только для перемещения в [/dev/null](https://en.wikipedia.org/wiki/Null_device), но еще и между дисками, например, с SSD на HDD.
 [Подробнее о конфигурировании TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl).
 ## ALTER DELETE {#alter-delete}
 ClickHouse не удаляет данные в реальном времени, как СУБД [OLTP](https://en.wikipedia.org/wiki/Online_transaction_processing). Больше всего на такое удаление похожи мутации. Они выполняются с помощью запросов `ALTER ... DELETE` или `ALTER ... UPDATE`. В отличие от обычных запросов `DELETE` и `UPDATE`, мутации выполняются асинхронно, в пакетном режиме, не в реальном времени. В остальном после слов `ALTER TABLE` синтаксис обычных запросов и мутаций одинаковый.
 `ALTER DELETE` можно использовать для гибкого удаления устаревших данных. Если вам нужно делать это регулярно, единственный недостаток такого подхода будет заключаться в том, что потребуется внешняя система для запуска запроса. Кроме того, могут возникнуть некоторые проблемы с производительностью, поскольку мутации перезаписывают целые куски данных, если в них содержится хотя бы одна строка, которую нужно удалить.
 Это самый распространенный подход к тому, чтобы обеспечить соблюдение принципов [GDPR](https://gdpr-info.eu) в вашей системе на ClickHouse.
 Подробнее смотрите в разделе [Мутации](../../sql-reference/statements/alter/index.md#alter-mutations).
 ## DROP PARTITION {#drop-partition}
 Запрос `ALTER TABLE ... DROP PARTITION` позволяет эффективно удалять целые партиции. Этот способ не такой гибкий, важно правильно сконфигурировать партиции при создании таблицы, но он подходит для достаточно широкого спектра типовых задач. Как и для мутаций, для регулярного запуска таких запросов нужна внешняя система.
 Подробнее смотрите в разделе [Манипулирование с партициями и кусками](../../sql-reference/statements/alter/partition.md#alter_drop-partition).
 ## TRUNCATE {#truncate}
 Это достаточно радикальный способ, он удаляет все данные в таблице, но хорошо подходит для отдельных случаев.
 Подробнее смотрите в разделе [TRUNCATE](../../sql-reference/statements/truncate.md).
--- a/docs/ru/faq/operations/index.md
+++ b/docs/ru/faq/operations/index.md
@ -1,18 +1,18 @@
 ---
-title: Question about operating ClickHouse servers and clusters
+title: Вопросы о производительности серверов и кластеров ClickHouse
 toc_hidden_folder: true
 toc_priority: 3
 toc_title: Операции
 ---
-# Вопросы о производительности серверов и кластеров ClickHouse {#voprosy-ob-operating-clickhouse-servers-and-clusters}
+# Вопросы о производительности серверов и кластеров ClickHouse {#question-about-operating-clickhouse-servers-and-clusters}
 Вопросы:
-   Which ClickHouse version to use in production?
+-   [Какую версию ClickHouse использовать?](production.md)
-   Is it possible to delete old records from a ClickHouse table?
+-   [Возможно ли удалить старые записи из таблицы ClickHouse?](delete-old-data.md)
-    !!! info "Don’t see what you were looking for?"
+!!! info "Если вы не нашли то, что искали"
-        Check out [other F.A.Q. categories](../../faq/index.md) or browse around main documentation articles found in the left sidebar.
+    Загляните в другие подразделы F.A.Q. или поищите в остальных разделах документации, ориентируясь по оглавлению слева.
-    {## [Original article](https://clickhouse.tech/docs/en/faq/production/) ##}
+[Original article](https://clickhouse.tech/docs/ru/faq/operations/)
--- a/docs/ru/faq/operations/production.md
+++ b/docs/ru/faq/operations/production.md
@ -0,0 +1,70 @@
 ---
 title: Какую версию ClickHouse использовать?
 toc_hidden: true
 toc_priority: 10
 ---
 # Какую версию ClickHouse использовать? {#which-clickhouse-version-to-use-in-production}
 Во-первых, давайте обсудим, почему возникает этот вопрос. Есть две основные причины:
 1.  ClickHouse развивается достаточно быстро, и обычно мы выпускаем более 10 стабильных релизов в год. Так что есть из чего выбрать, а это не всегда просто. 
 2.  Некоторые пользователи не хотят тратить время на анализ того, какая версия лучше подходит для их задач, и просто хотят получить совет от эксперта.
 Вторая причина более весомая, так что начнем с нее, а затем рассмотрим, какие бывают релизы ClickHouse.
 ## Какую версию ClickHouse вы посоветуете? {#which-clickhouse-version-do-you-recommend}
 Казалось бы, самый удобный вариант — нанять консультанта или довериться эксперту, и делегировать ему ответственность за вашу систему. Вы устанавливаете ту версию ClickHouse, которую вам рекомендовали, и теперь если что-то пойдет не так — это уже не ваша вина. На самом деле это не так. Никто не может знать лучше вас, что происходит в вашей системе. 
 Как же правильно выбрать версию ClickHouse, на которую стоит обновиться? Или как выбрать версию, с которой следует начать, если вы только внедряете ClickHouse? Во-первых, мы рекомендуем позаботиться о создании **реалистичной тестовой среды** (pre-production). В идеальном мире это была бы полная копия рабочей среды, но чаще всего такое решение оказывается слишком дорогостоящим.
 Чтобы тестовая среда была достаточно надежной, но не слишком дорогостоящей, учитывайте следующие моменты:
 -   В тестовой среде нужно выполнять набор запросов, максимально близкий к тому, который будет выполняться в реальной среде:
    -   Не используйте тестовую среду в режиме "только для чтения", работая с каким-то статичным набором данных.
    -   Не используйте её в режиме "только для записи", проверяя лишь копирование данных, без построения типовых отчетов.
    -   Не очищайте её, удаляя все данные подчистую вместо тестирования рабочих схем миграции.
 -   Выполняйте реальные запросы на выборке из реальных рабочих данных. Постарайтесь подготовить репрезентативную выборку, на которой запрос `SELECT` будет возвращать адекватные результаты. Если регламенты безопасности не позволяют использовать реальные данные за пределами защищенной рабочей среды, используйте обфускацию. 
 -   Убедитесь, что тестовая среда находится под контролем тех же систем мониторинга и оповещения, что и рабочая. 
 -   Если ваша рабочая среда распределена между разными дата-центрами и регионами, тестовая среда должна быть такой же.
 -   Если в рабочей среде используются сложные инструменты типа репликации, распределённых таблиц или каскадных материализованных представлений, тестовая среда должна быть сконфигурирована так же.
 -   Обычно в тестовой среде стараются использовать то же количество серверов и виртуальных машин, что и в рабочей, но делают их меньшего объема. Либо наоборот, используют существенно меньшее число серверов и ВМ, но тех же объемов. Первый вариант скорее позволит обнаружить проблемы, связанные с работой сети, а второй вариант более прост в управлении.
 Второе направление — **автоматизированное тестирование**. Не думайте, что если какой-то запрос отработал успешно один раз, так будет всегда. Считается приемлемым выполнять некоторые юнит-тесты, используя "заглушки" вместо запросов к СУБД. Но вы должны проводить достаточное количество автотестов, где запросы выполняются в реальном ClickHouse, чтобы убедиться, что все важные задачи отрабатывают должным образом.
 В продолжение этой темы, вы можете поделиться вашими автотестами и передать их [в открытую тестовую среду ClickHouse](https://github.com/ClickHouse/ClickHouse/tree/master/tests), которая используется для постоянного развития нашей СУБД. Вам придётся потратить немного времени и сил, чтобы научиться [составлять и выполнять тесты](../../development/tests.md), а также чтобы перенести ваши тесты на эту платформу. Наградой за это станет уверенность в том, что новые стабильные релизы ClickHouse будут корректно работать на ваших задачах. Это гораздо лучше, чем тратить время на то, чтобы вновь отлавливать прежние ошибки в новых версиях, а затем ждать, пока их исправят и включат эти исправления в очередной релиз. Некоторые компании уже включили в корпоративные регламенты необходимость передачи своих тестов в ClickHouse, прежде всего стоит упомянуть [правило Beyonce](https://www.oreilly.com/library/view/software-engineering-at/9781492082781/ch01.html#policies_that_scale_well), действующее в Google.
 После того, как вы подготовили тестовую среду и инфраструктуру, выбор версии ClickHouse упрощается:
 1.  Проверяйте новые релизы ClickHouse с помощью подготовленных автотестов. Вы можете проверять не только стабильные релизы, но и тестовые, хотя работать с такими релизами не рекомендуется.
 2.  Если новый релиз ClickHouse успешно прошел ваши автотесты, внедряйте его в тестовой среде и проверяйте работоспособность всех ваших задач.
 3.  Сообщайте обо всех обнаруженных проблемах в [ClickHouse GitHub Issues](https://github.com/ClickHouse/ClickHouse/issues).
 4.  Если никаких серьезных проблем не было выявлено, можно установить новый релиз ClickHouse в рабочую среду. Чтобы еще больше снизить риски, вы можете внедрить специальные техники поэтапного перехода на новые релизы, такие как [canary releases](https://martinfowler.com/bliki/CanaryRelease.html) или [green-blue deployments](https://martinfowler.com/bliki/BlueGreenDeployment.html).
 Как вы уже поняли, ClickHouse не требует какого-то особенного подхода — описанные выше правила широко используются для любых элементов инфраструктуры, если нужно обеспечить ее надежность и если компании серьезно подходят к вопросам стабильности своих систем.
 ## Какой вид релиза ClickHouse выбрать? {#how-to-choose-between-clickhouse-releases}
 Если вы заглянете в раздел, где публикуются установочные пакеты ClickHouse, вы увидите там следующие виды пакетов:
 1.  `testing`
 2.  `prestable`
 3.  `stable`
 4.  `lts` (long-term support)
 Как уже упоминалось выше, тестовые релизы (`testing`) стоит использовать для раннего обнаружения ошибок, в рабочей среде мы не рекомендуем использовать такие релизы, поскольку они еще не протестированы так же тщательно, как остальные.
 Подготовительные (`prestable`) — это релизы-кандидаты, которые с большой вероятностью скоро будут доведены до стабильного состояния. Вы можете использовать их в тестовой среде и сообщать нам об обнаруженных ошибках.
 В рабочей среде мы рекомендуем использовать либо стабильный релиз (`stable`), либо релиз с долговременной поддержкой (`lts`). Если вы выбираете между этими двумя видами релизов, примите во внимание следующее:
 -   По умолчанию мы рекомендуем релизы `stable`. Новый стабильный релиз выпускается примерно раз в месяц, что открывает доступ к новым функциям. Три последних стабильных релиза находятся на поддержке — это означает, что в них интегрируются исправленные ошибки и доработки.
 -   Релизы `lts` выпускаются дважды в год и находятся на поддержке в течение года с момента выхода. Они более предпочтительны в следующих случаях:
    -   ваши корпоративные регламенты запрещают частые обновления или использование любых релизов, кроме LTS;
    -   вы используете ClickHouse в продуктах, которые не задействуют сложные инструменты ClickHouse, или у вас не хватает ресурсов для частого их обновления.
 Часто компании, которые изначально ориентировались на релизы `lts`, позднее переходят на `stable`, поскольку хотят быстрее получать доступ к новым возможностям.
 !!! warning "Важно"
    Мы всегда стремимся поддерживать совместимость релизов, но иногда это правило нарушается, и какие-то отдельные возможности в новых релизах становятся недоступны. Перед обновлением ClickHouse обязательно изучите [журнал изменений](../../whats-new/changelog/index.md), чтобы убедиться, что в нем нет объявлений о нарушении обратной совместимости.
--- a/docs/ru/faq/use-cases/index.md
+++ b/docs/ru/faq/use-cases/index.md
@ -1,14 +1,13 @@
 ---
-title: Questions about ClickHouse use cases
+title: Вопросы о применении ClickHouse
 toc_hidden_folder: true
 toc_priority: 2
 toc_title: Применение
 ---
-# Вопросы о применении ClickHouse {#voprosy-o-primenenii}
+# Вопросы о применении ClickHouse {#questions-about-clickhouse-use-cases}
 Вопросы:
-   Can I use ClickHouse as a time-series database?
+-   [Можно ли использовать ClickHouse как БД временных рядов?](time-series.md)
-   Can I use ClickHouse as a key-value storage?
+-   [Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"?](key-value.md)
--- a/docs/ru/faq/use-cases/key-value.md
+++ b/docs/ru/faq/use-cases/key-value.md
@ -0,0 +1,19 @@
 ---
 title: Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"?
 toc_hidden: true
 toc_priority: 101
 ---
 # Можно ли использовать ClickHouse для хранения данных вида "ключ-значение"? {#can-i-use-clickhouse-as-a-key-value-storage}
 Если отвечать коротко, то **"нет"**. Операции над данными вида "ключ-значение" занимают одну из верхних позиций в списке ситуаций, когда категорически **не стоит**{.text-danger} использовать ClickHouse. Это [OLAP](../../faq/general/olap.md) СУБД, в то время как есть много специализированных СУБД для данных вида "ключ-значение".
 Тем не менее, в некоторых ситуациях имеет смысл использовать ClickHouse для запросов над данными вида "ключ-значение". Чаще всего это относится к системам с относительно невысокой нагрузкой, в которых основной объем операций относится к аналитической обработке данных и отлично подходит для ClickHouse. Однако в них есть некий второстепенный процесс, в котором нужно обрабатывать данные вида "ключ-значение", при этом процесс не требует слишком высокой производительности и не имеет строгих ограничений по задержкам выполнения запросов. Если у вас нет ограничений по бюджету, вы можете использовать для таких операций вспомогательную базу данных "ключ-значение", но это увеличит расходы на обслуживание еще одной СУБД (мониторинг, бэкапы и т.д.). 
 Если вы все же решите не следовать рекомендациям и использовать ClickHouse для работы с данными вида "ключ-значение", вот несколько советов:
 -   Главная причина, по которой точечный запрос в ClickHouse становится ресурсозатратным — это разреженный индекс для первичного ключа в [таблице семейства MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). Этот индекс не может обращаться напрямую к каждой строке данных, вместо этого он обращается к каждой N-ой строке, а затем сканирует соседние строки вплоть до указанной, обрабатывая по пути лишние данные. При обработке данных вида "ключ-значение" может быть полезно уменьшить значение N при помощи настройки `index_granularity`.
 -   ClickHouse хранит столбцы в отдельных файлах, поэтому чтобы собрать одну полную строку, ему приходится обрабатывать все эти файлы. Их количество растет линейно в зависимости от количества столбцов, поэтому при обработке данных вида "ключ-значение" стоит избегать использования множества столбцов и поместить все нужные данные в один столбец с типом `String` в формате JSON, Protobuf или другом подходящем формате.
 -   Подумайте об использовании табличного движка [Join](../../engines/table-engines/special/join.md) вместо обычных таблиц `MergeTree` и функции [joinGet](../../sql-reference/functions/other-functions.md#joinget) для получения данных. В этом случае производительность выполнения запросов может быть выше, но могут появиться проблемы с надежностью и удобством. Пример такого использования описан [здесь](https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/00800_versatile_storage_join.sql#L49-L51).
--- a/docs/ru/faq/use-cases/time-series.md
+++ b/docs/ru/faq/use-cases/time-series.md
@ -0,0 +1,15 @@
 ---
 title: Можно ли использовать ClickHouse как базу данных временных рядов?
 toc_hidden: true
 toc_priority: 101
 ---
 # Можно ли использовать ClickHouse как базу данных временных рядов? {#can-i-use-clickhouse-as-a-time-series-database}
 ClickHouse — это универсальное решение для [OLAP](../../faq/general/olap.md) операций, в то время как существует много специализированных СУБД временных рядов. Однако [высокая скорость выполнения запросов](../../faq/general/why-clickhouse-is-so-fast.md) позволяет ClickHouse во многих случаях "побеждать" специализированные аналоги. В подтверждение этому есть много [примеров](https://medium.com/@AltinityDB/clickhouse-for-time-series-scalability-benchmarks-e181132a895b) с конкретными показателями производительности, так что мы не будем останавливаться на этом подробно. Лучше рассмотрим те возможности ClickHouse, которые стоит использовать.
 Во-первых, есть **[специальные кодеки](../../sql-reference/statements/create/table.md#create-query-specialized-codecs)**, которые эффективно сжимают типичные временные ряды. Это могут быть либо стандартные алгоритмы, такие как `DoubleDelta` или `Gorilla`, либо специфические для ClickHouse, например `T64`.
 Во-вторых, запросы по временным рядам часто затрагивают только недавние данные, не старше одного дня или недели. Имеет смысл использовать серверы, где есть как быстрые диски nVME/SSD, так и более медленные, но ёмкие HDD диски. С помощью [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) можно сконфигурировать таблицы так, чтобы свежие данные хранились на быстрых дисках, а по мере устаревания перемещались на медленные диски. Для архивных данных можно также настроить сворачивание или даже удаление, если это необходимо.
 Несмотря на то, что это противоречит философии ClickHouse, которая предполагает хранение и обработку "сырых" данных, вы можете использовать [материализованные представления](../../sql-reference/statements/create/view.md), если нужно соответствовать очень жестким требованиям по скорости обработки данных.
--- a/docs/ru/index.md
+++ b/docs/ru/index.md
@ -3,7 +3,7 @@ toc_priority: 0
 toc_title: "\u041E\u0431\u0437\u043E\u0440"
 ---
-# Что такое ClickHouse {#chto-takoe-clickhouse}
+# Что такое ClickHouse {#what-is-clickhouse}
 ClickHouse - столбцовая система управления базами данных (СУБД) для онлайн обработки аналитических запросов (OLAP).
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -1937,6 +1937,21 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1;
 Значение по умолчанию: 16.
 ## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size}
 Задает количество потоков для фонового потокового вывода сообщений. Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе.
 Допустимые значения:
 -   Положительное целое число.
 Значение по умолчанию: 16.
 **Смотрите также**
 -   Движок [Kafka](../../engines/table-engines/integrations/kafka.md#kafka).
 -   Движок [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine).
 ## format_avro_schema_registry_url {#format_avro_schema_registry_url}
 Задает URL реестра схем [Confluent](https://docs.confluent.io/current/schema-registry/index.html) для использования с форматом [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent).
@ -2537,4 +2552,15 @@ SELECT * FROM test2;
 Обратите внимание на то, что эта настройка влияет на поведение [материализованных представлений](../../sql-reference/statements/create/view.md#materialized) и БД [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md).
 ## allow_experimental_geo_types {#allow-experimental-geo-types}
 Разрешает использование экспериментальных типов данных для работы с [географическими структурами](../../sql-reference/data-types/geo.md).
 Возможные значения:
 -   0 — Использование типов данных для работы с географическими структурами не поддерживается.
 -   1 — Использование типов данных для работы с географическими структурами поддерживается.
 Значение по умолчанию: `0`.
 [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/settings/settings/) <!--hide-->
--- a/docs/ru/operations/system-tables/index.md
+++ b/docs/ru/operations/system-tables/index.md
@ -9,25 +9,54 @@ toc_title: "\u0421\u0438\u0441\u0442\u0435\u043c\u043d\u044b\u0435\u0020\u0442\u
 Системные таблицы содержат информацию о:
-   Состоянии сервера, процессов и окружении.
+-   состоянии сервера, процессов и окружении.
-   Внутренних процессах сервера.
+-   внутренних процессах сервера.
 Системные таблицы:
-   Находятся в базе данных `system`.
+-   находятся в базе данных `system`.
-   Доступны только для чтения данных.
+-   доступны только для чтения данных.
-   Не могут быть удалены или изменены, но их можно отсоединить.
+-   не могут быть удалены или изменены, но их можно отсоединить.
-Системные таблицы `metric_log`, `query_log`, `query_thread_log`, `trace_log` системные таблицы хранят данные в файловой системе. Остальные системные таблицы хранят свои данные в оперативной памяти. Сервер ClickHouse создает такие системные таблицы при запуске.
+Большинство системных таблиц хранят свои данные в оперативной памяти. Сервер ClickHouse создает эти системные таблицы при старте.
 В отличие от других системных таблиц, таблицы с системными логами [metric_log](../../operations/system-tables/metric_log.md), [query_log](../../operations/system-tables/query_log.md), [query_thread_log](../../operations/system-tables/query_thread_log.md), [trace_log](../../operations/system-tables/trace_log.md), [part_log](../../operations/system-tables/part_log.md), [crash_log](../../operations/system-tables/crash-log.md) и [text_log](../../operations/system-tables/text_log.md) используют движок таблиц [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) и по умолчанию хранят свои данные в файловой системе. Если удалить таблицу из файловой системы, сервер ClickHouse снова создаст пустую таблицу во время следующей записи данных. Если схема системной таблицы изменилась в новом релизе, то ClickHouse переименует текущую таблицу и создаст новую.
 Таблицы с системными логами `log` можно настроить, создав конфигурационный файл с тем же именем, что и таблица в разделе `/etc/clickhouse-server/config.d/`, или указав соответствующие элементы в `/etc/clickhouse-server/config.xml`. Настраиваться могут следующие элементы:
 -   `database` — база данных, к которой принадлежит системная таблица. Эта опция на текущий момент устарела. Все системные таблицы находятся в базе данных `system`.
 -   `table` — таблица для добавления данных.
 -   `partition_by` — [ключ партиционирования](../../engines/table-engines/mergetree-family/custom-partitioning-key.md).
 -   `ttl` — [время жизни](../../sql-reference/statements/alter/ttl.md) таблицы.
 -   `flush_interval_milliseconds` — интервал сброса данных на диск, в миллисекундах.
 -   `engine` — полное имя движка (начиная с `ENGINE =` ) с параметрами. Эта опция противоречит `partition_by` и `ttl`. Если указать оба параметра вместе, сервер вернет ошибку и завершит работу.
 Пример:
 ```xml
 <yandex>
    <query_log>
        <database>system</database>
        <table>query_log</table>
        <partition_by>toYYYYMM(event_date)</partition_by>
        <ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
        <!--
        <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
        -->
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
    </query_log>
 </yandex>
 ```
 По умолчанию размер таблицы не ограничен. Управлять размером таблицы можно используя [TTL](../../sql-reference/statements/alter/ttl.md#manipuliatsii-s-ttl-tablitsy) для удаления устаревших записей журнала. Также вы можете использовать функцию партиционирования для таблиц `MergeTree`.
 ### Источники системных показателей 
 Для сбора системных показателей сервер ClickHouse использует:
-   Возможности `CAP_NET_ADMIN`.
+-   возможности `CAP_NET_ADMIN`.
 -   [procfs](https://ru.wikipedia.org/wiki/Procfs) (только Linux).
 **procfs**
 Если для сервера ClickHouse не включено `CAP_NET_ADMIN`, он пытается обратиться к `ProcfsMetricsProvider`. `ProcfsMetricsProvider` позволяет собирать системные показатели для каждого запроса (для CPU и I/O).
--- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md
+++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md
@ -239,7 +239,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
 **Параметры**
-   `window` — ширина скользящего окна по времени в секундах. [UInt](../../sql-reference/aggregate-functions/parametric-functions.md).
+-   `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Должно соблюдаться условие `timestamp события cond2 <= timestamp события cond1 + window`.
 -   `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений.
 -   `timestamp` — имя столбца, содержащего временные отметки. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`.
 -   `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md).
--- a/docs/ru/sql-reference/data-types/geo.md
+++ b/docs/ru/sql-reference/data-types/geo.md
@ -0,0 +1,106 @@
 ---
 toc_priority: 62
 toc_title: Географические структуры
 ---
 # Типы данных для работы с географическими структурами {#geo-data-types}
 ClickHouse поддерживает типы данных для отображения географических объектов — точек (местоположений), территорий и т.п.
 !!! warning "Предупреждение"
    Сейчас использование типов данных для работы с географическими структурами является экспериментальной возможностью. Чтобы использовать эти типы данных, включите настройку `allow_experimental_geo_types = 1`.
 **См. также**
 - [Хранение географических структур данных](https://ru.wikipedia.org/wiki/GeoJSON).
 - Настройка [allow_experimental_geo_types](../../operations/settings/settings.md#allow-experimental-geo-types).
 ## Point {#point-data-type}
 Тип `Point` (точка) определяется парой координат X и Y и хранится в виде кортежа [Tuple](tuple.md)([Float64](float.md), [Float64](float.md)).
 **Пример**
 Запрос:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_point (p Point) ENGINE = Memory();
 INSERT INTO geo_point VALUES((10, 10));
 SELECT p, toTypeName(p) FROM geo_point;
 ```
 Результат: 
 ``` text
 ┌─p───────┬─toTypeName(p)─┐
 │ (10,10) │ Point         │
 └─────────┴───────────────┘
 ```
 ## Ring {#ring-data-type}
 Тип `Ring` описывает простой многоугольник без внутренних областей (дыр) и хранится в виде массива точек: [Array](array.md)([Point](#point-data-type)).
 **Пример**
 Запрос:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_ring (r Ring) ENGINE = Memory();
 INSERT INTO geo_ring VALUES([(0, 0), (10, 0), (10, 10), (0, 10)]);
 SELECT r, toTypeName(r) FROM geo_ring;
 ```
 Результат: 
 ``` text
 ┌─r─────────────────────────────┬─toTypeName(r)─┐
 │ [(0,0),(10,0),(10,10),(0,10)] │ Ring          │
 └───────────────────────────────┴───────────────┘
 ```
 ## Polygon {#polygon-data-type}
 Тип `Polygon` описывает многоугольник с внутренними областями (дырами) и хранится в виде массива: [Array](array.md)([Ring](#ring-data-type)). Первый элемент массива описывает внешний многоугольник (контур), а остальные элементы описывают дыры.
 **Пример**
 Запись в этой таблице описывает многоугольник с одной дырой:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_polygon (pg Polygon) ENGINE = Memory();
 INSERT INTO geo_polygon VALUES([[(20, 20), (50, 20), (50, 50), (20, 50)], [(30, 30), (50, 50), (50, 30)]]);
 SELECT pg, toTypeName(pg) FROM geo_polygon;
 ```
 Результат: 
 ``` text
 ┌─pg────────────────────────────────────────────────────────────┬─toTypeName(pg)─┐
 │ [[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]] │ Polygon        │
 └───────────────────────────────────────────────────────────────┴────────────────┘
 ```
 ## MultiPolygon {#multipolygon-data-type}
 Тип `MultiPolygon` описывает элемент, состоящий из нескольких простых многоугольников (полигональную сетку). Он хранится в виде массива многоугольников: [Array](array.md)([Polygon](#polygon-data-type)). 
 **Пример**
 Запись в этой таблице описывает элемент, состоящий из двух многоугольников — первый без дыр, а второй с одной дырой:
 ```sql
 SET allow_experimental_geo_types = 1;
 CREATE TABLE geo_multipolygon (mpg MultiPolygon) ENGINE = Memory();
 INSERT INTO geo_multipolygon VALUES([[[(0, 0), (10, 0), (10, 10), (0, 10)]], [[(20, 20), (50, 20), (50, 50), (20, 50)],[(30, 30), (50, 50), (50, 30)]]]);
 SELECT mpg, toTypeName(mpg) FROM geo_multipolygon;
 ```
 Результат: 
 ``` text
 ┌─mpg─────────────────────────────────────────────────────────────────────────────────────────────┬─toTypeName(mpg)─┐
 │ [[[(0,0),(10,0),(10,10),(0,10)]],[[(20,20),(50,20),(50,50),(20,50)],[(30,30),(50,50),(50,30)]]] │ MultiPolygon    │
 └─────────────────────────────────────────────────────────────────────────────────────────────────┴─────────────────┘
 ```
 [Оригинальная статья](https://clickhouse.tech/docs/ru/data-types/geo/) <!--hide-->
--- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
+++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md
@ -572,7 +572,7 @@ SOURCE(CLICKHOUSE(
 или
 ``` sql
-SOURCE(MONGO(
+SOURCE(MONGODB(
    host 'localhost'
    port 27017
    user ''
--- a/docs/ru/sql-reference/functions/encryption-functions.md
+++ b/docs/ru/sql-reference/functions/encryption-functions.md
@ -11,7 +11,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448
 Длина инициализирующего вектора всегда 16 байт (лишние байты игнорируются). 
-Обратите внимание, что эти функции работают медленно.
+Обратите внимание, что до версии ClickHouse 21.1 эти функции работали медленно.
 ## encrypt {#encrypt}
@ -41,7 +41,7 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad])
 **Возвращаемое значение**
-   Зашифрованная строка. [String](../../sql-reference/data-types/string.md#string).
+-   Бинарная зашифрованная строка. [String](../../sql-reference/data-types/string.md#string).
 **Примеры**
@ -52,57 +52,38 @@ encrypt('mode', 'plaintext', 'key' [, iv, aad])
 ``` sql
 CREATE TABLE encryption_test
 (
-    input String,
+    `comment` String,
-    key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
+    `secret` String
-    iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
+)
-    key32 String DEFAULT substring(key, 1, 32),
+ENGINE = Memory;
    key24 String DEFAULT substring(key, 1, 24),
    key16 String DEFAULT substring(key, 1, 16)
 ) Engine = Memory;
 ```
-Вставим эти данные:
+Вставим некоторые данные (замечание: не храните ключи или инициализирующие векторы в базе данных, так как это компрометирует всю концепцию шифрования), также хранение "подсказок" небезопасно и используется только для наглядности: 
 Запрос:
 ``` sql
-INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
+INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\
 ('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
 ('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
 ('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212'));
 ```
 Пример без `iv`:
 Запрос:
 ``` sql
-SELECT 'aes-128-ecb' AS mode, hex(encrypt(mode, input, key16)) FROM encryption_test;
+SELECT comment, hex(secret) FROM encryption_test;
 ```
 Результат:
 ``` text
-┌─mode────────┬─hex(encrypt('aes-128-ecb', input, key16))────────────────────────┐
+┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
-│ aes-128-ecb │ 4603E6862B0D94BBEC68E0B0DF51D60F                                 │
+│ aes-256-cfb128 no IV                │ B4972BDC4459                     │
-│ aes-128-ecb │ 3004851B86D3F3950672DE7085D27C03                                 │
+│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9                     │
-│ aes-128-ecb │ E807F8C8D40A11F65076361AFC7D8B68D8658C5FAA6457985CAA380F16B3F7E4 │
+│ aes-256-cfb128 with IV              │ 5E6CB398F653                     │
-└─────────────┴──────────────────────────────────────────────────────────────────┘
+│ aes-256-cbc no IV                   │ 1BC0629A92450D9E73A00E7D02CF4142 │
-```
+└─────────────────────────────────────┴──────────────────────────────────┘
 Пример с `iv`:
 Запрос:
 ``` sql
 SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test;
 ```
 Результат:
 ``` text
 ┌─mode────────┬─hex(encrypt('aes-256-ctr', input, key32, iv))─┐
 │ aes-256-ctr │                                               │
 │ aes-256-ctr │ 7FB039F7                                      │
 │ aes-256-ctr │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2B325949        │
 └─────────────┴───────────────────────────────────────────────┘
 ```
 Пример в режиме `-gcm`:
@ -110,41 +91,27 @@ SELECT 'aes-256-ctr' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encrypti
 Запрос:
 ``` sql
-SELECT 'aes-256-gcm' AS mode, hex(encrypt(mode, input, key32, iv)) FROM encryption_test;
+INSERT INTO encryption_test VALUES('aes-256-gcm', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')), \
 ('aes-256-gcm with AAD', encrypt('aes-256-gcm', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv', 'aad'));
 SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%';
 ```
 Результат:
 ``` text
-┌─mode────────┬─hex(encrypt('aes-256-gcm', input, key32, iv))──────────────────────────┐
+┌─comment──────────────┬─hex(secret)──────────────────────────────────┐
-│ aes-256-gcm │ E99DBEBC01F021758352D7FBD9039EFA                                       │
+│ aes-256-gcm          │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
-│ aes-256-gcm │ 8742CE3A7B0595B281C712600D274CA881F47414                               │
+│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
-│ aes-256-gcm │ A44FD73ACEB1A64BDE2D03808A2576EDBB60764CC6982DB9AF2C33C893D91B00C60DC5 │
+└──────────────────────┴──────────────────────────────────────────────┘
 └─────────────┴────────────────────────────────────────────────────────────────────────┘
 ```
 Пример в режиме `-gcm` и с `aad`:
 Запрос:
 ``` sql
 SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM encryption_test;
 ```
 Результат:
 ``` text
 ┌─mode────────┬─hex(encrypt('aes-192-gcm', input, key24, iv, 'AAD'))───────────────────┐
 │ aes-192-gcm │ 04C13E4B1D62481ED22B3644595CB5DB                                       │
 │ aes-192-gcm │ 9A6CF0FD2B329B04EAD18301818F016DF8F77447                               │
 │ aes-192-gcm │ B961E9FD9B940EBAD7ADDA75C9F198A40797A5EA1722D542890CC976E21113BBB8A7AA │
 └─────────────┴────────────────────────────────────────────────────────────────────────┘
 ```
 ## aes_encrypt_mysql {#aes_encrypt_mysql}
 Совместима с шифрованием MySQL, результат может быть расшифрован функцией [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt).
 При одинаковых входящих значениях зашифрованный текст будет совпадать с результатом, возвращаемым функцией `encrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_encrypt_mysql` будет работать аналогично функции `aes_encrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`.
 Функция поддерживает шифрование данных следующими режимами:
 -   aes-128-ecb, aes-192-ecb, aes-256-ecb
@ -156,7 +123,7 @@ SELECT 'aes-192-gcm' AS mode, hex(encrypt(mode, input, key24, iv, 'AAD')) FROM e
 **Синтаксис**
-```sql
+``` sql
 aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
 ```
@ -164,78 +131,96 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
 -   `mode` — режим шифрования. [String](../../sql-reference/data-types/string.md#string).
 -   `plaintext` — текст, который будет зашифрован. [String](../../sql-reference/data-types/string.md#string).
-   `key` — ключ шифрования. [String](../../sql-reference/data-types/string.md#string).
+-   `key` — ключ шифрования. Если ключ длиннее, чем требует режим шифрования, производится специфичная для MySQL свертка ключа. [String](../../sql-reference/data-types/string.md#string).
-   `iv` — инициализирующий вектор. Необязателен. [String](../../sql-reference/data-types/string.md#string).
+-   `iv` — инициализирующий вектор. Необязателен, учитываются только первые 16 байтов. [String](../../sql-reference/data-types/string.md#string).
 **Возвращаемое значение**
-   Зашифрованная строка. [String](../../sql-reference/data-types/string.md#string).
+-   Бинарная зашифрованная строка. [String](../../sql-reference/data-types/string.md#string).
 **Примеры**
-Создадим такую таблицу:
+При одинаковых входящих значениях результаты шифрования у функций `encrypt` и `aes_encrypt_mysql`  совпадают.
 Запрос:
 ``` sql
-CREATE TABLE encryption_test
+SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
 (
    input String,
    key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
    iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
    key32 String DEFAULT substring(key, 1, 32),
    key24 String DEFAULT substring(key, 1, 24),
    key16 String DEFAULT substring(key, 1, 16)
 ) Engine = Memory;
 ```
 Вставим эти данные:
 Запрос:
 ``` sql
 INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
 ```
 Пример без `iv`:
 Запрос:
 ``` sql
 SELECT 'aes-128-cbc' AS mode, hex(aes_encrypt_mysql(mode, input, key32)) FROM encryption_test;
 ```
 Результат:
 ``` text
-┌─mode────────┬─hex(aes_encrypt_mysql('aes-128-cbc', input, key32))──────────────┐
+┌─ciphertexts_equal─┐
-│ aes-128-cbc │ FEA8CFDE6EE2C6E7A2CC6ADDC9F62C83                                 │
+│                 1 │
-│ aes-128-cbc │ 78B16CD4BE107660156124C5FEE6454A                                 │
+└───────────────────┘
 │ aes-128-cbc │ 67C0B119D96F18E2823968D42871B3D179221B1E7EE642D628341C2B29BA2E18 │
 └─────────────┴──────────────────────────────────────────────────────────────────┘
 ```
-Пример с `iv`:
+Функция `encrypt` генерирует исключение, если `key` или `iv` длиннее, чем нужно:
 Запрос:
 ``` sql
-SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv)) FROM encryption_test;
+SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
 ```
 Результат:
 ``` text
-┌─mode───────────┬─hex(aes_encrypt_mysql('aes-256-cfb128', input, key32, iv))─┐
+Received exception from server (version 21.1.2):
-│ aes-256-cfb128 │                                                            │
+Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). 
-│ aes-256-cfb128 │ 7FB039F7                                                   │
+```
-│ aes-256-cfb128 │ 5CBD20F7ABD3AC41FCAA1A5C0E119E2BB5174F                     │
+
-└────────────────┴────────────────────────────────────────────────────────────┘
+Однако функция `aes_encrypt_mysql` в аналогичном случае возвращает результат, который может быть обработан MySQL:
 Запрос:
 ``` sql
 SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
 ```
 Результат:
 ```text
 ┌─ciphertext───┐
 │ 24E9E4966469 │
 └──────────────┘
 ```
 Если передать `iv` еще длиннее, результат останется таким же:
 Запрос:
 ``` sql
 SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
 ```
 Результат:
 ``` text
 ┌─ciphertext───┐
 │ 24E9E4966469 │
 └──────────────┘
 ```
 Это совпадает с результатом, возвращаемым MySQL при таких же входящих значениях:
 ``` sql
 mysql> SET  block_encryption_mode='aes-256-cfb128';
 Query OK, 0 rows affected (0.00 sec)
 mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
 +------------------------+
 | ciphertext             |
 +------------------------+
 | 0x24E9E4966469         |
 +------------------------+
 1 row in set (0.00 sec)
 ```
 ## decrypt {#decrypt}
-Функция поддерживает расшифровку данных следующими режимами:
+Функция расшифровывает зашифрованный текст и может работать в следующих режимах:
 -   aes-128-ecb, aes-192-ecb, aes-256-ecb
 -   aes-128-cbc, aes-192-cbc, aes-256-cbc
@ -247,7 +232,7 @@ SELECT 'aes-256-cfb128' AS mode, hex(aes_encrypt_mysql(mode, input, key32, iv))
 **Синтаксис**
-```sql
+``` sql
 decrypt('mode', 'ciphertext', 'key' [, iv, aad])
 ```
@ -265,52 +250,58 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad])
 **Примеры**
-Создадим такую таблицу:
+Рассмотрим таблицу из примера для функции [encrypt](#encrypt).
 Запрос:
 ``` sql
-CREATE TABLE encryption_test
+SELECT comment, hex(secret) FROM encryption_test;
 (
    input String,
    key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
    iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
    key32 String DEFAULT substring(key, 1, 32),
    key24 String DEFAULT substring(key, 1, 24),
    key16 String DEFAULT substring(key, 1, 16)
 ) Engine = Memory;
 ```
 Вставим эти данные:
 Запрос:
 ``` sql
 INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
 ```
 Запрос:
 ``` sql
 SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16) FROM encryption_test;
 ```
 Результат:
-```text
+``` text
-┌─mode────────┬─decrypt('aes-128-ecb', encrypt('aes-128-ecb', input, key16), key16)─┐
+┌─comment──────────────┬─hex(secret)──────────────────────────────────┐
-│ aes-128-ecb │                                                                     │
+│ aes-256-gcm          │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
-│ aes-128-ecb │ text                                                                │
+│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
-│ aes-128-ecb │ What Is ClickHouse?                                                 │
+└──────────────────────┴──────────────────────────────────────────────┘
-└─────────────┴─────────────────────────────────────────────────────────────────────┘
+┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
 │ aes-256-cfb128 no IV                │ B4972BDC4459                     │
 │ aes-256-cfb128 no IV, different key │ 2FF57C092DC9                     │
 │ aes-256-cfb128 with IV              │ 5E6CB398F653                     │
 │ aes-256-cbc no IV                   │ 1BC0629A92450D9E73A00E7D02CF4142 │
 └─────────────────────────────────────┴──────────────────────────────────┘
 ```
 Теперь попытаемся расшифровать эти данные:
 Запрос:
 ``` sql
 SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test;
 ```
 Результат:
 ``` text
 ┌─comment─────────────────────────────┬─plaintext─┐
 │ aes-256-cfb128 no IV                │ Secret    │
 │ aes-256-cfb128 no IV, different key │ <20>4<EFBFBD>
                                           <20>         │
 │ aes-256-cfb128 with IV              │ <20><><EFBFBD>6<EFBFBD>~        │
 │aes-256-cbc no IV                   │ <20>2*4<>h3c<33>4w<34><77>@
 └─────────────────────────────────────┴───────────┘
 ```
 Обратите внимание, что только часть данных была расшифрована верно. Оставшаяся часть расшифрована некорректно, так как при шифровании использовались другие значения `mode`, `key`, или `iv`.
 ## aes_decrypt_mysql {#aes_decrypt_mysql}
 Совместима с шифрованием MySQL и может расшифровать данные, зашифрованные функцией [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt).
-Функция поддерживает расшифровку данных следующими режимами:
+При одинаковых входящих значениях расшифрованный текст будет совпадать с результатом, возвращаемым функцией `decrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_decrypt_mysql` будет работать аналогично функции `aes_decrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`.
 Функция поддерживает расшифровку данных в следующих режимах:
 -   aes-128-ecb, aes-192-ecb, aes-256-ecb
 -   aes-128-cbc, aes-192-cbc, aes-256-cbc
@ -321,7 +312,7 @@ SELECT 'aes-128-ecb' AS mode, decrypt(mode, encrypt(mode, input, key16), key16)
 **Синтаксис**
-```sql
+``` sql
 aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
 ```
@ -332,51 +323,39 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
 -   `key` — ключ шифрования. [String](../../sql-reference/data-types/string.md#string).
 -   `iv` — инициализирующий вектор. Необязателен. [String](../../sql-reference/data-types/string.md#string).
 **Возвращаемое значение**
 -   Расшифрованная строка. [String](../../sql-reference/data-types/string.md#string).
 **Примеры**
-Создадим такую таблицу:
+Расшифруем данные, которые до этого были зашифрованы в MySQL:
 Запрос:
 ``` sql
-CREATE TABLE encryption_test
+mysql> SET  block_encryption_mode='aes-256-cfb128';
-(
+Query OK, 0 rows affected (0.00 sec)
    input String,
    key String DEFAULT unhex('fb9958e2e897ef3fdb49067b51a24af645b3626eed2f9ea1dc7fd4dd71b7e38f9a68db2a3184f952382c783785f9d77bf923577108a88adaacae5c141b1576b0'),
    iv String DEFAULT unhex('8CA3554377DFF8A369BC50A89780DD85'),
    key32 String DEFAULT substring(key, 1, 32),
    key24 String DEFAULT substring(key, 1, 24),
    key16 String DEFAULT substring(key, 1, 16)
 ) Engine = Memory;
 ```
-Вставим эти данные:
+mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
-
+------------------------+
-Запрос:
+| ciphertext             |
-
+------------------------+
-``` sql
+| 0x24E9E4966469         |
-INSERT INTO encryption_test (input) VALUES (''), ('text'), ('What Is ClickHouse?');
+------------------------+
 1 row in set (0.00 sec)
 ```
 Запрос:
 ``` sql
-SELECT 'aes-128-cbc' AS mode, aes_decrypt_mysql(mode, aes_encrypt_mysql(mode, input, key), key) FROM encryption_test;
+SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext;
 ```
 Результат:
 ``` text
-┌─mode────────┬─aes_decrypt_mysql('aes-128-cbc', aes_encrypt_mysql('aes-128-cbc', input, key), key)─┐
+┌─plaintext─┐
-│ aes-128-cbc │                                                                                     │
+│ Secret    │
-│ aes-128-cbc │ text                                                                                │
+└───────────┘
 │ aes-128-cbc │ What Is ClickHouse?                                                                 │
 └─────────────┴─────────────────────────────────────────────────────────────────────────────────────┘
 ```
 [Original article](https://clickhouse.tech/docs/ru/sql-reference/functions/encryption_functions/) <!--hide-->
--- a/docs/ru/sql-reference/statements/select/all.md
+++ b/docs/ru/sql-reference/statements/select/all.md
@ -0,0 +1,22 @@
 ---
 toc_title: ALL
 ---
 # Секция ALL {#select-all}
 Если в таблице несколько совпадающих строк, то `ALL` возвращает все из них. Поведение запроса `SELECT ALL` точно такое же, как и `SELECT` без аргумента `DISTINCT`. Если указаны оба аргумента: `ALL` и `DISTINCT`, функция вернет исключение.
 `ALL` может быть указан внутри агрегатной функции, например, результат выполнения запроса:
 ```sql
 SELECT sum(ALL number) FROM numbers(10);
 ```
 равен результату выполнения запроса:
 ```sql
 SELECT sum(number) FROM numbers(10);
 ```
 [Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/select/all) <!--hide-->
--- a/docs/ru/whats-new/extended-roadmap.md
+++ b/docs/ru/whats-new/extended-roadmap.md
@ -1 +0,0 @@
 roadmap.md
--- a/docs/ru/whats-new/extended-roadmap.md
+++ b/docs/ru/whats-new/extended-roadmap.md
@ -0,0 +1,10 @@
 ---
 toc_priority: 74
 toc_title: Roadmap
 ---
 # Планы развития {#roadmap}
 Планы развития на 2021 год опубликованы для обсуждения [здесь](https://github.com/ClickHouse/ClickHouse/issues/17623).
 [Оригинальная статья](https://clickhouse.tech/docs/ru/roadmap/) <!--hide-->
--- a/docs/ru/whats-new/index.md
+++ b/docs/ru/whats-new/index.md
@ -3,4 +3,6 @@ toc_folder_title: "\u0427\u0442\u043E \u043D\u043E\u0432\u043E\u0433\u043E?"
 toc_priority: 82
 ---
 # Что нового в ClickHouse?
 Планы развития вкратце изложены [здесь](extended-roadmap.md), а новости по предыдущим релизам подробно описаны в [журнале изменений](changelog/index.md).
--- a/docs/ru/whats-new/roadmap.md
+++ b/docs/ru/whats-new/roadmap.md
@ -1 +0,0 @@
 ../../en/whats-new/roadmap.md
--- a/programs/install/Install.cpp
+++ b/programs/install/Install.cpp
@ -66,6 +66,7 @@ namespace ErrorCodes
    extern const int CANNOT_OPEN_FILE;
    extern const int SYSTEM_ERROR;
    extern const int NOT_ENOUGH_SPACE;
    extern const int CANNOT_KILL;
 }
 }
@ -886,6 +887,27 @@ namespace
                fmt::print("Sent kill signal.\n", pid);
            else
                throwFromErrno("Cannot send kill signal", ErrorCodes::SYSTEM_ERROR);
            /// Wait for the process to exit: poll the pid file up to num_kill_check_tries times,
            /// sleeping kill_check_delay_ms between polls (1000 * 100 ms = 100 seconds in total).
            constexpr size_t num_kill_check_tries = 1000;
            constexpr size_t kill_check_delay_ms = 100;
            for (size_t i = 0; i < num_kill_check_tries; ++i)
            {
                fmt::print("Waiting for server to be killed\n");
                if (!isRunning(pid_file))
                {
                    fmt::print("Server exited\n");
                    break;
                }
                sleepForMilliseconds(kill_check_delay_ms);
            }
            /// Final check: if the process is still alive after all tries, report failure.
            if (isRunning(pid_file))
            {
                /// Use fmt-style "{}" placeholders, one per argument. The previous printf-style
                /// "%zu" was not a placeholder for fmt and covered only one of the two arguments.
                /// NOTE(review): assumes this Exception constructor formats via fmt - confirm.
                throw Exception(ErrorCodes::CANNOT_KILL,
                    "The server process still exists after {} tries (delay: {} ms)",
                    num_kill_check_tries, kill_check_delay_ms);
            }
        }
        return 0;
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -1017,17 +1017,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
        LOG_INFO(log, "Query Profiler and TraceCollector are disabled because they require PHDR cache to be created"
            " (otherwise the function 'dl_iterate_phdr' is not lock free and not async-signal safe).");
    if (has_zookeeper && config().has("distributed_ddl"))
    {
        /// DDL worker should be started after all tables were loaded
        String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
        int pool_size = config().getInt("distributed_ddl.pool_size", 1);
        if (pool_size < 1)
            throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
        global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
                                                                 "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
    }
    std::unique_ptr<DNSCacheUpdater> dns_cache_updater;
    if (config().has("disable_internal_dns_cache") && config().getInt("disable_internal_dns_cache"))
    {
@ -1309,6 +1298,37 @@ int Server::main(const std::vector<std::string> & /*args*/)
                std::thread::hardware_concurrency());
        }
        /// try to load dictionaries immediately, throw on error and die
        ext::scope_guard dictionaries_xmls, models_xmls;
        try
        {
            if (!config().getBool("dictionaries_lazy_load", true))
            {
                global_context->tryCreateEmbeddedDictionaries();
                global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true);
            }
            dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository(
                std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "dictionaries_config"));
            models_xmls = global_context->getExternalModelsLoader().addConfigRepository(
                std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "models_config"));
        }
        catch (...)
        {
            LOG_ERROR(log, "Caught exception while loading dictionaries.");
            throw;
        }
        if (has_zookeeper && config().has("distributed_ddl"))
        {
            /// DDL worker should be started after all tables were loaded
            String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
            int pool_size = config().getInt("distributed_ddl.pool_size", 1);
            if (pool_size < 1)
                throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
            global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
                                                                     "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
        }
        LOG_INFO(log, "Ready for connections.");
        SCOPE_EXIT({
@ -1358,26 +1378,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
            }
        });
        /// try to load dictionaries immediately, throw on error and die
        ext::scope_guard dictionaries_xmls, models_xmls;
        try
        {
            if (!config().getBool("dictionaries_lazy_load", true))
            {
                global_context->tryCreateEmbeddedDictionaries();
                global_context->getExternalDictionariesLoader().enableAlwaysLoadEverything(true);
            }
            dictionaries_xmls = global_context->getExternalDictionariesLoader().addConfigRepository(
                std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "dictionaries_config"));
            models_xmls = global_context->getExternalModelsLoader().addConfigRepository(
                std::make_unique<ExternalLoaderXMLConfigRepository>(config(), "models_config"));
        }
        catch (...)
        {
            LOG_ERROR(log, "Caught exception while loading dictionaries.");
            throw;
        }
        std::vector<std::unique_ptr<MetricsTransmitter>> metrics_transmitters;
        for (const auto & graphite_key : DB::getMultipleKeysFromConfig(config(), "", "graphite"))
        {
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -892,6 +892,19 @@
        <!-- Controls how much ON CLUSTER queries can be run simultaneously. -->
        <!-- <pool_size>1</pool_size> -->
        <!--
             Cleanup settings (active tasks will not be removed)
        -->
        <!-- Controls task TTL (default 1 week) -->
        <!-- <task_max_lifetime>604800</task_max_lifetime> -->
        <!-- Controls how often cleanup should be performed (in seconds) -->
        <!-- <cleanup_delay_period>60</cleanup_delay_period> -->
        <!-- Controls how many tasks could be in the queue -->
        <!-- <max_tasks_in_queue>1000</max_tasks_in_queue> -->
    </distributed_ddl>
    <!-- Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h -->
--- a/src/AggregateFunctions/AggregateFunctionSumMap.h
+++ b/src/AggregateFunctions/AggregateFunctionSumMap.h
@ -118,6 +118,8 @@ public:
                WhichDataType value_type_to_check(value_type);
                /// Do not promote decimal because of implementation issues of this function design
                /// Currently we cannot get result column type in case of decimal we cannot get decimal scale
                /// in method void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
                /// If we decide to make this function more efficient we should promote decimal type during summ
                if (value_type_to_check.isDecimal())
                    result_type = value_type_without_nullable;
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@ -109,6 +109,8 @@ void Connection::connect(const ConnectionTimeouts & timeouts)
        }
        in = std::make_shared<ReadBufferFromPocoSocket>(*socket);
        in->setAsyncCallback(std::move(async_callback));
        out = std::make_shared<WriteBufferFromPocoSocket>(*socket);
        connected = true;
@ -753,15 +755,8 @@ std::optional<UInt64> Connection::checkPacket(size_t timeout_microseconds)
 }
-Packet Connection::receivePacket(std::function<void(Poco::Net::Socket &)> async_callback)
+Packet Connection::receivePacket()
 {
    in->setAsyncCallback(std::move(async_callback));
    SCOPE_EXIT({
        /// disconnect() will reset "in".
        if (in)
            in->setAsyncCallback({});
    });
    try
    {
        Packet res;
--- a/src/Client/Connection.h
+++ b/src/Client/Connection.h
@ -27,7 +27,6 @@
 #include <atomic>
 #include <optional>
 namespace DB
 {
@ -175,8 +174,7 @@ public:
    std::optional<UInt64> checkPacket(size_t timeout_microseconds = 0);
    /// Receive packet from server.
-    /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it.
+    Packet receivePacket();
    Packet receivePacket(std::function<void(Poco::Net::Socket &)> async_callback = {});
    /// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception.
    void forceConnected(const ConnectionTimeouts & timeouts);
@ -195,6 +193,16 @@ public:
    size_t outBytesCount() const { return out ? out->count() : 0; }
    size_t inBytesCount() const { return in ? in->count() : 0; }
    Poco::Net::Socket * getSocket() { return socket.get(); }
    /// Each time read from socket blocks and async_callback is set, it will be called. You can poll socket inside it.
    /// The callback is remembered in the connection and, if a read buffer already exists,
    /// installed into it right away.
    void setAsyncCallback(AsyncCallback async_callback_)
    {
        async_callback = std::move(async_callback_);
        /// Hand the read buffer a copy: moving the member out here would leave it in a
        /// moved-from state, so connect() would have no callback to install into a newly
        /// created read buffer.
        if (in)
            in->setAsyncCallback(async_callback);
    }
 private:
    String host;
    UInt16 port;
@ -282,6 +290,8 @@ private:
    LoggerWrapper log_wrapper;
    AsyncCallback async_callback = {};
    void connect(const ConnectionTimeouts & timeouts);
    void sendHello();
    void receiveHello();
@ -307,4 +317,20 @@ private:
    [[noreturn]] void throwUnexpectedPacket(UInt64 packet_type, const char * expected) const;
 };
 /// Scoped helper: installs an async callback on a connection in the constructor
 /// and clears it (sets an empty callback) in the destructor.
 class AsyncCallbackSetter
 {
 public:
    /// Stores the connection pointer and sets async_callback on it.
    AsyncCallbackSetter(Connection * connection_, AsyncCallback async_callback) : connection(connection_)
    {
        connection->setAsyncCallback(std::move(async_callback));
    }
    /// Resets the callback of the same connection to an empty one.
    ~AsyncCallbackSetter()
    {
        connection->setAsyncCallback({});
    }
 private:
    /// Raw pointer, dereferenced in the destructor, so the connection has to be alive at that point.
    Connection * connection;
 };
 }
--- a/src/Client/ConnectionEstablisher.cpp
+++ b/src/Client/ConnectionEstablisher.cpp
@ -0,0 +1,239 @@
 #include <Client/ConnectionEstablisher.h>
 #include <Common/quoteString.h>
 #include <Common/ProfileEvents.h>
 namespace ProfileEvents
 {
    extern const Event DistributedConnectionMissingTable;
    extern const Event DistributedConnectionStaleReplica;
 }
 namespace DB
 {
 namespace ErrorCodes
 {
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int NETWORK_ERROR;
    extern const int SOCKET_TIMEOUT;
 }
 /// Remembers the (non-owning) pointers needed to establish a connection later in run().
 /// The establisher starts in the "not finished" state.
 ConnectionEstablisher::ConnectionEstablisher(
    IConnectionPool * pool_,
    const ConnectionTimeouts * timeouts_,
    const Settings * settings_,
    Poco::Logger * log_,
    const QualifiedTableName * table_to_check_)
    : pool(pool_)
    , timeouts(timeouts_)
    , settings(settings_)
    , log(log_)
    , table_to_check(table_to_check_)
    , is_finished(false)
 {
 }
 /// Takes a connection from the pool, stores it in result.entry and fills the
 /// is_usable / is_up_to_date / staleness fields of result.
 /// If table_to_check is set and the server revision allows it, the status of that table is
 /// requested: a missing table makes the function return early with only fail_message set,
 /// a replica delay not below the allowed maximum leaves the result usable but not up to date.
 /// Exceptions with codes NETWORK_ERROR, SOCKET_TIMEOUT and ATTEMPT_TO_READ_AFTER_EOF are
 /// converted into fail_message (and the connection is dropped); all other exceptions propagate.
 void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::string & fail_message)
 {
    /// is_finished is false while the function is running and is set to true on any exit from it.
    is_finished = false;
    SCOPE_EXIT(is_finished = true);
    try
    {
        result.entry = pool->get(*timeouts, settings, /* force_connected = */ false);
        /// Install the async callback on the connection for the rest of this scope.
        /// Note that the stored callback is moved out of this object here.
        AsyncCallbackSetter async_setter(&*result.entry, std::move(async_callback));
        UInt64 server_revision = 0;
        if (table_to_check)
            server_revision = result.entry->getServerRevision(*timeouts);
        /// No table to check, or the server revision is below the one required for
        /// tables status requests: just make sure we are connected.
        if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS)
        {
            result.entry->forceConnected(*timeouts);
            result.is_usable = true;
            result.is_up_to_date = true;
            return;
        }
        /// Only status of the remote table corresponding to the Distributed table is taken into account.
        /// TODO: request status for joined tables also.
        TablesStatusRequest status_request;
        status_request.tables.emplace(*table_to_check);
        TablesStatusResponse status_response = result.entry->getTablesStatus(*timeouts, status_request);
        auto table_status_it = status_response.table_states_by_id.find(*table_to_check);
        /// The table is absent in the response: result stays not usable.
        if (table_status_it == status_response.table_states_by_id.end())
        {
            const char * message_pattern = "There is no table {}.{} on server: {}";
            fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription());
            LOG_WARNING(log, fail_message);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
            return;
        }
        result.is_usable = true;
        /// Zero (or no settings at all) means the replica delay is not checked.
        UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
        if (!max_allowed_delay)
        {
            result.is_up_to_date = true;
            return;
        }
        UInt32 delay = table_status_it->second.absolute_delay;
        if (delay < max_allowed_delay)
            result.is_up_to_date = true;
        else
        {
            /// Stale replica: still usable, the delay is reported through result.staleness.
            result.is_up_to_date = false;
            result.staleness = delay;
            LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
        }
    }
    catch (const Exception & e)
    {
        /// Only network-related failures are reported through fail_message; everything else is rethrown.
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;
        fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);
        /// Drop the connection (if one was obtained) and clear the result.
        if (!result.entry.isNull())
        {
            result.entry->disconnect();
            result.reset();
        }
    }
 }
 #if defined(OS_LINUX)
 /// Forwards all arguments to the wrapped ConnectionEstablisher and registers the
 /// receive timeout descriptor in epoll. The socket descriptor is added later, by the read callback.
 ConnectionEstablisherAsync::ConnectionEstablisherAsync(
    IConnectionPool * pool_,
    const ConnectionTimeouts * timeouts_,
    const Settings * settings_,
    Poco::Logger * log_,
    const QualifiedTableName * table_to_check_)
    : connection_establisher(pool_, timeouts_, settings_, log_, table_to_check_)
 {
    epoll.add(receive_timeout.getDescriptor());
 }
 /// Read callback: receives the socket descriptor and the read timeout (the string argument is unused).
 /// Registers the socket in epoll on the first call, arms the receive timeout and switches to the stored fiber.
 void ConnectionEstablisherAsync::Routine::ReadCallback::operator()(int fd, const Poco::Timespan & timeout, const std::string &)
 {
    /// Check if it's the first time and we need to add socket fd to epoll.
    if (connection_establisher_async.socket_fd == -1)
    {
        connection_establisher_async.epoll.add(fd);
        connection_establisher_async.socket_fd = fd;
    }
    /// Arm the timeout before switching away from this fiber.
    connection_establisher_async.receive_timeout.setRelative(timeout);
    /// Switch to the stored fiber; execution continues below once this fiber is resumed again.
    fiber = std::move(fiber).resume();
    /// We were resumed: disarm the timeout.
    connection_establisher_async.receive_timeout.reset();
 }
 /// Fiber body: installs ReadCallback as the async callback of the wrapped establisher and runs it.
 /// Any exception except the fiber's forced_unwind is stored in connection_establisher_async.exception.
 /// Returns the sink fiber it was given.
 Fiber ConnectionEstablisherAsync::Routine::operator()(Fiber && sink)
 {
    try
    {
        connection_establisher_async.connection_establisher.setAsyncCallback(ReadCallback{connection_establisher_async, sink});
        connection_establisher_async.connection_establisher.run(connection_establisher_async.result, connection_establisher_async.fail_message);
    }
    catch (const boost::context::detail::forced_unwind &)
    {
        /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited
        /// It should not be caught or it will segfault.
        /// Other exceptions must be caught
        throw;
    }
    catch (...)
    {
        /// Save the exception; resume() rethrows it after control returns from the fiber.
        connection_establisher_async.exception = std::current_exception();
    }
    return std::move(sink);
 }
 /// Starts the connection routine in a new fiber, or continues an existing one.
 /// Returns the result when the routine has finished (or the receive timeout fired),
 /// otherwise the epoll file descriptor to wait on before calling resume() again.
 /// An exception saved by the routine is rethrown here.
 std::variant<int, ConnectionEstablisher::TryResult> ConnectionEstablisherAsync::resume()
 {
    if (fiber_created)
    {
        /// Continuing: first make sure we were not woken up by the receive timeout alone.
        if (!checkReceiveTimeout())
            return result;
    }
    else
    {
        /// First call: start from a clean state and create the fiber.
        reset();
        fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
        fiber_created = true;
    }

    fiber = std::move(fiber).resume();

    if (exception)
        std::rethrow_exception(std::move(exception));

    if (!connection_establisher.isFinished())
        return epoll.getFileDescriptor();

    destroyFiber();
    return result;
 }
 bool ConnectionEstablisherAsync::checkReceiveTimeout()
 {
    bool is_socket_ready = false;
    bool is_receive_timeout_alarmed = false;
    epoll_event events[2];
    events[0].data.fd = events[1].data.fd = -1;
    size_t ready_count = epoll.getManyReady(2, events, false);
    for (size_t i = 0; i != ready_count; ++i)
    {
        if (events[i].data.fd == socket_fd)
            is_socket_ready = true;
        if (events[i].data.fd == receive_timeout.getDescriptor())
            is_receive_timeout_alarmed = true;
    }
    if (is_receive_timeout_alarmed && !is_socket_ready)
    {
        destroyFiber();
        /// In not async case this exception would be thrown and caught in ConnectionEstablisher::run,
        /// but in async case we process timeout outside and cannot throw exception. So, we just save fail message.
        fail_message = "Timeout exceeded while reading from socket (" + result.entry->getDescription() + ")";
        epoll.remove(socket_fd);
        resetResult();
        return false;
    }
    return true;
 }
 /// Abandons the current attempt: destroys the fiber, then clears the result, fail message and socket fd.
 void ConnectionEstablisherAsync::cancel()
 {
    destroyFiber();
    reset();
 }
 /// Returns the object to its initial state: drops the result (disconnecting if needed),
 /// clears the fail message and forgets the socket descriptor.
 void ConnectionEstablisherAsync::reset()
 {
    resetResult();
    fail_message.clear();
    /// -1 means "socket is not registered in epoll yet" (see ReadCallback).
    socket_fd = -1;
 }
 /// If the result holds a connection, disconnects it and clears the result; otherwise does nothing.
 void ConnectionEstablisherAsync::resetResult()
 {
    if (result.entry.isNull())
        return;

    result.entry->disconnect();
    result.reset();
 }
 /// Destroys the current fiber (if any) and marks that no fiber exists.
 void ConnectionEstablisherAsync::destroyFiber()
 {
    /// Move the fiber into a local so that it is destroyed when this function returns.
    Fiber to_destroy = std::move(fiber);
    fiber_created = false;
 }
 #endif
 }
--- a/src/Client/ConnectionEstablisher.h
+++ b/src/Client/ConnectionEstablisher.h
@ -0,0 +1,131 @@
 #pragma once
 #include <variant>
 #include <Common/Epoll.h>
 #include <Common/Fiber.h>
 #include <Common/FiberStack.h>
 #include <Common/TimerDescriptor.h>
 #include <Common/PoolWithFailoverBase.h>
 #include <Client/ConnectionPool.h>
 namespace DB
 {
 /// Class for establishing connection to the replica. It supports setting up
 /// an async callback that will be called when reading from socket blocks.
 class ConnectionEstablisher
 {
 public:
    using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
    /// All pointers are stored as is (not owned); table_to_check may be nullptr.
    ConnectionEstablisher(IConnectionPool * pool_,
                          const ConnectionTimeouts * timeouts_,
                          const Settings * settings_,
                          Poco::Logger * log,
                          const QualifiedTableName * table_to_check = nullptr);
    /// Establish connection and save it in result, write possible exception message in fail_message.
    void run(TryResult & result, std::string & fail_message);
    /// Set async callback that will be called when reading from socket blocks.
    void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); }
    /// Value of the is_finished flag (false until the implementation sets it).
    bool isFinished() const { return is_finished; }
 private:
    IConnectionPool * pool;
    const ConnectionTimeouts * timeouts;
    const Settings * settings;
    Poco::Logger * log;
    const QualifiedTableName * table_to_check;
    /// Default-initialized so that isFinished() never reads an indeterminate value,
    /// in line with the flag members of ConnectionEstablisherAsync.
    bool is_finished = false;
    AsyncCallback async_callback = {};
 };
 #if defined(OS_LINUX)
 /// Class for nonblocking establishing connection to the replica.
 /// It runs establishing connection process in fiber and sets special
 /// read callback which is called when reading from socket blocks.
 /// When read callback is called, socket and receive timeout are added in epoll
 /// and execution returns to the main program.
 /// So, you can poll this epoll file descriptor to determine when to resume.
 class ConnectionEstablisherAsync
 {
 public:
    using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
    ConnectionEstablisherAsync(IConnectionPool * pool_,
                          const ConnectionTimeouts * timeouts_,
                          const Settings * settings_,
                          Poco::Logger * log_,
                          const QualifiedTableName * table_to_check = nullptr);
    /// Resume establishing connection. If the process is not finished yet,
    /// return a file descriptor (you can add it to epoll and poll it;
    /// when this fd becomes ready, call resume again).
    /// If the process has failed or finished, return its result.
    std::variant<int, TryResult> resume();
    /// Cancel establishing connection. The fiber will be destroyed and
    /// the object will be returned to its initial state.
    void cancel();
    /// Returns a copy of the current result.
    TryResult getResult() const { return result; }
    const std::string & getFailMessage() const { return fail_message; }
 private:
    /// When epoll file descriptor is ready, check if it's an expired timeout.
    /// Return false if receive timeout expired and socket is not ready, return true otherwise.
    bool checkReceiveTimeout();
    /// Callable that is run inside the fiber (passed to the fiber constructor in resume()).
    struct Routine
    {
        ConnectionEstablisherAsync & connection_establisher_async;
        /// Callback with the AsyncCallback call signature (fd, timeout, description).
        struct ReadCallback
        {
            ConnectionEstablisherAsync & connection_establisher_async;
            Fiber & fiber;
            void operator()(int fd, const Poco::Timespan & timeout, const std::string &);
        };
        Fiber operator()(Fiber && sink);
    };
    void reset();
    void resetResult();
    void destroyFiber();
    ConnectionEstablisher connection_establisher;
    TryResult result;
    std::string fail_message;
    Fiber fiber;
    FiberStack fiber_stack;
    /// We use timer descriptor for checking socket receive timeout.
    TimerDescriptor receive_timeout;
    /// In read callback we add socket file descriptor and timer descriptor with receive timeout
    /// in epoll, so we can return epoll file descriptor outside for polling.
    Epoll epoll;
    /// Descriptor of the socket we are waiting on; -1 when unset (see reset()).
    int socket_fd = -1;
    std::string socket_description;
    /// If an exception occurred in fiber resume, we save it and rethrow.
    std::exception_ptr exception;
    /// True between fiber creation in resume() and destroyFiber().
    bool fiber_created = false;
 };
 #endif
 }
--- a/src/Client/ConnectionPoolWithFailover.cpp
+++ b/src/Client/ConnectionPoolWithFailover.cpp
@ -1,4 +1,5 @@
 #include <Client/ConnectionPoolWithFailover.h>
 #include <Client/ConnectionEstablisher.h>
 #include <Poco/Net/NetException.h>
 #include <Poco/Net/DNS.h>
@ -23,9 +24,6 @@ namespace DB
 namespace ErrorCodes
 {
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int NETWORK_ERROR;
    extern const int SOCKET_TIMEOUT;
    extern const int LOGICAL_ERROR;
 }
@ -172,6 +170,43 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
    return getManyImpl(settings, pool_mode, try_get_entry);
 }
 /// Build the function that maps a nested pool index to its priority according to the
 /// load balancing mode taken from settings (or the pool default when settings is null).
 /// An empty function is returned for RANDOM.
 ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::makeGetPriorityFunc(const Settings * settings)
 {
    size_t offset = 0;
    if (settings)
        offset = settings->load_balancing_first_offset % nested_pools.size();
    const LoadBalancing load_balancing = settings ? LoadBalancing(settings->load_balancing) : default_load_balancing;
    switch (load_balancing)
    {
        case LoadBalancing::NEAREST_HOSTNAME:
            return [this](size_t i) { return hostname_differences[i]; };
        case LoadBalancing::IN_ORDER:
            return [](size_t i) { return i; };
        case LoadBalancing::RANDOM:
            return {};
        case LoadBalancing::FIRST_OR_RANDOM:
            /// Priority 0 for the pool at `offset`, 1 for every other pool.
            return [offset](size_t i) -> size_t { return i != offset; };
        case LoadBalancing::ROUND_ROBIN:
            if (last_used >= nested_pools.size())
                last_used = 0;
            ++last_used;
            /* Consider nested_pools.size() equals to 5
             * last_used = 1 -> get_priority: 0 1 2 3 4
             * last_used = 2 -> get_priority: 5 0 1 2 3
             * last_used = 3 -> get_priority: 5 4 0 1 2
             * ...
             * */
            /// The members are read through `this` at call time, not snapshotted here.
            return [this](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; };
    }
    /// Not reached for valid enum values; mirrors the empty function of an unmatched switch.
    return {};
 }
 std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::getManyImpl(
        const Settings * settings,
        PoolMode pool_mode,
@ -194,36 +229,7 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
    else
        throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR);
-    size_t offset = 0;
+    GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
    if (settings)
        offset = settings->load_balancing_first_offset % nested_pools.size();
    GetPriorityFunc get_priority;
    switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
    {
    case LoadBalancing::NEAREST_HOSTNAME:
        get_priority = [&](size_t i) { return hostname_differences[i]; };
        break;
    case LoadBalancing::IN_ORDER:
        get_priority = [](size_t i) { return i; };
        break;
    case LoadBalancing::RANDOM:
        break;
    case LoadBalancing::FIRST_OR_RANDOM:
        get_priority = [offset](size_t i) -> size_t { return i != offset; };
        break;
    case LoadBalancing::ROUND_ROBIN:
        if (last_used >= nested_pools.size())
            last_used = 0;
        ++last_used;
        /* Consider nested_pools.size() equals to 5
         * last_used = 1 -> get_priority: 0 1 2 3 4
         * last_used = 2 -> get_priority: 5 0 1 2 3
         * last_used = 3 -> get_priority: 5 4 0 1 2
         * ...
         * */
        get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; };
        break;
    }
    UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0;
    bool fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries.value : true;
@ -241,77 +247,17 @@ ConnectionPoolWithFailover::tryGetEntry(
        const Settings * settings,
        const QualifiedTableName * table_to_check)
 {
    ConnectionEstablisher connection_establisher(&pool, &timeouts, settings, log, table_to_check);
    TryResult result;
-    try
+    connection_establisher.run(result, fail_message);
    {
        result.entry = pool.get(timeouts, settings, /* force_connected = */ false);
        UInt64 server_revision = 0;
        if (table_to_check)
            server_revision = result.entry->getServerRevision(timeouts);
        if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS)
        {
            result.entry->forceConnected(timeouts);
            result.is_usable = true;
            result.is_up_to_date = true;
            return result;
        }
        /// Only status of the remote table corresponding to the Distributed table is taken into account.
        /// TODO: request status for joined tables also.
        TablesStatusRequest status_request;
        status_request.tables.emplace(*table_to_check);
        TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request);
        auto table_status_it = status_response.table_states_by_id.find(*table_to_check);
        if (table_status_it == status_response.table_states_by_id.end())
        {
            const char * message_pattern = "There is no table {}.{} on server: {}";
            fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription());
            LOG_WARNING(log, fail_message);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable);
            return result;
        }
        result.is_usable = true;
        UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0;
        if (!max_allowed_delay)
        {
            result.is_up_to_date = true;
            return result;
        }
        UInt32 delay = table_status_it->second.absolute_delay;
        if (delay < max_allowed_delay)
            result.is_up_to_date = true;
        else
        {
            result.is_up_to_date = false;
            result.staleness = delay;
            LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay);
            ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica);
        }
    }
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;
        fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false);
        if (!result.entry.isNull())
        {
            result.entry->disconnect();
            result.reset();
        }
    }
    return result;
 }
 std::vector<ConnectionPoolWithFailover::Base::ShuffledPool> ConnectionPoolWithFailover::getShuffledPools(const Settings * settings)
 {
    GetPriorityFunc get_priority = makeGetPriorityFunc(settings);
    UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0;
    return Base::getShuffledPools(max_ignored_errors, get_priority);
 }
 }
--- a/src/Client/ConnectionPoolWithFailover.h
+++ b/src/Client/ConnectionPoolWithFailover.h
@ -80,6 +80,15 @@ public:
    using Status = std::vector<NestedPoolStatus>;
    Status getStatus() const;
    std::vector<Base::ShuffledPool> getShuffledPools(const Settings * settings);
    size_t getMaxErrorCup() const { return Base::max_error_cap; }
    void updateSharedError(std::vector<ShuffledPool> & shuffled_pools)
    {
        Base::updateSharedErrorCounts(shuffled_pools);
    }
 private:
    /// Get the values of relevant settings and call Base::getMany()
    std::vector<TryResult> getManyImpl(
@ -97,6 +106,8 @@ private:
            const Settings * settings,
            const QualifiedTableName * table_to_check = nullptr);
    GetPriorityFunc makeGetPriorityFunc(const Settings * settings);
 private:
    std::vector<size_t> hostname_differences; /// Distances from name of this host to the names of hosts of pools.
    size_t last_used = 0; /// Last used for round_robin policy.
--- a/src/Client/HedgedConnections.cpp
+++ b/src/Client/HedgedConnections.cpp
@ -0,0 +1,524 @@
 #if defined(OS_LINUX)
 #include <Client/HedgedConnections.h>
 #include <Interpreters/ClientInfo.h>
 namespace DB
 {
 namespace ErrorCodes
 {
    extern const int MISMATCH_REPLICAS_DATA_SOURCES;
    extern const int LOGICAL_ERROR;
    extern const int SOCKET_TIMEOUT;
    extern const int ALL_CONNECTION_TRIES_FAILED;
 }
 /// Obtain the initial connections from the factory (one per offset), register the packet
 /// receiver and the change-replica timer of each in epoll, and remember that replicas
 /// added later must get the same throttler.
 HedgedConnections::HedgedConnections(
    const ConnectionPoolWithFailoverPtr & pool_,
    const Settings & settings_,
    const ConnectionTimeouts & timeouts_,
    const ThrottlerPtr & throttler_,
    PoolMode pool_mode,
    std::shared_ptr<QualifiedTableName> table_to_check_)
    : hedged_connections_factory(pool_, &settings_, timeouts_, table_to_check_)
    , settings(settings_)
    , throttler(throttler_)
 {
    std::vector<Connection *> connections = hedged_connections_factory.getManyConnections(pool_mode);
    if (connections.empty())
        return;
    const size_t connections_count = connections.size();
    offset_states.reserve(connections_count);
    for (size_t offset = 0; offset < connections_count; ++offset)
    {
        auto & offset_state = offset_states.emplace_back();
        ReplicaState & replica = offset_state.replicas.emplace_back(connections[offset]);
        offset_state.active_connection_count = 1;
        replica.connection->setThrottler(throttler_);
        /// Each initial replica is the first (index 0) one for its offset.
        const auto receiver_fd = replica.packet_receiver->getFileDescriptor();
        epoll.add(receiver_fd);
        fd_to_replica_location[receiver_fd] = ReplicaLocation{offset, 0};
        const auto timeout_fd = replica.change_replica_timeout.getDescriptor();
        epoll.add(timeout_fd);
        timeout_fd_to_replica_location[timeout_fd] = ReplicaLocation{offset, 0};
    }
    active_connection_count = connections_count;
    offsets_with_disabled_changing_replica = 0;
    pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); });
 }
 /// Append a step to the pipeline. Steps are executed by run() in the order they were added.
 void HedgedConnections::Pipeline::add(std::function<void(ReplicaState & replica)> send_function)
 {
    /// The argument is already our own copy, so move it instead of copying the
    /// std::function (and everything it captured) a second time.
    pipeline.push_back(std::move(send_function));
 }
 /// Apply every stored step to the given replica, in insertion order.
 void HedgedConnections::Pipeline::run(ReplicaState & replica)
 {
    for (auto step = pipeline.begin(); step != pipeline.end(); ++step)
        (*step)(replica);
 }
 /// Send scalars to every replica that is still connected and schedule the same step
 /// for replicas that will be added later. Throws LOGICAL_ERROR if the query was not sent yet.
 void HedgedConnections::sendScalarsData(Scalars & data)
 {
    std::lock_guard lock(cancel_mutex);
    if (!sent_query)
        throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR);
    /// NOTE(review): `data` is captured by reference and the lambda is kept in the pipeline,
    /// so the caller's object presumably has to outlive this HedgedConnections - confirm.
    auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); };
    for (auto & offset_state : offset_states)
    {
        for (auto & replica : offset_state.replicas)
        {
            if (!replica.connection)
                continue;
            send_scalars_data(replica);
        }
    }
    pipeline_for_new_replicas.add(send_scalars_data);
 }
 /// Send the first element of `data` to every replica that is still connected and schedule
 /// the same step for replicas added later. Throws LOGICAL_ERROR if the query was not sent yet
 /// and MISMATCH_REPLICAS_DATA_SOURCES if data.size() differs from size().
 void HedgedConnections::sendExternalTablesData(std::vector<ExternalTablesData> & data)
 {
    std::lock_guard lock(cancel_mutex);
    if (!sent_query)
        throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR);
    if (data.size() != size())
        throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES);
    /// NOTE(review): `data` is captured by reference and the lambda is kept in the pipeline,
    /// so the caller's vector presumably has to outlive this HedgedConnections - confirm.
    auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); };
    for (auto & offset_state : offset_states)
    {
        for (auto & replica : offset_state.replicas)
        {
            if (!replica.connection)
                continue;
            send_external_tables_data(replica);
        }
    }
    pipeline_for_new_replicas.add(send_external_tables_data);
 }
 /// Send the list of part UUIDs to ignore to every replica that is still connected and
 /// schedule the same step for replicas added later.
 /// Throws LOGICAL_ERROR if the query has already been sent.
 void HedgedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuids)
 {
    std::lock_guard lock(cancel_mutex);
    if (sent_query)
        throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR);
    /// The lambda is stored in pipeline_for_new_replicas and runs whenever a new replica
    /// becomes ready, possibly long after this call returned. Capture the list by value
    /// (as sendQuery does with its arguments) so it cannot refer to a destroyed vector.
    auto send_ignored_part_uuids = [uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); };
    for (auto & offset_state : offset_states)
        for (auto & replica : offset_state.replicas)
            if (replica.connection)
                send_ignored_part_uuids(replica);
    pipeline_for_new_replicas.add(send_ignored_part_uuids);
 }
 /// Send the query to all current replicas and remember how to send it to replicas
 /// that will be added later. Throws LOGICAL_ERROR if the query was already sent.
 void HedgedConnections::sendQuery(
    const ConnectionTimeouts & timeouts,
    const String & query,
    const String & query_id,
    UInt64 stage,
    const ClientInfo & client_info,
    bool with_pending_data)
 {
    std::lock_guard lock(cancel_mutex);
    if (sent_query)
        throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR);
    /// Look for a replica whose server revision is too old for the current
    /// aggregation variant selection method; stop at the first one found.
    for (auto & offset_state : offset_states)
    {
        for (auto & replica : offset_state.replicas)
        {
            if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD)
            {
                disable_two_level_aggregation = true;
                break;
            }
        }
        if (disable_two_level_aggregation)
            break;
    }
    if (!disable_two_level_aggregation)
    {
        /// Tell hedged_connections_factory to skip replicas that don't support two-level aggregation.
        hedged_connections_factory.skipReplicasWithTwoLevelAggregationIncompatibility();
    }
    /// Everything except `this` is captured by value because the lambda is stored
    /// in pipeline_for_new_replicas and is run again for replicas added later.
    auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica)
    {
        Settings modified_settings = settings;
        if (disable_two_level_aggregation)
        {
            /// Disable two-level aggregation due to version incompatibility.
            modified_settings.group_by_two_level_threshold = 0;
            modified_settings.group_by_two_level_threshold_bytes = 0;
        }
        /// With several offsets, tell each replica how many there are and which one it serves.
        if (offset_states.size() > 1)
        {
            modified_settings.parallel_replicas_count = offset_states.size();
            modified_settings.parallel_replica_offset = fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset;
        }
        replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data);
        /// Arm the timer after which a new replica will be started for this offset.
        replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout);
    };
    for (auto & offset_status : offset_states)
        for (auto & replica : offset_status.replicas)
            send_query(replica);
    pipeline_for_new_replicas.add(send_query);
    sent_query = true;
 }
 void HedgedConnections::disconnect()
 {
    std::lock_guard lock(cancel_mutex);
    for (auto & offset_status : offset_states)
        for (auto & replica : offset_status.replicas)
            if (replica.connection)
                finishProcessReplica(replica, true);
    if (hedged_connections_factory.hasEventsInProcess())
    {
        if (hedged_connections_factory.numberOfProcessingReplicas() > 0)
            epoll.remove(hedged_connections_factory.getFileDescriptor());
        hedged_connections_factory.stopChoosingReplicas();
    }
 }
 /// Return the descriptions of all still-connected replicas joined with "; ".
 std::string HedgedConnections::dumpAddresses() const
 {
    std::lock_guard lock(cancel_mutex);
    std::string joined;
    bool need_separator = false;
    for (const auto & offset_state : offset_states)
    {
        for (const auto & replica : offset_state.replicas)
        {
            if (!replica.connection)
                continue;
            if (need_separator)
                joined += "; ";
            joined += replica.connection->getDescription();
            need_separator = true;
        }
    }
    return joined;
 }
 void HedgedConnections::sendCancel()
 {
    std::lock_guard lock(cancel_mutex);
    if (!sent_query || cancelled)
        throw Exception("Cannot cancel. Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR);
    for (auto & offset_status : offset_states)
        for (auto & replica : offset_status.replicas)
            if (replica.connection)
                replica.connection->sendCancel();
    cancelled = true;
 }
 /// After cancel, read packets until epoll has no more descriptors. Returns the last
 /// exception or unexpected packet seen, or an EndOfStream packet if there was none.
 /// Throws LOGICAL_ERROR if cancel was not requested first.
 Packet HedgedConnections::drain()
 {
    std::lock_guard lock(cancel_mutex);
    if (!cancelled)
        throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR);
    Packet res;
    res.type = Protocol::Server::EndOfStream;
    while (!epoll.empty())
    {
        ReplicaLocation ready_location = getReadyReplicaLocation();
        Packet received = receivePacketFromReplica(ready_location);
        switch (received.type)
        {
            case Protocol::Server::PartUUIDs:
            case Protocol::Server::Data:
            case Protocol::Server::Progress:
            case Protocol::Server::ProfileInfo:
            case Protocol::Server::Totals:
            case Protocol::Server::Extremes:
            case Protocol::Server::EndOfStream:
                /// Ordinary packets are simply dropped while draining.
                continue;
            default:
                /// If we receive an exception or an unknown packet, we save it.
                res = std::move(received);
        }
    }
    return res;
 }
 /// Receive the next packet under cancel_mutex, without an async callback.
 Packet HedgedConnections::receivePacket()
 {
    std::lock_guard lock(cancel_mutex);
    return receivePacketUnlocked(AsyncCallback{});
 }
 /// Receive the next packet from whichever replica becomes ready; does not take cancel_mutex itself.
 /// Throws LOGICAL_ERROR if no query was sent, no connection is active or epoll is empty.
 Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback)
 {
    if (!sent_query)
        throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
    if (!hasActiveConnections())
        throw Exception("No more packets are available.", ErrorCodes::LOGICAL_ERROR);
    if (epoll.empty())
        throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR);
    return receivePacketFromReplica(getReadyReplicaLocation(std::move(async_callback)));
 }
 /// Process epoll events until some replica yields a packet (stored in last_received_packet)
 /// and return the location of that replica. Along the way new replicas are picked up from
 /// the factory and expired change-replica timers trigger starting another replica.
 HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback)
 {
    /// Firstly, resume replica with the last received packet if it has pending data.
    if (replica_with_last_received_packet)
    {
        ReplicaLocation last_location = replica_with_last_received_packet.value();
        replica_with_last_received_packet.reset();
        if (offset_states[last_location.offset].replicas[last_location.index].connection->hasReadPendingData() && resumePacketReceiver(last_location))
            return last_location;
    }
    while (true)
    {
        /// Get ready file descriptor from epoll and process it.
        const int event_fd = getReadyFileDescriptor(async_callback);
        if (event_fd == hedged_connections_factory.getFileDescriptor())
        {
            checkNewReplica();
        }
        else if (auto receiver_it = fd_to_replica_location.find(event_fd); receiver_it != fd_to_replica_location.end())
        {
            /// Copy the location: resuming may finish the replica and erase the map entry.
            ReplicaLocation location = receiver_it->second;
            if (resumePacketReceiver(location))
                return location;
        }
        else if (auto timeout_it = timeout_fd_to_replica_location.find(event_fd); timeout_it != timeout_fd_to_replica_location.end())
        {
            /// The replica was silent for too long: queue its offset and start one more replica for it.
            ReplicaLocation location = timeout_it->second;
            ReplicaState & silent_replica = offset_states[location.offset].replicas[location.index];
            silent_replica.change_replica_timeout.reset();
            silent_replica.is_change_replica_timeout_expired = true;
            offset_states[location.offset].next_replica_in_process = true;
            offsets_queue.push(location.offset);
            startNewReplica();
        }
        else
            throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR);
    }
 }
 /// Resume the packet receiver of the given replica. Returns true if a packet was received
 /// (it is stored in last_received_packet). If the receiver reports a timeout, the replica is
 /// disconnected, and SOCKET_TIMEOUT is thrown when its offset has neither an active
 /// connection nor a new replica in process.
 bool HedgedConnections::resumePacketReceiver(const HedgedConnections::ReplicaLocation & location)
 {
    OffsetState & offset_state = offset_states[location.offset];
    ReplicaState & replica_state = offset_state.replicas[location.index];
    auto res = replica_state.packet_receiver->resume();
    if (auto * packet = std::get_if<Packet>(&res))
    {
        last_received_packet = std::move(*packet);
        return true;
    }
    if (!std::holds_alternative<Poco::Timespan>(res))
        return false;
    finishProcessReplica(replica_state, true);
    /// Check if there is no more active connections with the same offset and there is no new replica in process.
    if (offset_state.active_connection_count == 0 && !offset_state.next_replica_in_process)
        throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT);
    return false;
 }
 /// Wait until some descriptor registered in our epoll becomes ready and return it.
 /// If async_callback is set, it is invoked with the epoll descriptor every time a
 /// poll finds nothing ready, so that the caller can suspend until there is an event.
 int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback)
 {
    epoll_event event;
    event.data.fd = -1;
    /// Without a callback nothing yields between attempts, so a non-blocking poll would
    /// spin at full CPU until an event arrives. Let epoll sleep in that case instead.
    /// NOTE(review): assumes the third argument of Epoll::getManyReady selects blocking
    /// mode - confirm against Common/Epoll.h.
    const bool blocking = !async_callback;
    size_t events_count = 0;
    while (events_count == 0)
    {
        events_count = epoll.getManyReady(1, &event, blocking);
        if (!events_count && async_callback)
            async_callback(epoll.getFileDescriptor(), 0, epoll.getDescription());
    }
    return event.data.fd;
 }
 /// Take the packet stored in last_received_packet (received from the given replica),
 /// update the replica/offset state according to the packet type and return the packet.
 Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location)
 {
    ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index];
    Packet packet = std::move(last_received_packet);
    switch (packet.type)
    {
        case Protocol::Server::Data:
            /// If we received the first not empty data packet and still can change replica,
            /// disable changing replica with this offset.
            if (offset_states[replica_location.offset].can_change_replica && packet.block.rows() > 0)
                disableChangingReplica(replica_location);
            /// Remember the replica so that the next call checks it for pending data first.
            replica_with_last_received_packet = replica_location;
            break;
        case Protocol::Server::Progress:
            /// Check if we have made some progress and still can change replica.
            if (offset_states[replica_location.offset].can_change_replica && packet.progress.read_bytes > 0)
            {
                /// If we are allowed to change replica until the first data packet,
                /// just restart timeout (if it hasn't expired yet). Otherwise disable changing replica with this offset.
                if (settings.allow_changing_replica_until_first_data_packet && !replica.is_change_replica_timeout_expired)
                    replica.change_replica_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_data_timeout);
                else
                    disableChangingReplica(replica_location);
            }
            replica_with_last_received_packet = replica_location;
            break;
        case Protocol::Server::PartUUIDs:
        case Protocol::Server::ProfileInfo:
        case Protocol::Server::Totals:
        case Protocol::Server::Extremes:
        case Protocol::Server::Log:
            replica_with_last_received_packet = replica_location;
            break;
        case Protocol::Server::EndOfStream:
            /// Normal end: stop processing the replica but keep the connection open.
            finishProcessReplica(replica, false);
            break;
        case Protocol::Server::Exception:
        default:
            /// Exception or unexpected packet: stop processing the replica and disconnect it.
            finishProcessReplica(replica, true);
            break;
    }
    return packet;
 }
 void HedgedConnections::disableChangingReplica(const ReplicaLocation & replica_location)
 {
    /// Stop working with replicas, that are responsible for the same offset.
    OffsetState & offset_state = offset_states[replica_location.offset];
    offset_state.replicas[replica_location.index].change_replica_timeout.reset();
    ++offsets_with_disabled_changing_replica;
    offset_state.can_change_replica = false;
    for (size_t i = 0; i != offset_state.replicas.size(); ++i)
    {
        if (i != replica_location.index && offset_state.replicas[i].connection)
        {
            offset_state.replicas[i].connection->sendCancel();
            finishProcessReplica(offset_state.replicas[i], true);
        }
    }
    /// If we disabled changing replica with all offsets, we need to stop choosing new replicas.
    if (hedged_connections_factory.hasEventsInProcess() && offsets_with_disabled_changing_replica == offset_states.size())
    {
        if (hedged_connections_factory.numberOfProcessingReplicas() > 0)
            epoll.remove(hedged_connections_factory.getFileDescriptor());
        hedged_connections_factory.stopChoosingReplicas();
    }
 }
 void HedgedConnections::startNewReplica()
 {
    Connection * connection = nullptr;
    HedgedConnectionsFactory::State state = hedged_connections_factory.startNewConnection(connection);
    /// Check if we need to add hedged_connections_factory file descriptor to epoll.
    if (state == HedgedConnectionsFactory::State::NOT_READY && hedged_connections_factory.numberOfProcessingReplicas() == 1)
        epoll.add(hedged_connections_factory.getFileDescriptor());
    processNewReplicaState(state, connection);
 }
 void HedgedConnections::checkNewReplica()
 {
    Connection * connection = nullptr;
    HedgedConnectionsFactory::State state = hedged_connections_factory.waitForReadyConnections(connection);
    processNewReplicaState(state, connection);
    /// Check if we don't need to listen hedged_connections_factory file descriptor in epoll anymore.
    if (hedged_connections_factory.numberOfProcessingReplicas() == 0)
        epoll.remove(hedged_connections_factory.getFileDescriptor());
 }
 /// React to the state reported by the factory:
 /// READY - attach the new connection to the offset at the front of offsets_queue and replay the pipeline on it;
 /// CANNOT_CHOOSE - drop all queued offsets, throwing if one of them has no active connection left;
 /// NOT_READY - nothing to do yet.
 void HedgedConnections::processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection)
 {
    switch (state)
    {
        case HedgedConnectionsFactory::State::READY:
        {
            const size_t offset = offsets_queue.front();
            offsets_queue.pop();
            OffsetState & offset_state = offset_states[offset];
            ReplicaState & replica = offset_state.replicas.emplace_back(connection);
            ++offset_state.active_connection_count;
            offset_state.next_replica_in_process = false;
            ++active_connection_count;
            /// The new replica is the last one in the list for its offset.
            const ReplicaLocation location{offset, offset_state.replicas.size() - 1};
            epoll.add(replica.packet_receiver->getFileDescriptor());
            fd_to_replica_location[replica.packet_receiver->getFileDescriptor()] = location;
            epoll.add(replica.change_replica_timeout.getDescriptor());
            timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = location;
            pipeline_for_new_replicas.run(replica);
            break;
        }
        case HedgedConnectionsFactory::State::CANNOT_CHOOSE:
        {
            while (!offsets_queue.empty())
            {
                OffsetState & waiting_state = offset_states[offsets_queue.front()];
                /// Check if there is no active replica with needed offsets.
                if (waiting_state.active_connection_count == 0)
                    throw Exception("Cannot find enough connections to replicas", ErrorCodes::ALL_CONNECTION_TRIES_FAILED);
                waiting_state.next_replica_in_process = false;
                offsets_queue.pop();
            }
            break;
        }
        case HedgedConnectionsFactory::State::NOT_READY:
            break;
    }
 }
 /// Stop working with the replica: cancel its packet receiver, reset its timer, remove its
 /// descriptors from epoll and from the lookup maps, and decrement the active connection counters.
 /// If `disconnect` is true the connection is also disconnected. In any case
 /// replica.connection is set to nullptr afterwards.
 void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect)
 {
    replica.packet_receiver->cancel();
    replica.change_replica_timeout.reset();
    epoll.remove(replica.packet_receiver->getFileDescriptor());
    /// The offset is looked up through the map, so this must happen before the entry is erased below.
    --offset_states[fd_to_replica_location[replica.packet_receiver->getFileDescriptor()].offset].active_connection_count;
    fd_to_replica_location.erase(replica.packet_receiver->getFileDescriptor());
    epoll.remove(replica.change_replica_timeout.getDescriptor());
    timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor());
    --active_connection_count;
    if (disconnect)
        replica.connection->disconnect();
    replica.connection = nullptr;
 }
 }
 #endif
--- a/src/Client/HedgedConnections.h
+++ b/src/Client/HedgedConnections.h
@ -0,0 +1,189 @@
 #pragma once
 #if defined(OS_LINUX)
 #include <functional>
 #include <queue>
 #include <optional>
 #include <Client/HedgedConnectionsFactory.h>
 #include <Client/IConnections.h>
 #include <Client/PacketReceiver.h>
 #include <Common/FiberStack.h>
 #include <Common/Fiber.h>
 namespace DB
 {
 /** To receive data from multiple replicas (connections) from one shard asynchronously.
  * The principle of Hedged Connections is used to reduce tail latency:
  * if we don't receive data from replica and there is no progress in query execution
  * for a long time, we try to get new replica and send query to it,
  * without cancelling working with previous replica. This class
  * supports all functionality that MultiplexedConnections has.
  */
 class HedgedConnections : public IConnections
 {
 public:
    using PacketReceiverPtr = std::unique_ptr<PacketReceiver>;
    /// Per-replica state: the connection, the receiver used to read packets from it
    /// and the timer whose descriptor is polled to decide when to try another replica.
    struct ReplicaState
    {
        explicit ReplicaState(Connection * connection_) : connection(connection_), packet_receiver(std::make_unique<PacketReceiver>(connection_))
        {
        }
        /// Reset to nullptr by finishProcessReplica when we stop working with the replica.
        Connection * connection = nullptr;
        PacketReceiverPtr packet_receiver;
        TimerDescriptor change_replica_timeout;
        bool is_change_replica_timeout_expired = false;
    };
    struct OffsetState
    {
        /// Replicas with the same offset.
        std::vector<ReplicaState> replicas;
        /// The number of active replicas. When can_change_replica is false,
        /// active_connection_count is always <= 1 (because we stopped working with
        /// other replicas with the same offset)
        size_t active_connection_count = 0;
        bool can_change_replica = true;
        /// This flag is true when this offset is in queue for
        /// new replicas. It's needed to process receive timeout
        /// (throw an exception when receive timeout expired and there is no
        /// new replica in process)
        bool next_replica_in_process = false;
    };
    /// We process events in epoll, so we need to determine replica by its
    /// file descriptor. We store map fd -> replica location. To determine
    /// where replica is, we need a replica offset
    /// (the same as parallel_replica_offset), and index, which is needed because
    /// we can have many replicas with same offset (when receive_data_timeout has expired).
    struct ReplicaLocation
    {
        size_t offset;
        size_t index;
    };
    HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_,
                      const Settings & settings_,
                      const ConnectionTimeouts & timeouts_,
                      const ThrottlerPtr & throttler,
                      PoolMode pool_mode,
                      std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr);
    /// Implementation of the IConnections interface.
    void sendScalarsData(Scalars & data) override;
    void sendExternalTablesData(std::vector<ExternalTablesData> & data) override;
    void sendQuery(
        const ConnectionTimeouts & timeouts,
        const String & query,
        const String & query_id,
        UInt64 stage,
        const ClientInfo & client_info,
        bool with_pending_data) override;
    Packet receivePacket() override;
    Packet receivePacketUnlocked(AsyncCallback async_callback) override;
    void disconnect() override;
    void sendCancel() override;
    void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) override;
    Packet drain() override;
    std::string dumpAddresses() const override;
    /// The number of offsets (not the number of individual replica connections).
    size_t size() const override { return offset_states.size(); }
    bool hasActiveConnections() const override { return active_connection_count > 0; }
 private:
    /// If we don't receive data from replica and there is no progress in query
    /// execution for receive_data_timeout, we are trying to get new
    /// replica and send query to it. Beside sending query, there are some
    /// additional actions like sendScalarsData or sendExternalTablesData and we need
    /// to perform these actions in the same order on the new replica. So, we will
    /// save actions with replicas in pipeline to perform them on the new replicas.
    class Pipeline
    {
    public:
        void add(std::function<void(ReplicaState &)> send_function);
        void run(ReplicaState & replica);
    private:
        std::vector<std::function<void(ReplicaState &)>> pipeline;
    };
    Packet receivePacketFromReplica(const ReplicaLocation & replica_location);
    ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {});
    bool resumePacketReceiver(const ReplicaLocation & replica_location);
    void disableChangingReplica(const ReplicaLocation & replica_location);
    void startNewReplica();
    void checkNewReplica();
    void processNewReplicaState(HedgedConnectionsFactory::State state, Connection * connection);
    /// Cancel receiving from the replica, remove its descriptors from epoll and from
    /// the fd maps, decrement the active connection counters and, if `disconnect`
    /// is true, disconnect the connection.
    void finishProcessReplica(ReplicaState & replica, bool disconnect);
    int getReadyFileDescriptor(AsyncCallback async_callback = {});
    HedgedConnectionsFactory hedged_connections_factory;
    /// All replicas in offset_states[offset] are responsible for processing the query
    /// with setting parallel_replica_offset = offset. In common situations
    /// offset_states[offset].replicas.size() = 1 (like in MultiplexedConnections).
    std::vector<OffsetState> offset_states;
    /// Map socket file descriptor to replica location (its offset and index in OffsetState.replicas).
    std::unordered_map<int, ReplicaLocation> fd_to_replica_location;
    /// Map receive data timeout file descriptor to replica location.
    std::unordered_map<int, ReplicaLocation> timeout_fd_to_replica_location;
    /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from
    /// the replica, we push its offset to this queue and start trying to get
    /// new replica.
    std::queue<int> offsets_queue;
    /// The current number of valid connections to the replicas of this shard.
    size_t active_connection_count;
    /// We count offsets in which we can't change replica anymore,
    /// it's needed to cancel choosing new replicas when we
    /// disabled replica changing in all offsets.
    size_t offsets_with_disabled_changing_replica;
    Pipeline pipeline_for_new_replicas;
    /// New replica may not support two-level aggregation due to version incompatibility.
    /// If we didn't disable it, we need to skip this replica.
    bool disable_two_level_aggregation = false;
    /// We will save replica with last received packet
    /// (except cases when packet type is EndOfStream or Exception)
    /// to resume its packet receiver when new packet is needed.
    std::optional<ReplicaLocation> replica_with_last_received_packet;
    Packet last_received_packet;
    Epoll epoll;
    const Settings & settings;
    ThrottlerPtr throttler;
    bool sent_query = false;
    bool cancelled = false;
    mutable std::mutex cancel_mutex;
 };
 }
 #endif
--- a/src/Client/HedgedConnectionsFactory.cpp
+++ b/src/Client/HedgedConnectionsFactory.cpp
@ -0,0 +1,387 @@
 #if defined(OS_LINUX)
 #include <Client/HedgedConnectionsFactory.h>
 #include <Common/typeid_cast.h>
 namespace DB
 {
 namespace ErrorCodes
 {
    extern const int ALL_CONNECTION_TRIES_FAILED;
    extern const int ALL_REPLICAS_ARE_STALE;
    extern const int LOGICAL_ERROR;
 }
 /// Prepare one asynchronous connection establisher per shuffled pool and read
 /// the failover limits from `settings_` (which may be null).
 HedgedConnectionsFactory::HedgedConnectionsFactory(
    const ConnectionPoolWithFailoverPtr & pool_,
    const Settings * settings_,
    const ConnectionTimeouts & timeouts_,
    std::shared_ptr<QualifiedTableName> table_to_check_)
    : pool(pool_)
    , settings(settings_)
    , timeouts(timeouts_)
    , table_to_check(table_to_check_)
    , log(&Poco::Logger::get("HedgedConnectionsFactory"))
 {
    shuffled_pools = pool->getShuffledPools(settings);

    /// replicas[i] corresponds to shuffled_pools[i].
    for (auto & shuffled_pool : shuffled_pools)
        replicas.emplace_back(ConnectionEstablisherAsync(shuffled_pool.pool, &timeouts, settings, log, table_to_check.get()));

    /// Without settings fall back to the compile-time default number of tries.
    if (settings)
        max_tries = size_t{settings->connections_with_failover_max_tries};
    else
        max_tries = size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES};

    fallback_to_stale_replicas = settings && settings->fallback_to_stale_replicas_for_distributed_queries;
 }
 /// On destruction, hand the shuffled pools (whose error_count fields are updated
 /// by processFinishedConnection) back to the pool via updateSharedError.
 HedgedConnectionsFactory::~HedgedConnectionsFactory()
 {
    pool->updateSharedError(shuffled_pools);
 }
 /// Establish connections according to `pool_mode` and return them.
 ///
 /// The number of requested connections is:
 ///   GET_ALL  - one per shuffled pool (all of them are required),
 ///   GET_ONE  - exactly one,
 ///   GET_MANY - settings->max_parallel_replicas (1 if there are no settings).
 /// At least one connection is required unless skip_unavailable_shards is set.
 ///
 /// Throws NetException (ALL_CONNECTION_TRIES_FAILED) if there are not enough usable
 /// connections, Exception (ALL_REPLICAS_ARE_STALE) if there are enough usable
 /// connections but not enough up-to-date ones and stale fallback is disabled,
 /// and Exception (LOGICAL_ERROR) if the shortage cannot be explained.
 std::vector<Connection *> HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode)
 {
    size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1;
    /// Initialised so that it is never read uninitialised even for an unexpected pool_mode value.
    size_t max_entries = 0;
    switch (pool_mode)
    {
        case PoolMode::GET_ALL:
        {
            min_entries = shuffled_pools.size();
            max_entries = shuffled_pools.size();
            break;
        }
        case PoolMode::GET_ONE:
        {
            max_entries = 1;
            break;
        }
        case PoolMode::GET_MANY:
        {
            max_entries = settings ? size_t(settings->max_parallel_replicas) : 1;
            break;
        }
    }

    std::vector<Connection *> connections;
    connections.reserve(max_entries);
    Connection * connection = nullptr;

    /// Try to start establishing connections with max_entries replicas.
    for (size_t i = 0; i != max_entries; ++i)
    {
        ++requested_connections_count;
        State state = startNewConnectionImpl(connection);
        if (state == State::READY)
            connections.push_back(connection);
        if (state == State::CANNOT_CHOOSE)
            break;
    }

    /// Process connections until we get enough READY connections
    /// (work asynchronously with all connections we started).
    /// TODO: when we get GET_ALL mode we can start reading packets from ready
    /// TODO: connection as soon as we got it, not even waiting for the others.
    while (connections.size() < max_entries)
    {
        /// Set blocking = true to avoid busy-waiting here.
        auto state = waitForReadyConnectionsImpl(/*blocking = */true, connection);
        if (state == State::READY)
            connections.push_back(connection);
        else if (state == State::CANNOT_CHOOSE)
        {
            if (connections.size() >= min_entries)
                break;

            /// Determine the reason of not enough replicas.
            /// Check connection failures first: if there are not even enough usable
            /// connections, the replicas are unreachable rather than stale, and the
            /// accumulated fail log is the useful diagnostic.
            if (usable_count < min_entries)
                throw NetException(
                    "All connection tries failed. Log: \n\n" + fail_messages + "\n",
                    DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED);
            if (!fallback_to_stale_replicas && up_to_date_count < min_entries)
                throw Exception(
                    "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(connections.size())
                    + ", needed: " + std::to_string(min_entries),
                    DB::ErrorCodes::ALL_REPLICAS_ARE_STALE);
            throw Exception("Unknown reason of not enough replicas.", ErrorCodes::LOGICAL_ERROR);
        }
    }
    return connections;
 }
 /// Request one more connection. Counts the request in requested_connections_count
 /// and reports NOT_READY instead of CANNOT_CHOOSE while epoll still has
 /// descriptors registered.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnection(Connection *& connection_out)
 {
    ++requested_connections_count;

    const State started = startNewConnectionImpl(connection_out);
    if (started != State::CANNOT_CHOOSE || epoll.empty())
        return started;

    /// We cannot start a new connection, but there are still connections in epoll.
    return State::NOT_READY;
 }
 /// Public entry point: delegates to waitForReadyConnectionsImpl with blocking = false.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnections(Connection *& connection_out)
 {
    return waitForReadyConnectionsImpl(false, connection_out);
 }
 /// Process epoll events; if that ends with CANNOT_CHOOSE (no free up-to-date replica)
 /// and stale replicas are allowed, fall back to the best usable replica.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out)
 {
    const State epoll_state = processEpollEvents(blocking, connection_out);

    /// Only when nothing could be chosen and fallback is allowed do we look at
    /// usable (possibly stale) replicas; otherwise the epoll result is final.
    const bool try_usable_replica = epoll_state == State::CANNOT_CHOOSE && fallback_to_stale_replicas;
    if (try_usable_replica)
        return setBestUsableReplica(connection_out);

    return epoll_state;
 }
 /// Pick the index of the next replica to connect to, scanning round-robin from
 /// last_used_index. Returns -1 if there is no free replica.
 int HedgedConnectionsFactory::getNextIndex()
 {
    /// Every pool is already connected, in process or failed: nothing is free.
    const size_t occupied_pools = entries_count + replicas_in_process_count + failed_pools_count;
    if (occupied_pools >= shuffled_pools.size())
        return -1;

    /// The very first call always starts from replica 0.
    if (last_used_index == -1)
    {
        last_used_index = 0;
        return 0;
    }

    int candidate = last_used_index;
    while (true)
    {
        candidate = (candidate + 1) % shuffled_pools.size();

        /// A replica can be tried if it has no established entry yet and
        /// has not exhausted its tries (max_tries == 0 means no limit here).
        const bool can_try = replicas[candidate].connection_establisher.getResult().entry.isNull()
            && (max_tries == 0 || shuffled_pools[candidate].error_count < max_tries);
        if (can_try)
            break;

        /// We made a complete round, there is no replica to connect.
        if (candidate == last_used_index)
            return -1;
    }

    last_used_index = candidate;
    return candidate;
 }
 /// Keep picking the next free replica and resuming its establisher until one of them
 /// yields something other than CANNOT_CHOOSE, or until no free replica is left.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::startNewConnectionImpl(Connection *& connection_out)
 {
    while (true)
    {
        const int replica_index = getNextIndex();
        if (replica_index == -1)
            return State::CANNOT_CHOOSE;

        const State resumed = resumeConnectionEstablisher(replica_index, connection_out);
        if (resumed != State::CANNOT_CHOOSE)
            return resumed;
    }
 }
 /// Handle events from epoll until a replica becomes READY, there is no ready event
 /// (NOT_READY), or epoll becomes empty (CANNOT_CHOOSE).
 /// Throws Exception (LOGICAL_ERROR) for a descriptor that is in neither fd map.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out)
 {
    while (!epoll.empty())
    {
        const int ready_fd = getReadyFileDescriptor(blocking);
        if (ready_fd == -1)
            return State::NOT_READY;

        if (auto socket_it = fd_to_replica_index.find(ready_fd); socket_it != fd_to_replica_index.end())
        {
            /// Copy the index: resuming may modify the map.
            const int replica_index = socket_it->second;
            const State resumed = resumeConnectionEstablisher(replica_index, connection_out);
            if (resumed == State::NOT_READY)
                continue;

            /// Connection establishing is not in process now, remove all
            /// information about it from epoll.
            removeReplicaFromEpoll(replica_index, ready_fd);
            if (resumed == State::READY)
                return State::READY;
        }
        else if (auto timer_it = timeout_fd_to_replica_index.find(ready_fd); timer_it != timeout_fd_to_replica_index.end())
            replicas[timer_it->second].change_replica_timeout.reset();
        else
            throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR);

        /// We get here only if a new connection should be started
        /// (the change-replica timeout fired or one of the previous connections failed).
        /// Return only if a replica is ready.
        if (startNewConnectionImpl(connection_out) == State::READY)
            return State::READY;
    }
    return State::CANNOT_CHOOSE;
 }
 /// Ask epoll for at most one ready event and return its descriptor.
 /// Returns -1 if the event structure was left untouched by getManyReady.
 int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking)
 {
    epoll_event ready_event;
    /// Sentinel: stays -1 unless epoll fills the event in.
    ready_event.data.fd = -1;

    epoll.getManyReady(1, &ready_event, blocking);

    return ready_event.data.fd;
 }
 /// Resume the connection establisher of replica `index`.
 /// If it finished, the result is handled by processFinishedConnection; otherwise it
 /// returned a file descriptor, which is registered in epoll (once) and NOT_READY is returned.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::resumeConnectionEstablisher(int index, Connection *& connection_out)
 {
    auto resumed = replicas[index].connection_establisher.resume();

    if (const auto * finished = std::get_if<TryResult>(&resumed))
        return processFinishedConnection(index, *finished, connection_out);

    /// Still in process: make sure the descriptor is tracked.
    const int pending_fd = std::get<int>(resumed);
    const bool already_tracked = fd_to_replica_index.contains(pending_fd);
    if (!already_tracked)
        addNewReplicaToEpoll(index, pending_fd);

    return State::NOT_READY;
 }
 /// Account for a finished connection attempt to replica `index`.
 /// Returns READY (and writes the connection to `connection_out`) only for an established,
 /// usable, up-to-date connection that is not filtered out by the two-level aggregation
 /// check; in every other case returns CANNOT_CHOOSE after updating the counters.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::processFinishedConnection(int index, TryResult result, Connection *& connection_out)
 {
    const std::string & fail_message = replicas[index].connection_establisher.getFailMessage();
    if (!fail_message.empty())
        fail_messages += fail_message + "\n";

    /// Failed attempt: record the error for this pool.
    if (result.entry.isNull())
    {
        ShuffledPool & shuffled_pool = shuffled_pools[index];
        LOG_WARNING(
            log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message);
        ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry);
        shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1);
        if (shuffled_pool.error_count >= max_tries)
        {
            ++failed_pools_count;
            ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll);
        }
        return State::CANNOT_CHOOSE;
    }

    /// Established connection: update the counters step by step.
    ++entries_count;
    if (!result.is_usable)
        return State::CANNOT_CHOOSE;

    ++usable_count;
    if (!result.is_up_to_date)
        return State::CANNOT_CHOOSE;

    ++up_to_date_count;
    if (skip_replicas_with_two_level_aggregation_incompatibility && isTwoLevelAggregationIncompatible(&*result.entry))
        return State::CANNOT_CHOOSE;

    replicas[index].is_ready = true;
    ++ready_replicas_count;
    connection_out = &*result.entry;
    return State::READY;
 }
 void HedgedConnectionsFactory::stopChoosingReplicas()
 {
    for (auto & [fd, index] : fd_to_replica_index)
    {
        --replicas_in_process_count;
        epoll.remove(fd);
        replicas[index].connection_establisher.cancel();
    }
    for (auto & [timeout_fd, index] : timeout_fd_to_replica_index)
    {
        replicas[index].change_replica_timeout.reset();
        epoll.remove(timeout_fd);
    }
    fd_to_replica_index.clear();
    timeout_fd_to_replica_index.clear();
 }
 /// Start tracking a replica whose connection is in process: register its socket `fd`
 /// and its change-replica timer (armed with hedged_connection_timeout) in epoll
 /// and in the fd -> index maps.
 void HedgedConnectionsFactory::addNewReplicaToEpoll(int index, int fd)
 {
    ++replicas_in_process_count;

    epoll.add(fd);
    fd_to_replica_index[fd] = index;

    /// Add timeout for changing replica.
    auto & change_timeout = replicas[index].change_replica_timeout;
    change_timeout.setRelative(timeouts.hedged_connection_timeout);
    epoll.add(change_timeout.getDescriptor());
    timeout_fd_to_replica_index[change_timeout.getDescriptor()] = index;
 }
 /// Stop tracking a replica whose connection attempt has finished: unregister its
 /// socket `fd` and its change-replica timer from epoll and from the fd -> index maps.
 void HedgedConnectionsFactory::removeReplicaFromEpoll(int index, int fd)
 {
    --replicas_in_process_count;

    epoll.remove(fd);
    fd_to_replica_index.erase(fd);

    auto & change_timeout = replicas[index].change_replica_timeout;
    change_timeout.reset();
    epoll.remove(change_timeout.getDescriptor());
    timeout_fd_to_replica_index.erase(change_timeout.getDescriptor());
 }
 /// The number of requested connections that have not become ready yet.
 /// Reported as 0 whenever epoll has nothing registered.
 int HedgedConnectionsFactory::numberOfProcessingReplicas() const
 {
    if (!epoll.empty())
        return requested_connections_count - ready_replicas_count;

    return 0;
 }
 /// Choose the least stale replica among established, usable, not yet returned
 /// connections (respecting the two-level aggregation filter), mark it ready and
 /// write its connection to `connection_out`.
 /// Returns READY on success and CANNOT_CHOOSE if there is no suitable replica.
 HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out)
 {
    /// Index of the least stale candidate found so far, -1 while there is none.
    int best_index = -1;
    for (size_t i = 0; i != replicas.size(); ++i)
    {
        /// Don't consider unusable, failed replicas and replicas that are ready or in process.
        /// A non-const local copy is needed to take a mutable pointer to the entry's connection.
        TryResult result = replicas[i].connection_establisher.getResult();
        if (result.entry.isNull()
            || !result.is_usable
            || replicas[i].is_ready
            || (skip_replicas_with_two_level_aggregation_incompatibility && isTwoLevelAggregationIncompatible(&*result.entry)))
            continue;

        /// Strict comparison keeps the first of equally stale replicas,
        /// which is the element a stable sort by staleness would put first.
        if (best_index == -1
            || result.staleness < replicas[best_index].connection_establisher.getResult().staleness)
            best_index = static_cast<int>(i);
    }

    if (best_index == -1)
        return State::CANNOT_CHOOSE;

    replicas[best_index].is_ready = true;
    /// The connection is returned outside, so the replica counts as ready
    /// (keeps numberOfProcessingReplicas consistent with processFinishedConnection).
    ++ready_replicas_count;

    TryResult best_result = replicas[best_index].connection_establisher.getResult();
    connection_out = &*best_result.entry;
    return State::READY;
 }
 /// Returns true if the server revision reported by `connection` is lower than
 /// DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD.
 bool HedgedConnectionsFactory::isTwoLevelAggregationIncompatible(Connection * connection)
 {
    return connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD;
 }
 }
 #endif
--- a/src/Client/HedgedConnectionsFactory.h
+++ b/src/Client/HedgedConnectionsFactory.h
@ -0,0 +1,158 @@
 #pragma once
 #if defined(OS_LINUX)
 #include <Common/TimerDescriptor.h>
 #include <Common/Epoll.h>
 #include <Common/FiberStack.h>
 #include <Common/Fiber.h>
 #include <Client/ConnectionEstablisher.h>
 #include <Client/ConnectionPoolWithFailover.h>
 #include <Core/Settings.h>
 #include <unordered_map>
 #include <memory>
 namespace DB
 {
 /** Class for establishing hedged connections with replicas.
  * The process of establishing connection is divided into stages; on each stage, if
  * replica doesn't respond for a long time, we start establishing connection with
  * the next replica, without cancelling working with previous one.
  * It works with multiple replicas simultaneously without blocking by using epoll.
  */
 class HedgedConnectionsFactory
 {
 public:
    using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool;
    using TryResult = PoolWithFailoverBase<IConnectionPool>::TryResult;
    enum class State
    {
        READY,
        NOT_READY,
        CANNOT_CHOOSE,
    };
    struct ReplicaStatus
    {
        explicit ReplicaStatus(ConnectionEstablisherAsync connection_stablisher_) : connection_establisher(std::move(connection_stablisher_))
        {
        }
        ConnectionEstablisherAsync connection_establisher;
        TimerDescriptor change_replica_timeout;
        bool is_ready = false;
    };
    HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_,
                        const Settings * settings_,
                        const ConnectionTimeouts & timeouts_,
                        std::shared_ptr<QualifiedTableName> table_to_check_ = nullptr);
    /// Create and return active connections according to pool_mode.
    std::vector<Connection *> getManyConnections(PoolMode pool_mode);
    /// Try to get connection to the new replica without blocking. Process all current events in epoll (connections, timeouts),
    /// Returned state might be READY (connection established successfully),
    /// NOT_READY (there are no ready events now) and CANNOT_CHOOSE (cannot produce new connection anymore).
    /// If state is READY, replica connection will be written in connection_out.
    State waitForReadyConnections(Connection *& connection_out);
    State startNewConnection(Connection *& connection_out);
    /// Stop working with all replicas that are not READY.
    void stopChoosingReplicas();
    bool hasEventsInProcess() const { return !epoll.empty(); }
    int getFileDescriptor() const { return epoll.getFileDescriptor(); }
    const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; }
    int numberOfProcessingReplicas() const;
    /// Tell Factory to not return connections with two level aggregation incompatibility.
    void skipReplicasWithTwoLevelAggregationIncompatibility() { skip_replicas_with_two_level_aggregation_incompatibility = true; }
    ~HedgedConnectionsFactory();
 private:
    State waitForReadyConnectionsImpl(bool blocking, Connection *& connection_out);
    /// Try to start establishing connection to the new replica. Return
    /// the index of the new replica or -1 if cannot start new connection.
    State startNewConnectionImpl(Connection *& connection_out);
    /// Find an index of the next free replica to start connection.
    /// Return -1 if there is no free replica.
    int getNextIndex();
    int getReadyFileDescriptor(bool blocking);
    void processFailedConnection(int index, const std::string & fail_message);
    State resumeConnectionEstablisher(int index, Connection *& connection_out);
    State processFinishedConnection(int index, TryResult result, Connection *& connection_out);
    void removeReplicaFromEpoll(int index, int fd);
    void addNewReplicaToEpoll(int index, int fd);
    /// Return NOT_READY state if there is no ready events, READY if replica is ready
    /// and CANNOT_CHOOSE if there is no more events in epoll.
    State processEpollEvents(bool blocking, Connection *& connection_out);
    State setBestUsableReplica(Connection *& connection_out);
    bool isTwoLevelAggregationIncompatible(Connection * connection);
    const ConnectionPoolWithFailoverPtr pool;
    const Settings * settings;
    const ConnectionTimeouts timeouts;
    std::vector<ShuffledPool> shuffled_pools;
    std::vector<ReplicaStatus> replicas;
    /// Map socket file descriptor to replica index.
    std::unordered_map<int, int> fd_to_replica_index;
    /// Map timeout for changing replica to replica index.
    std::unordered_map<int, int> timeout_fd_to_replica_index;
    /// If this flag is true, don't return connections with
    /// two level aggregation incompatibility
    bool skip_replicas_with_two_level_aggregation_incompatibility = false;
    std::shared_ptr<QualifiedTableName> table_to_check;
    int last_used_index = -1;
    bool fallback_to_stale_replicas;
    Epoll epoll;
    Poco::Logger * log;
    std::string fail_messages;
    /// The maximum number of attempts to connect to replicas.
    size_t max_tries;
    /// Total number of established connections.
    size_t entries_count = 0;
    /// The number of established connections that are usable.
    size_t usable_count = 0;
    /// The number of established connections that are up to date.
    size_t up_to_date_count = 0;
    /// The number of failed connections (replica is considered failed after max_tries attempts to connect).
    size_t failed_pools_count= 0;
    /// The number of replicas that are in process of connection.
    size_t replicas_in_process_count = 0;
    /// The number of ready replicas (replica is considered ready when it's
    /// connection returns outside).
    size_t ready_replicas_count = 0;
    /// The number of requested in startNewConnection replicas (it's needed for
    /// checking the number of requested replicas that are still in process).
    size_t requested_connections_count = 0;
 };
 }
 #endif
--- a/src/Client/IConnections.h
+++ b/src/Client/IConnections.h
@ -0,0 +1,60 @@
 #pragma once
 #include <Client/Connection.h>
 namespace DB
 {
 /// Base class for working with multiple replicas (connections)
 /// from one shard within a single thread
 class IConnections : boost::noncopyable
 {
 public:
    /// Send all scalars to replicas.
    virtual void sendScalarsData(Scalars & data) = 0;
    /// Send all content of external tables to replicas.
    virtual void sendExternalTablesData(std::vector<ExternalTablesData> & data) = 0;
    /// Send request to replicas.
    virtual void sendQuery(
        const ConnectionTimeouts & timeouts,
        const String & query,
        const String & query_id,
        UInt64 stage,
        const ClientInfo & client_info,
        bool with_pending_data) = 0;
    /// Get packet from any replica.
    virtual Packet receivePacket() = 0;
    /// Version of `receivePacket` function without locking.
    virtual Packet receivePacketUnlocked(AsyncCallback async_callback) = 0;
    /// Break all active connections.
    virtual void disconnect() = 0;
    /// Send a request to replicas to cancel the request
    virtual void sendCancel() = 0;
    /// Send parts' uuids to replicas to exclude them from query processing
    virtual void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) = 0;
    /** On each replica, read and skip all packets to EndOfStream or Exception.
      * Returns EndOfStream if no exception has been received. Otherwise
      * returns the last received packet of type Exception.
      */
    virtual Packet drain() = 0;
    /// Get the replica addresses as a string.
    virtual std::string dumpAddresses() const = 0;
    /// Returns the number of replicas.
    virtual size_t size() const = 0;
    /// Check if there are any valid replicas.
    virtual bool hasActiveConnections() const = 0;
    /// Virtual destructor: implementations may be destroyed through a pointer to this interface.
    virtual ~IConnections() = default;
 };
 }
--- a/src/Client/MultiplexedConnections.cpp
+++ b/src/Client/MultiplexedConnections.cpp
@ -158,7 +158,7 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuid
 Packet MultiplexedConnections::receivePacket()
 {
    std::lock_guard lock(cancel_mutex);
-    Packet packet = receivePacketUnlocked();
+    Packet packet = receivePacketUnlocked({});
    return packet;
 }
@ -206,7 +206,7 @@ Packet MultiplexedConnections::drain()
    while (hasActiveConnections())
    {
-        Packet packet = receivePacketUnlocked();
+        Packet packet = receivePacketUnlocked({});
        switch (packet.type)
        {
@ -253,7 +253,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const
    return buf.str();
 }
-Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback)
+Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callback)
 {
    if (!sent_query)
        throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR);
@ -265,7 +265,11 @@ Packet MultiplexedConnections::receivePacketUnlocked(std::function<void(Poco::Ne
    if (current_connection == nullptr)
        throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA);
-    Packet packet = current_connection->receivePacket(std::move(async_callback));
+    Packet packet;
    {
        AsyncCallbackSetter async_setter(current_connection, std::move(async_callback));
        packet = current_connection->receivePacket();
    }
    switch (packet.type)
    {
--- a/src/Client/MultiplexedConnections.h
+++ b/src/Client/MultiplexedConnections.h
@ -5,6 +5,7 @@
 #include <Client/Connection.h>
 #include <Client/ConnectionPoolWithFailover.h>
 #include <IO/ConnectionTimeouts.h>
 #include <Client/IConnections.h>
 namespace DB
 {
@ -16,7 +17,7 @@ namespace DB
  *
  * The interface is almost the same as Connection.
  */
-class MultiplexedConnections final : private boost::noncopyable
+class MultiplexedConnections final : public IConnections
 {
 public:
    /// Accepts ready connection.
@ -27,52 +28,38 @@ public:
        std::vector<IConnectionPool::Entry> && connections,
        const Settings & settings_, const ThrottlerPtr & throttler_);
-    /// Send all scalars to replicas.
+    void sendScalarsData(Scalars & data) override;
-    void sendScalarsData(Scalars & data);
+    void sendExternalTablesData(std::vector<ExternalTablesData> & data) override;
    /// Send all content of external tables to replicas.
    void sendExternalTablesData(std::vector<ExternalTablesData> & data);
    /// Send request to replicas.
    void sendQuery(
        const ConnectionTimeouts & timeouts,
        const String & query,
        const String & query_id,
        UInt64 stage,
        const ClientInfo & client_info,
-        bool with_pending_data);
+        bool with_pending_data) override;
-    /// Get packet from any replica.
+    Packet receivePacket() override;
    Packet receivePacket();
-    /// Break all active connections.
+    void disconnect() override;
    void disconnect();
-    /// Send a request to the replica to cancel the request
+    void sendCancel() override;
    void sendCancel();
    /// Send parts' uuids to replicas to exclude them from query processing
-    void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids);
+    void sendIgnoredPartUUIDs(const std::vector<UUID> & uuids) override;
-    /** On each replica, read and skip all packets to EndOfStream or Exception.
+    Packet drain() override;
      * Returns EndOfStream if no exception has been received. Otherwise
      * returns the last received packet of type Exception.
      */
    Packet drain();
-    /// Get the replica addresses as a string.
+    std::string dumpAddresses() const override;
    std::string dumpAddresses() const;
    /// Returns the number of replicas.
    /// Without locking, because sendCancel() does not change this number.
-    size_t size() const { return replica_states.size(); }
+    size_t size() const override { return replica_states.size(); }
    /// Check if there are any valid replicas.
    /// Without locking, because sendCancel() does not change the state of the replicas.
-    bool hasActiveConnections() const { return active_connection_count > 0; }
+    bool hasActiveConnections() const override { return active_connection_count > 0; }
 private:
-    /// Internal version of `receivePacket` function without locking.
+    Packet receivePacketUnlocked(AsyncCallback async_callback) override;
    Packet receivePacketUnlocked(std::function<void(Poco::Net::Socket &)> async_callback = {});
    /// Internal version of `dumpAddresses` function without locking.
    std::string dumpAddressesUnlocked() const;
--- a/src/Client/PacketReceiver.h
+++ b/src/Client/PacketReceiver.h
@ -0,0 +1,161 @@
 #pragma once
 #if defined(OS_LINUX)
 #include <variant>
 #include <Client/IConnections.h>
 #include <Common/FiberStack.h>
 #include <Common/Fiber.h>
 #include <Common/Epoll.h>
 #include <Common/TimerDescriptor.h>
 namespace DB
 {
 /// Class for nonblocking packet receiving. It runs connection->receivePacket
 /// in a fiber and sets a special read callback which is called when
 /// reading from the socket would block. The socket and the receive timeout timer
 /// are registered in epoll once, in the constructor. When the read callback is called,
 /// the receive timeout is armed and execution returns to the main program.
 /// So, you can poll this epoll file descriptor to determine when to resume
 /// packet receiving.
 class PacketReceiver
 {
 public:
    /// Registers the receive timeout timer and the connection's socket in the epoll
    /// and creates the fiber that runs Routine on fiber_stack.
    explicit PacketReceiver(Connection * connection_) : connection(connection_)
    {
        epoll.add(receive_timeout.getDescriptor());
        epoll.add(connection->getSocket()->impl()->sockfd());
        fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
    }
    /// Resume packet receiving.
    /// The returned variant holds one of:
    ///  - Poco::Timespan (default-constructed) if the receive timeout expired;
    ///  - int, the epoll file descriptor, if reading is still in process and the caller should poll again;
    ///  - Packet, if a whole packet was received.
    /// An exception saved inside the fiber is rethrown here.
    std::variant<int, Packet, Poco::Timespan> resume()
    {
        /// If there is no pending data, check receive timeout.
        if (!connection->hasReadPendingData() && !checkReceiveTimeout())
        {
            /// Receive timeout expired.
            return Poco::Timespan();
        }
        /// Resume fiber.
        fiber = std::move(fiber).resume();
        if (exception)
            std::rethrow_exception(std::move(exception));
        if (is_read_in_process)
            return epoll.getFileDescriptor();
        /// Receiving packet was finished.
        return std::move(packet);
    }
    /// Stop receiving: the fiber is moved into a local object that is destroyed at the end
    /// of the scope, and the connection pointer is dropped.
    void cancel()
    {
        Fiber to_destroy = std::move(fiber);
        connection = nullptr;
    }
    /// Descriptor of the internal epoll, suitable for polling from outside.
    int getFileDescriptor() const { return epoll.getFileDescriptor(); }
 private:
    /// When epoll file descriptor is ready, check if it's an expired timeout.
    /// Return false if receive timeout expired and socket is not ready, return true otherwise.
    /// Note: the epoll is queried in blocking mode, so this call waits until at least one
    /// of the two descriptors is ready.
    bool checkReceiveTimeout()
    {
        bool is_socket_ready = false;
        bool is_receive_timeout_expired = false;
        epoll_event events[2];
        events[0].data.fd = events[1].data.fd = -1;
        size_t ready_count = epoll.getManyReady(2, events, true);
        for (size_t i = 0; i != ready_count; ++i)
        {
            if (events[i].data.fd == connection->getSocket()->impl()->sockfd())
                is_socket_ready = true;
            if (events[i].data.fd == receive_timeout.getDescriptor())
                is_receive_timeout_expired = true;
        }
        /// Socket readiness wins: the timeout is reported only when the socket has nothing to read.
        if (is_receive_timeout_expired && !is_socket_ready)
        {
            receive_timeout.reset();
            return false;
        }
        return true;
    }
    /// Body of the fiber: receives packets in a loop and yields back to the caller after each one.
    struct Routine
    {
        PacketReceiver & receiver;
        /// Callback installed on the connection for the duration of receivePacket().
        /// It arms the receive timeout, marks the read as in process and yields to `sink`;
        /// after being resumed it clears the flag and resets the timer.
        struct ReadCallback
        {
            PacketReceiver & receiver;
            Fiber & sink;
            void operator()(int, const Poco::Timespan & timeout, const std::string &)
            {
                receiver.receive_timeout.setRelative(timeout);
                receiver.is_read_in_process = true;
                sink = std::move(sink).resume();
                receiver.is_read_in_process = false;
                receiver.receive_timeout.reset();
            }
        };
        Fiber operator()(Fiber && sink)
        {
            try
            {
                while (true)
                {
                    {
                        /// The callback is installed only for this scope.
                        AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink});
                        receiver.packet = receiver.connection->receivePacket();
                    }
                    /// A packet is ready in receiver.packet: hand control back to resume().
                    sink = std::move(sink).resume();
                }
            }
            catch (const boost::context::detail::forced_unwind &)
            {
                /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited
                /// It should not be caught or it will segfault.
                /// Other exceptions must be caught
                throw;
            }
            catch (...)
            {
                /// Saved here and rethrown from resume() on the caller's side.
                receiver.exception = std::current_exception();
            }
            return std::move(sink);
        }
    };
    /// Not owned; set to nullptr by cancel().
    Connection * connection;
    /// Last packet received by the fiber; moved out by resume().
    Packet packet;
    Fiber fiber;
    FiberStack fiber_stack;
    /// We use timer descriptor for checking socket receive timeout.
    TimerDescriptor receive_timeout;
    /// The socket file descriptor and the timer descriptor with receive timeout are added
    /// to this epoll in the constructor, so we can return epoll file descriptor outside for polling.
    Epoll epoll;
    /// If an exception occurred in fiber resume, we save it and rethrow.
    std::exception_ptr exception;
    /// True while the fiber is suspended inside ReadCallback waiting for data.
    bool is_read_in_process = false;
 };
 }
 #endif
--- a/src/Client/ya.make
+++ b/src/Client/ya.make
@ -11,7 +11,10 @@ PEERDIR(
 SRCS(
    Connection.cpp
    ConnectionEstablisher.cpp
    ConnectionPoolWithFailover.cpp
    HedgedConnections.cpp
    HedgedConnectionsFactory.cpp
    MultiplexedConnections.cpp
    TimeoutSetter.cpp
--- a/src/Columns/ColumnAggregateFunction.h
+++ b/src/Columns/ColumnAggregateFunction.h
@ -198,6 +198,11 @@ public:
        throw Exception("Method compareColumn is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
    }
    /// Not supported for this column type: always throws NOT_IMPLEMENTED.
    bool hasEqualValues() const override
    {
        throw Exception("Method hasEqualValues is not supported for ColumnAggregateFunction", ErrorCodes::NOT_IMPLEMENTED);
    }
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const override;
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@ -370,6 +370,10 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num,
                                        compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 bool ColumnArray::hasEqualValues() const
 {
    return hasEqualValuesImpl<ColumnArray>();
 }
 namespace
 {
--- a/src/Columns/ColumnArray.h
+++ b/src/Columns/ColumnArray.h
@ -78,6 +78,7 @@ public:
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator & collator) const override;
    bool hasEqualValues() const override;
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
    void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
--- a/src/Columns/ColumnCompressed.h
+++ b/src/Columns/ColumnCompressed.h
@ -96,6 +96,10 @@ public:
    {
        throwMustBeDecompressed();
    }
    /// Not available on a compressed column: reports the error through throwMustBeDecompressed(),
    /// like the neighbouring methods (presumably that helper never returns — TODO confirm).
    bool hasEqualValues() const override
    {
        throwMustBeDecompressed();
    }
    void getPermutation(bool, size_t, int, Permutation &) const override { throwMustBeDecompressed(); }
    void updatePermutation(bool, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeDecompressed(); }
    ColumnPtr replicate(const Offsets &) const override { throwMustBeDecompressed(); }
--- a/src/Columns/ColumnConst.h
+++ b/src/Columns/ColumnConst.h
@ -206,6 +206,8 @@ public:
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    /// Always true. NOTE(review): presumably because a constant column repeats a single value — confirm.
    bool hasEqualValues() const override { return true; }
    MutableColumns scatter(ColumnIndex num_columns, const Selector & selector) const override;
    void gather(ColumnGathererStream &) override
--- a/src/Columns/ColumnDecimal.cpp
+++ b/src/Columns/ColumnDecimal.cpp
@ -58,6 +58,12 @@ void ColumnDecimal<T>::compareColumn(const IColumn & rhs, size_t rhs_row_num,
                                                         compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 template <typename T>
 bool ColumnDecimal<T>::hasEqualValues() const
 {
    return this->template hasEqualValuesImpl<ColumnDecimal<T>>();
 }
 template <typename T>
 StringRef ColumnDecimal<T>::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
 {
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@ -136,6 +136,7 @@ public:
    void compareColumn(const IColumn & rhs, size_t rhs_row_num,
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    bool hasEqualValues() const override;
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges& equal_range) const override;
--- a/src/Columns/ColumnFixedString.h
+++ b/src/Columns/ColumnFixedString.h
@ -132,6 +132,11 @@ public:
                                               compare_results, direction, nan_direction_hint);
    }
    /// Returns true if all rows of the column compare equal (also true for an empty column).
    /// Delegates to the generic implementation hasEqualValuesImpl.
    bool hasEqualValues() const override
    {
        return hasEqualValuesImpl<ColumnFixedString>();
    }
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
--- a/src/Columns/ColumnFunction.h
+++ b/src/Columns/ColumnFunction.h
@ -128,6 +128,11 @@ public:
        throw Exception("compareColumn is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }
    /// Not supported for this column type: always throws NOT_IMPLEMENTED.
    bool hasEqualValues() const override
    {
        throw Exception("hasEqualValues is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }
    void getPermutation(bool, size_t, int, Permutation &) const override
    {
        throw Exception("getPermutation is not implemented for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
--- a/src/Columns/ColumnLowCardinality.cpp
+++ b/src/Columns/ColumnLowCardinality.cpp
@ -311,6 +311,13 @@ void ColumnLowCardinality::compareColumn(const IColumn & rhs, size_t rhs_row_num
            compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows hold the same value.
 /// A dictionary with at most one entry cannot produce different values, so the indexes
 /// are inspected only when the dictionary has two or more entries.
 bool ColumnLowCardinality::hasEqualValues() const
 {
    const bool dictionary_is_trivial = getDictionary().size() <= 1;
    return dictionary_is_trivial || getIndexes().hasEqualValues();
 }
 void ColumnLowCardinality::getPermutationImpl(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, const Collator * collator) const
 {
    if (limit == 0)
--- a/src/Columns/ColumnLowCardinality.h
+++ b/src/Columns/ColumnLowCardinality.h
@ -126,6 +126,8 @@ public:
    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator &) const override;
    bool hasEqualValues() const override;
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_range) const override;
--- a/src/Columns/ColumnMap.cpp
+++ b/src/Columns/ColumnMap.cpp
@ -187,6 +187,11 @@ void ColumnMap::compareColumn(const IColumn & rhs, size_t rhs_row_num,
                                        compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 bool ColumnMap::hasEqualValues() const
 {
    return hasEqualValuesImpl<ColumnMap>();
 }
 void ColumnMap::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
 {
    nested->getPermutation(reverse, limit, nan_direction_hint, res);
--- a/src/Columns/ColumnMap.h
+++ b/src/Columns/ColumnMap.h
@ -72,6 +72,7 @@ public:
    void compareColumn(const IColumn & rhs, size_t rhs_row_num,
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    bool hasEqualValues() const override;
    void getExtremes(Field & min, Field & max) const override;
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const override;
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@ -271,6 +271,11 @@ void ColumnNullable::compareColumn(const IColumn & rhs, size_t rhs_row_num,
                                           compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 bool ColumnNullable::hasEqualValues() const
 {
    return hasEqualValuesImpl<ColumnNullable>();
 }
 void ColumnNullable::getPermutationImpl(bool reverse, size_t limit, int null_direction_hint, Permutation & res, const Collator * collator) const
 {
    /// Cannot pass limit because of unknown amount of NULLs.
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@ -94,6 +94,7 @@ public:
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int null_direction_hint, const Collator &) const override;
    bool hasEqualValues() const override;
    void getPermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int null_direction_hint, Permutation & res, EqualRanges & equal_range) const override;
    void getPermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int null_direction_hint, Permutation & res) const override;
--- a/src/Columns/ColumnString.cpp
+++ b/src/Columns/ColumnString.cpp
@ -287,6 +287,11 @@ void ColumnString::compareColumn(
                                         compare_results, direction, nan_direction_hint);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 bool ColumnString::hasEqualValues() const
 {
    return hasEqualValuesImpl<ColumnString>();
 }
 template <bool positive>
 struct ColumnString::Cmp
 {
--- a/src/Columns/ColumnString.h
+++ b/src/Columns/ColumnString.h
@ -240,6 +240,8 @@ public:
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    bool hasEqualValues() const override;
    /// Variant of compareAt for string comparison with respect of collation.
    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const override;
--- a/src/Columns/ColumnTuple.cpp
+++ b/src/Columns/ColumnTuple.cpp
@ -312,6 +312,11 @@ int ColumnTuple::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs,
    return compareAtImpl(n, m, rhs, nan_direction_hint, &collator);
 }
 /// Returns true if all rows of the column compare equal (also true for an empty column).
 /// Delegates to the generic implementation hasEqualValuesImpl.
 bool ColumnTuple::hasEqualValues() const
 {
    return hasEqualValuesImpl<ColumnTuple>();
 }
 template <bool positive>
 struct ColumnTuple::Less
 {
--- a/src/Columns/ColumnTuple.h
+++ b/src/Columns/ColumnTuple.h
@ -76,6 +76,7 @@ public:
                       PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                       int direction, int nan_direction_hint) const override;
    int compareAtWithCollation(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint, const Collator & collator) const override;
    bool hasEqualValues() const override;
    void getExtremes(Field & min, Field & max) const override;
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const override;
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@ -205,6 +205,11 @@ public:
                                                    compare_results, direction, nan_direction_hint);
    }
    /// Returns true if all rows of the column compare equal (also true for an empty column).
    /// Delegates to the generic implementation hasEqualValuesImpl.
    bool hasEqualValues() const override
    {
        return this->template hasEqualValuesImpl<Self>();
    }
    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_range) const override;
--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@ -266,6 +266,9 @@ public:
                               PaddedPODArray<UInt64> * row_indexes, PaddedPODArray<Int8> & compare_results,
                               int direction, int nan_direction_hint) const = 0;
    /// Check if all elements in the column have equal values. Return true if column is empty.
    virtual bool hasEqualValues() const = 0;
    /** Returns a permutation that sorts elements of this column,
      *  i.e. perm[i]-th element of source column should be i-th element of sorted column.
      * reverse - reverse ordering (ascending).
@ -467,6 +470,9 @@ protected:
                         PaddedPODArray<UInt64> * row_indexes,
                         PaddedPODArray<Int8> & compare_results,
                         int direction, int nan_direction_hint) const;
    template <typename Derived>
    bool hasEqualValuesImpl() const;
 };
 using ColumnPtr = IColumn::Ptr;
--- a/src/Columns/IColumnDummy.h
+++ b/src/Columns/IColumnDummy.h
@ -40,6 +40,8 @@ public:
    {
    }
    /// Always true. NOTE(review): presumably because a dummy column holds no real values
    /// (the value accessors of this class throw "Cannot get value") — confirm.
    bool hasEqualValues() const override { return true; }
    Field operator[](size_t) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
    void get(size_t, Field &) const override { throw Exception("Cannot get value from " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
    void insert(const Field &) override { throw Exception("Cannot insert element into " + getName(), ErrorCodes::NOT_IMPLEMENTED); }
--- a/src/Columns/IColumnImpl.h
+++ b/src/Columns/IColumnImpl.h
@ -127,4 +127,16 @@ void IColumn::doCompareColumn(const Derived & rhs, size_t rhs_row_num,
    }
 }
 /// Generic implementation of hasEqualValues(): compares every row with row 0 using compareAt.
 /// Returns true for an empty or single-row column (the loop body is never entered).
 template <typename Derived>
 bool IColumn::hasEqualValuesImpl() const
 {
    const auto & self = static_cast<const Derived &>(*this);
    const size_t num_rows = size();
    for (size_t i = 1; i < num_rows; ++i)
    {
        /// The last argument is nan_direction_hint (an int). It must be non-zero:
        /// comparators that return +/-hint when exactly one side is NaN/NULL would otherwise
        /// return 0 and make e.g. [NULL, 1] look like a column of equal values.
        /// NOTE(review): relies on compareAt implementations following the +/-hint convention — confirm.
        /// The sign of the hint is irrelevant here because the result is only compared with 0.
        if (compareAt(i, 0, self, /* nan_direction_hint = */ 1) != 0)
            return false;
    }
    return true;
 }
 }
--- a/src/Columns/IColumnUnique.h
+++ b/src/Columns/IColumnUnique.h
@ -172,6 +172,11 @@ public:
    {
        throw Exception("Method compareColumn is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
    }
    /// Not supported for this column type: always throws NOT_IMPLEMENTED.
    bool hasEqualValues() const override
    {
        throw Exception("Method hasEqualValues is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
    }
 };
 using ColumnUniquePtr = IColumnUnique::ColumnUniquePtr;
--- a/src/Common/Epoll.cpp
+++ b/src/Common/Epoll.cpp
@ -0,0 +1,86 @@
 #if defined(OS_LINUX)
 #include "Epoll.h"
 #include <Common/Exception.h>
 #include <unistd.h>
 #include <common/logger_useful.h>
 namespace DB
 {
 namespace ErrorCodes
 {
    extern const int EPOLL_ERROR;
    extern const int LOGICAL_ERROR;
 }
 /// Creates a new epoll instance with no registered descriptors.
 /// A failure of epoll_create1 is reported through throwFromErrno with EPOLL_ERROR.
 Epoll::Epoll() : events_count(0)
 {
    const int created_fd = epoll_create1(0);
    if (created_fd == -1)
        throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR);
    epoll_fd = created_fd;
 }
 /// Move constructor: takes over the epoll descriptor and the event counter of `other`.
 Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count.load())
 {
    /// Leave `other` in a consistent empty state: no descriptor (so its destructor
    /// closes nothing) and no registered events (so empty()/size() stay truthful).
    other.epoll_fd = -1;
    other.events_count.store(0);
 }
 /// Move assignment: releases the descriptor currently owned by this object and
 /// takes over the descriptor and the event counter of `other`.
 Epoll & Epoll::operator=(Epoll && other)
 {
    /// Self-assignment must not lose the descriptor.
    if (this == &other)
        return *this;

    /// Close our own descriptor first, otherwise it would be overwritten and leaked.
    /// The result of close is not checked, same as in the destructor.
    if (epoll_fd != -1)
        close(epoll_fd);

    epoll_fd = other.epoll_fd;
    events_count.store(other.events_count.load());

    /// Leave `other` empty so that it neither closes the descriptor nor reports stale events.
    other.epoll_fd = -1;
    other.events_count.store(0);
    return *this;
 }
 /// Register `fd` in the epoll for EPOLLIN | EPOLLPRI events.
 /// If `ptr` is not null it is stored as the user data of the event, otherwise `fd` itself is stored.
 /// A failure of epoll_ctl is reported through throwFromErrno with EPOLL_ERROR.
 void Epoll::add(int fd, void * ptr)
 {
    epoll_event event;
    event.events = EPOLLIN | EPOLLPRI;
    if (ptr)
        event.data.ptr = ptr;
    else
        event.data.fd = fd;

    if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1)
        throwFromErrno("Cannot add new descriptor to epoll", DB::ErrorCodes::EPOLL_ERROR);

    /// Count the descriptor only after it was really registered,
    /// so a failed epoll_ctl does not leave the counter out of sync.
    ++events_count;
 }
 /// Unregister `fd` from the epoll.
 /// A failure of epoll_ctl is reported through throwFromErrno with EPOLL_ERROR.
 void Epoll::remove(int fd)
 {
    if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, nullptr) == -1)
        throwFromErrno("Cannot remove descriptor from epoll", DB::ErrorCodes::EPOLL_ERROR);

    /// Decrement only after the descriptor was really removed,
    /// so a failed epoll_ctl does not leave the counter out of sync.
    --events_count;
 }
 /// Fetch up to `max_events` ready events into `events_out` and return how many were written.
 /// In blocking mode waits (without timeout) until at least one event is ready;
 /// in non-blocking mode may return 0. Calls interrupted by a signal (EINTR) are retried.
 /// Throws LOGICAL_ERROR if no descriptors are registered.
 size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking) const
 {
    if (events_count == 0)
        throw Exception("There is no events in epoll", ErrorCodes::LOGICAL_ERROR);

    const int wait_timeout = blocking ? -1 : 0;
    int ready_size;
    while (true)
    {
        ready_size = epoll_wait(epoll_fd, events_out, max_events, wait_timeout);
        if (ready_size == -1 && errno != EINTR)
            throwFromErrno("Error in epoll_wait", DB::ErrorCodes::EPOLL_ERROR);

        /// Done when something is ready, or when nothing is ready and we must not wait.
        if (ready_size > 0 || (ready_size == 0 && !blocking))
            break;
    }
    return ready_size;
 }
 /// Closes the epoll descriptor unless it was moved out (epoll_fd == -1).
 Epoll::~Epoll()
 {
    if (epoll_fd == -1)
        return;
    close(epoll_fd);
 }
 }
 #endif
--- a/src/Common/Epoll.h
+++ b/src/Common/Epoll.h
@ -0,0 +1,54 @@
 #pragma once
 #if defined(OS_LINUX)
 #include <sys/epoll.h>
 #include <vector>
 #include <boost/noncopyable.hpp>
 #include <Poco/Logger.h>
 namespace DB
 {
 using AsyncCallback = std::function<void(int, const Poco::Timespan &, const std::string &)>;
 /// Move-only owner of an epoll file descriptor that also counts the registered descriptors.
 class Epoll
 {
 public:
    Epoll();
    Epoll(const Epoll &) = delete;
    Epoll & operator=(const Epoll &) = delete;
    Epoll & operator=(Epoll && other);
    Epoll(Epoll && other);
    /// Add new file descriptor to epoll. If ptr set to nullptr, epoll_event.data.fd = fd,
    /// otherwise epoll_event.data.ptr = ptr.
    void add(int fd, void * ptr = nullptr);
    /// Remove file descriptor from epoll.
    void remove(int fd);
    /// Get events from epoll. Events are written in events_out, this function returns an amount of ready events.
    /// If blocking is false and there are no ready events,
    /// return 0, otherwise wait for ready events.
    size_t getManyReady(int max_events, epoll_event * events_out, bool blocking) const;
    int getFileDescriptor() const { return epoll_fd; }
    /// Number of descriptors currently counted as registered.
    int size() const { return events_count; }
    bool empty() const { return events_count == 0; }
    const std::string & getDescription() const { return fd_description; }
    ~Epoll();
 private:
    /// Owned epoll descriptor; -1 after the object was moved from.
    int epoll_fd;
    std::atomic<int> events_count;
    const std::string fd_description = "epoll";
 };
 }
 #endif
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -539,7 +539,8 @@
    M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \
    M(571, DATABASE_REPLICATION_FAILED) \
    M(572, TOO_MANY_QUERY_PLAN_OPTIMIZATIONS) \
-    M(573, INCORRECT_PART_TYPE) \
+    M(573, EPOLL_ERROR) \
    M(574, INCORRECT_PART_TYPE) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/PoolWithFailoverBase.h
+++ b/src/Common/PoolWithFailoverBase.h
@ -93,6 +93,18 @@ public:
        double staleness = 0.0; /// Helps choosing the "least stale" option when all replicas are stale.
    };
    struct PoolState;
    using PoolStates = std::vector<PoolState>;
    /// One nested pool together with the data used to order the pools and to account errors.
    struct ShuffledPool
    {
        /// The nested pool itself (not owned).
        NestedPool * pool{};
        /// State used as the sort key when the pools are ordered.
        const PoolState * state{};
        /// Position of the pool in nested_pools / shared_pool_states.
        size_t index = 0;
        /// Errors collected for this pool; added to the shared state by updateSharedErrorCounts.
        size_t error_count = 0;
    };
    /// This functor must be provided by a client. It must perform a single try that takes a connection
    /// from the provided pool and checks that it is good.
    using TryGetEntryFunc = std::function<TryResult(NestedPool & pool, std::string & fail_message)>;
@ -113,9 +125,6 @@ public:
            const GetPriorityFunc & get_priority = GetPriorityFunc());
 protected:
    struct PoolState;
    using PoolStates = std::vector<PoolState>;
    /// Returns a single connection.
    Entry get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
@ -124,6 +133,10 @@ protected:
    /// This function returns a copy of pool states to avoid race conditions when modifying shared pool states.
    PoolStates updatePoolStates(size_t max_ignored_errors);
    std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority);
    inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools);
    auto getPoolExtendedStates() const
    {
        std::lock_guard lock(pool_states_mutex);
@ -143,6 +156,46 @@ protected:
    Poco::Logger * log;
 };
 /// Builds the list of nested pools in the order in which they should be tried.
 /// The order is defined by PoolState::compare over a fresh copy of the pool states,
 /// optionally with priorities overridden by `get_priority`.
 /// In the returned entries `state` is always nullptr: the states live only inside this function.
 template <typename TNestedPool>
 std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool>
 PoolWithFailoverBase<TNestedPool>::getShuffledPools(
    size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority)
 {
    /// Update random numbers and error counts.
    PoolStates pool_states = updatePoolStates(max_ignored_errors);
    if (get_priority)
    {
        for (size_t i = 0; i < pool_states.size(); ++i)
            pool_states[i].priority = get_priority(i);
    }
    /// Sort the pools into order in which they will be tried (based on respective PoolStates).
    std::vector<ShuffledPool> shuffled_pools;
    shuffled_pools.reserve(nested_pools.size());
    for (size_t i = 0; i < nested_pools.size(); ++i)
        shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0});
    std::sort(
        shuffled_pools.begin(), shuffled_pools.end(),
        [](const ShuffledPool & lhs, const ShuffledPool & rhs)
        {
            return PoolState::compare(*lhs.state, *rhs.state);
        });
    /// pool_states is a local copy that is destroyed on return, so the pointers into it
    /// must not escape: reset them instead of handing dangling pointers to the caller.
    for (ShuffledPool & shuffled_pool : shuffled_pools)
        shuffled_pool.state = nullptr;
    return shuffled_pools;
 }
 /// Adds the error counters collected in `shuffled_pools` to the shared pool states
 /// under pool_states_mutex; every shared counter is capped at max_error_cap.
 template <typename TNestedPool>
 inline void PoolWithFailoverBase<TNestedPool>::updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools)
 {
    std::lock_guard lock(pool_states_mutex);
    for (size_t pos = 0; pos < shuffled_pools.size(); ++pos)
    {
        const ShuffledPool & shuffled = shuffled_pools[pos];
        auto & shared_state = shared_pool_states[shuffled.index];
        const auto accumulated = shared_state.error_count + shuffled.error_count;
        shared_state.error_count = std::min<UInt64>(max_error_cap, accumulated);
    }
 }
 template <typename TNestedPool>
 typename TNestedPool::Entry
 PoolWithFailoverBase<TNestedPool>::get(size_t max_ignored_errors, bool fallback_to_stale_replicas,
@ -168,33 +221,7 @@ PoolWithFailoverBase<TNestedPool>::getMany(
        const TryGetEntryFunc & try_get_entry,
        const GetPriorityFunc & get_priority)
 {
-    /// Update random numbers and error counts.
+    std::vector<ShuffledPool> shuffled_pools = getShuffledPools(max_ignored_errors, get_priority);
    PoolStates pool_states = updatePoolStates(max_ignored_errors);
    if (get_priority)
    {
        for (size_t i = 0; i < pool_states.size(); ++i)
            pool_states[i].priority = get_priority(i);
    }
    struct ShuffledPool
    {
        NestedPool * pool{};
        const PoolState * state{};
        size_t index = 0;
        size_t error_count = 0;
    };
    /// Sort the pools into order in which they will be tried (based on respective PoolStates).
    std::vector<ShuffledPool> shuffled_pools;
    shuffled_pools.reserve(nested_pools.size());
    for (size_t i = 0; i < nested_pools.size(); ++i)
        shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0});
    std::sort(
            shuffled_pools.begin(), shuffled_pools.end(),
            [](const ShuffledPool & lhs, const ShuffledPool & rhs)
            {
                return PoolState::compare(*lhs.state, *rhs.state);
            });
    /// We will try to get a connection from each pool until a connection is produced or max_tries is reached.
    std::vector<TryResult> try_results(shuffled_pools.size());
@ -206,12 +233,7 @@ PoolWithFailoverBase<TNestedPool>::getMany(
    /// At exit update shared error counts with error counts occurred during this call.
    SCOPE_EXIT(
    {
-        std::lock_guard lock(pool_states_mutex);
+        updateSharedErrorCounts(shuffled_pools);
        for (const ShuffledPool & pool: shuffled_pools)
        {
            auto & pool_state = shared_pool_states[pool.index];
            pool_state.error_count = std::min<UInt64>(max_error_cap, pool_state.error_count + pool.error_count);
        }
    });
    std::string fail_messages;
--- a/src/Common/TimerDescriptor.cpp
+++ b/src/Common/TimerDescriptor.cpp
@ -27,10 +27,16 @@ TimerDescriptor::TimerDescriptor(int clockid, int flags)
        throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL);
 }
 /// Move constructor: takes over the timer descriptor of `other` and sets it to -1 there,
 /// so that the moved-from object does not refer to the same descriptor any more.
 TimerDescriptor::TimerDescriptor(TimerDescriptor && other) : timer_fd(other.timer_fd)
 {
    other.timer_fd = -1;
 }
 TimerDescriptor::~TimerDescriptor()
 {
    /// Do not check for result cause cannot throw exception.
-    close(timer_fd);
+    if (timer_fd != -1)
        close(timer_fd);
 }
 void TimerDescriptor::reset() const
@ -74,7 +80,7 @@ void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const
    spec.it_interval.tv_nsec = 0;
    spec.it_interval.tv_sec = 0;
    spec.it_value.tv_sec = timespan.totalSeconds();
-    spec.it_value.tv_nsec = timespan.useconds();
+    spec.it_value.tv_nsec = timespan.useconds() * 1000;
    if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr))
        throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD);
--- a/src/Common/TimerDescriptor.h
+++ b/src/Common/TimerDescriptor.h
@ -12,12 +12,12 @@ private:
    int timer_fd;
 public:
-    explicit TimerDescriptor(int clockid, int flags);
+    explicit TimerDescriptor(int clockid = CLOCK_MONOTONIC, int flags = 0);
    ~TimerDescriptor();
    TimerDescriptor(const TimerDescriptor &) = delete;
    TimerDescriptor & operator=(const TimerDescriptor &) = delete;
-    TimerDescriptor(TimerDescriptor &&) = default;
+    TimerDescriptor(TimerDescriptor && other);
    TimerDescriptor & operator=(TimerDescriptor &&) = default;
    int getDescriptor() const { return timer_fd; }
--- a/src/Common/ya.make
+++ b/src/Common/ya.make
@ -39,6 +39,7 @@ SRCS(
    DNSResolver.cpp
    Dwarf.cpp
    Elf.cpp
    Epoll.cpp
    ErrorCodes.cpp
    Exception.cpp
    ExternalLoaderStatus.cpp
--- a/src/Coordination/NuKeeperServer.cpp
+++ b/src/Coordination/NuKeeperServer.cpp
@ -80,8 +80,9 @@ void NuKeeperServer::shutdown()
 {
    state_machine->shutdownStorage();
    state_manager->flushLogStore();
-    if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds()))
+    auto timeout = coordination_settings->shutdown_timeout.totalSeconds();
-        LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5);
+    if (!launcher.shutdown(timeout))
        LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", timeout);
 }
 namespace
--- a/src/Core/Defines.h
+++ b/src/Core/Defines.h
@ -11,6 +11,9 @@
 #define DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_SECURE_MS 100
 #define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300
 #define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300
 /// Timeouts for hedged requests.
 #define DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS 100
 #define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 2
 /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus).
 #define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5
 #define DBMS_DEFAULT_POLL_INTERVAL 10
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -55,6 +55,10 @@ class IColumn;
    M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \
    M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \
    M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \
    M(Milliseconds, hedged_connection_timeout, DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS, "Connection timeout for establishing connection with replica for Hedged requests", 0) \
    M(Seconds, receive_data_timeout, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC, "Connection timeout for receiving first packet of data or packet with positive progress from replica", 0) \
    M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \
    M(Bool, allow_changing_replica_until_first_data_packet, false, "Allow HedgedConnections to change replica until receiving first data packet", 0) \
    M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \
    M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \
    M(Milliseconds, replace_running_query_max_wait_ms, 5000, "The wait time for running query with the same query_id to finish when setting 'replace_running_query' is active.", 0) \
@ -215,6 +219,10 @@ class IColumn;
    M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \
    M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \
    \
+    /** Settings for testing hedged requests */ \
+    M(Int64, sleep_in_send_tables_status, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \
+    M(Int64, sleep_in_send_data, 0, "Time to sleep in sending data in TCPHandler", 0) \
+    \
    M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \
    M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \
    M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \
--- a/src/DataStreams/RemoteQueryExecutor.cpp
+++ b/src/DataStreams/RemoteQueryExecutor.cpp
@ -13,6 +13,8 @@
 #include <Interpreters/InternalTextLogsQueue.h>
 #include <IO/ConnectionTimeoutsContext.h>
 #include <Common/FiberStack.h>
+#include <Client/MultiplexedConnections.h>
+#include <Client/HedgedConnections.h>
 #include <Storages/MergeTree/MergeTreeDataPartUUID.h>
 namespace DB
@ -31,23 +33,23 @@ RemoteQueryExecutor::RemoteQueryExecutor(
    : header(header_), query(query_), context(context_)
    , scalars(scalars_), external_tables(external_tables_), stage(stage_)
 {
-    create_multiplexed_connections = [this, &connection, throttler]()
+    create_connections = [this, &connection, throttler]()
    {
        return std::make_unique<MultiplexedConnections>(connection, context.getSettingsRef(), throttler);
    };
 }
 RemoteQueryExecutor::RemoteQueryExecutor(
-    std::vector<IConnectionPool::Entry> && connections,
+    std::vector<IConnectionPool::Entry> && connections_,
    const String & query_, const Block & header_, const Context & context_,
    const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_)
    : header(header_), query(query_), context(context_)
    , scalars(scalars_), external_tables(external_tables_), stage(stage_)
 {
-    create_multiplexed_connections = [this, connections, throttler]() mutable
+    create_connections = [this, connections_, throttler]() mutable
    {
        return std::make_unique<MultiplexedConnections>(
-                std::move(connections), context.getSettingsRef(), throttler);
+                std::move(connections_), context.getSettingsRef(), throttler);
    };
 }
@ -58,23 +60,34 @@ RemoteQueryExecutor::RemoteQueryExecutor(
    : header(header_), query(query_), context(context_)
    , scalars(scalars_), external_tables(external_tables_), stage(stage_)
 {
-    create_multiplexed_connections = [this, pool, throttler]()
+    create_connections = [this, pool, throttler]()->std::unique_ptr<IConnections>
    {
        const Settings & current_settings = context.getSettingsRef();
        auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings);
-        std::vector<IConnectionPool::Entry> connections;
+
+#if defined(OS_LINUX)
+        if (current_settings.use_hedged_requests)
+        {
+            std::shared_ptr<QualifiedTableName> table_to_check = nullptr;
+            if (main_table)
+                table_to_check = std::make_shared<QualifiedTableName>(main_table.getQualifiedName());
+            return std::make_unique<HedgedConnections>(pool, current_settings, timeouts, throttler, pool_mode, table_to_check);
+        }
+#endif
+        std::vector<IConnectionPool::Entry> connection_entries;
        if (main_table)
        {
            auto try_results = pool->getManyChecked(timeouts, &current_settings, pool_mode, main_table.getQualifiedName());
-            connections.reserve(try_results.size());
+            connection_entries.reserve(try_results.size());
            for (auto & try_result : try_results)
-                connections.emplace_back(std::move(try_result.entry));
+                connection_entries.emplace_back(std::move(try_result.entry));
        }
        else
-            connections = pool->getMany(timeouts, &current_settings, pool_mode);
+            connection_entries = pool->getMany(timeouts, &current_settings, pool_mode);
-        return std::make_unique<MultiplexedConnections>(
+        return std::make_unique<MultiplexedConnections>(std::move(connection_entries), current_settings, throttler);
-                std::move(connections), current_settings, throttler);
    };
 }
@ -85,7 +98,7 @@ RemoteQueryExecutor::~RemoteQueryExecutor()
      * these connections did not remain hanging in the out-of-sync state.
      */
    if (established || isQueryPending())
-        multiplexed_connections->disconnect();
+        connections->disconnect();
 }
 /** If we receive a block with slightly different column types, or with excessive columns,
@ -142,10 +155,10 @@ void RemoteQueryExecutor::sendQuery()
    if (sent_query)
        return;
-    multiplexed_connections = create_multiplexed_connections();
+    connections = create_connections();
    const auto & settings = context.getSettingsRef();
-    if (settings.skip_unavailable_shards && 0 == multiplexed_connections->size())
+    if (settings.skip_unavailable_shards && 0 == connections->size())
        return;
    /// Query cannot be canceled in the middle of the send query,
@ -173,10 +186,10 @@ void RemoteQueryExecutor::sendQuery()
    {
        std::lock_guard lock(duplicated_part_uuids_mutex);
        if (!duplicated_part_uuids.empty())
-            multiplexed_connections->sendIgnoredPartUUIDs(duplicated_part_uuids);
+            connections->sendIgnoredPartUUIDs(duplicated_part_uuids);
    }
-    multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true);
+    connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true);
    established = false;
    sent_query = true;
@ -192,7 +205,7 @@ Block RemoteQueryExecutor::read()
    {
        sendQuery();
-        if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
+        if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size()))
            return {};
    }
@ -201,7 +214,7 @@ Block RemoteQueryExecutor::read()
        if (was_cancelled)
            return Block();
-        Packet packet = multiplexed_connections->receivePacket();
+        Packet packet = connections->receivePacket();
        if (auto block = processPacket(std::move(packet)))
            return *block;
@ -218,7 +231,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
    {
        sendQuery();
-        if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size()))
+        if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size()))
            return Block();
    }
@ -228,7 +241,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
        if (was_cancelled)
            return Block();
-        read_context = std::make_unique<ReadContext>(*multiplexed_connections);
+        read_context = std::make_unique<ReadContext>(*connections);
    }
    do
@ -239,7 +252,7 @@ std::variant<Block, int> RemoteQueryExecutor::read(std::unique_ptr<ReadContext>
        if (read_context->is_read_in_progress.load(std::memory_order_relaxed))
        {
            read_context->setTimer();
-            return read_context->epoll_fd;
+            return read_context->epoll.getFileDescriptor();
        }
        else
        {
@ -260,7 +273,7 @@ std::variant<Block, int> RemoteQueryExecutor::restartQueryWithoutDuplicatedUUIDs
 {
    /// Cancel previous query and disconnect before retry.
    cancel(read_context);
-    multiplexed_connections->disconnect();
+    connections->disconnect();
    /// Only resend once, otherwise throw an exception
    if (!resent_query)
@ -300,7 +313,7 @@ std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
            break;
        case Protocol::Server::EndOfStream:
-            if (!multiplexed_connections->hasActiveConnections())
+            if (!connections->hasActiveConnections())
            {
                finished = true;
                return Block();
@ -342,7 +355,7 @@ std::optional<Block> RemoteQueryExecutor::processPacket(Packet packet)
            got_unknown_packet_from_replica = true;
            throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
                toString(packet.type),
-                multiplexed_connections->dumpAddresses());
+                connections->dumpAddresses());
    }
    return {};
@ -382,7 +395,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
    tryCancel("Cancelling query because enough data has been read", read_context);
    /// Get the remaining packets so that there is no out of sync in the connections to the replicas.
-    Packet packet = multiplexed_connections->drain();
+    Packet packet = connections->drain();
    switch (packet.type)
    {
        case Protocol::Server::EndOfStream:
@ -404,7 +417,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr<ReadContext> * read_context)
            got_unknown_packet_from_replica = true;
            throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}",
                toString(packet.type),
-                multiplexed_connections->dumpAddresses());
+                connections->dumpAddresses());
    }
 }
@ -427,14 +440,14 @@ void RemoteQueryExecutor::cancel(std::unique_ptr<ReadContext> * read_context)
 void RemoteQueryExecutor::sendScalars()
 {
-    multiplexed_connections->sendScalarsData(scalars);
+    connections->sendScalarsData(scalars);
 }
 void RemoteQueryExecutor::sendExternalTables()
 {
    SelectQueryInfo query_info;
-    size_t count = multiplexed_connections->size();
+    size_t count = connections->size();
    {
        std::lock_guard lock(external_tables_mutex);
@ -472,7 +485,7 @@ void RemoteQueryExecutor::sendExternalTables()
        }
    }
-    multiplexed_connections->sendExternalTablesData(external_tables_data);
+    connections->sendExternalTablesData(external_tables_data);
 }
 void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadContext> * read_context)
@ -489,11 +502,11 @@ void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr<ReadCon
        if (read_context && *read_context)
            (*read_context)->cancel();
-        multiplexed_connections->sendCancel();
+        connections->sendCancel();
    }
    if (log)
-        LOG_TRACE(log, "({}) {}", multiplexed_connections->dumpAddresses(), reason);
+        LOG_TRACE(log, "({}) {}", connections->dumpAddresses(), reason);
 }
 bool RemoteQueryExecutor::isQueryPending() const
--- a/src/DataStreams/RemoteQueryExecutor.h
+++ b/src/DataStreams/RemoteQueryExecutor.h
@ -1,7 +1,8 @@
 #pragma once
 #include <Client/ConnectionPool.h>
-#include <Client/MultiplexedConnections.h>
+#include <Client/IConnections.h>
+#include <Client/ConnectionPoolWithFailover.h>
 #include <Storages/IStorage_fwd.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/StorageID.h>
@ -40,7 +41,7 @@ public:
    /// Accepts several connections already taken from pool.
    RemoteQueryExecutor(
-        std::vector<IConnectionPool::Entry> && connections,
+        std::vector<IConnectionPool::Entry> && connections_,
        const String & query_, const Block & header_, const Context & context_,
        const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(),
        QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete);
@ -103,8 +104,8 @@ private:
    Block totals;
    Block extremes;
-    std::function<std::unique_ptr<MultiplexedConnections>()> create_multiplexed_connections;
+    std::function<std::unique_ptr<IConnections>()> create_connections;
-    std::unique_ptr<MultiplexedConnections> multiplexed_connections;
+    std::unique_ptr<IConnections> connections;
    const String query;
    String query_id = "";
--- a/Show More
+++ b/Show More
		`@ -1 +1 @@`
			`Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79`				`Subproject commit ee24fa55bc46e4d2ce7d0d052cc5a0d9b1be8c36`
`@ -26,4 +26,4 @@ The name of an additional section can be any, for example, Usage.`

	`- [link](#)`	`- [link](#)`

	`[Original article](https://clickhouse.tech/docs/en/data_types/<data-type-name>/) <!--hide-->`	`[Original article](https://clickhouse.tech/docs/en/data-types/<data-type-name>/) <!--hide-->`