Merge branch 'master' of github.com:ClickHouse/ClickHouse into insert-cluster

This commit is contained in:
feng lv 2021-01-03 08:47:14 +00:00
commit 459abca5b5
139 changed files with 2617 additions and 1703 deletions

View File

@ -68,7 +68,15 @@ include (cmake/find/ccache.cmake)
option(ENABLE_CHECK_HEAVY_BUILDS "Don't allow C++ translation units to compile too long or to take too much memory while compiling" OFF)
if (ENABLE_CHECK_HEAVY_BUILDS)
set (CMAKE_CXX_COMPILER_LAUNCHER prlimit --rss=10000000 --cpu=600)
# set DATA (since RSS does not work since 2.6.x+) to 5G
set (RLIMIT_DATA 5000000000)
# set VIRT (RLIMIT_AS) to 10G (DATA*2)
set (RLIMIT_AS 10000000000)
# gcc10/clang -fsanitize=memory is too heavy
if (SANITIZE STREQUAL "memory" OR COMPILER_GCC)
set (RLIMIT_DATA 10000000000)
endif()
set (CMAKE_CXX_COMPILER_LAUNCHER prlimit --as=${RLIMIT_AS} --data=${RLIMIT_DATA} --cpu=600)
endif ()
if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None")
@ -187,13 +195,14 @@ endif ()
option(ADD_GDB_INDEX_FOR_GOLD "Add .gdb-index to resulting binaries for gold linker.")
if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
if (LINKER_NAME STREQUAL "lld")
# Can be lld or ld-lld.
if (LINKER_NAME MATCHES "lld$")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gdb-index")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gdb-index")
message (STATUS "Adding .gdb-index via --gdb-index linker option.")
# we use another tool for gdb-index, because the gold linker removes the .debug_aranges section, which is used inside clickhouse stacktraces
# http://sourceware-org.1504.n7.nabble.com/gold-No-debug-aranges-section-when-linking-with-gdb-index-td540965.html#a556932
elseif (LINKER_NAME STREQUAL "gold" AND ADD_GDB_INDEX_FOR_GOLD)
elseif (LINKER_NAME MATCHES "gold$" AND ADD_GDB_INDEX_FOR_GOLD)
find_program (GDB_ADD_INDEX_EXE NAMES "gdb-add-index" DOC "Path to gdb-add-index executable")
if (NOT GDB_ADD_INDEX_EXE)
set (USE_GDB_ADD_INDEX 0)

View File

@ -31,7 +31,6 @@ set_property(DIRECTORY PROPERTY EXCLUDE_FROM_ALL 1)
add_subdirectory (antlr4-runtime-cmake)
add_subdirectory (boost-cmake)
add_subdirectory (cctz-cmake)
add_subdirectory (consistent-hashing-sumbur)
add_subdirectory (consistent-hashing)
add_subdirectory (dragonbox-cmake)
add_subdirectory (FastMemcpy)

View File

@ -1,2 +0,0 @@
add_library(consistent-hashing-sumbur sumbur.cpp)
target_include_directories(consistent-hashing-sumbur PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

View File

@ -1,113 +0,0 @@
//Copyright (c) 2011-2012 Mail.RU
//Copyright (c) 2011-2012 Maksim Kalinchenko
//Copyright (c) 2012 Sokolov Yura aka funny-falcon
//
//MIT License
//
//Permission is hereby granted, free of charge, to any person obtaining
//a copy of this software and associated documentation files (the
//"Software"), to deal in the Software without restriction, including
//without limitation the rights to use, copy, modify, merge, publish,
//distribute, sublicense, and/or sell copies of the Software, and to
//permit persons to whom the Software is furnished to do so, subject to
//the following conditions:
//
//The above copyright notice and this permission notice shall be
//included in all copies or substantial portions of the Software.
//
//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
//MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
//LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
//OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
//WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include <stdexcept>
#define L 0xFFFFFFFF
static unsigned int L27_38[] = {L / 27, L / 28, L / 29, L / 30, L / 31, L / 32,
L / 33, L / 34, L / 35, L / 36, L / 37, L / 38,
L / 39, L / 40, L / 41, L / 42, L / 43, L / 44,
L / 45, L / 46, L / 47, L / 48, L / 49, L / 50,
L / 51, L / 52, L / 53, L / 54, L / 55, L / 56,
L / 57, L / 58, L / 59, L / 60, L / 61, L / 62
};
static unsigned int LL27_38[] = {L/(26*27), L/(27*28), L/(28*29), L/(29*30), L/(30*31), L/(31*32),
L/(32*33), L/(33*34), L/(34*35), L/(35*36), L/(36*37), L/(37*38),
L/(38*39), L/(39*40), L/(40*41), L/(41*42), L/(42*43), L/(43*44),
L/(44*45), L/(45*46), L/(46*47), L/(47*48), L/(48*49), L/(49*50),
L/(50*51), L/(51*52), L/(52*53), L/(53*54), L/(54*55), L/(55*56),
L/(56*57), L/(57*58), L/(58*59), L/(59*60), L/(60*61), L/(61*62)
};
unsigned int sumburConsistentHash(unsigned int hashed_int, unsigned int capacity)
{
unsigned int h = hashed_int;
unsigned int capa = capacity;
unsigned int part, n, i, c;
if (capa == 0)
throw std::runtime_error("Sumbur is not applicable to empty cluster");
part = L / capa;
if (L - h < part) return 0;
n = 1;
do {
if (h >= L / 2) h -= L / 2;
else {
n = 2;
if (L / 2 - h < part) return 1;
}
if (capa == 2) return 1;
#define curslice(i) (L / (i * (i - 1)))
#define unroll(i) \
if (curslice(i) <= h) h -= curslice(i); \
else { \
h += curslice(i) * (i - n - 1); \
n = i; \
if (L / i - h < part) return n-1; \
} \
if (capa == i) return (n-1)
unroll(3); unroll(4); unroll(5);
unroll(6); unroll(7); unroll(8);
unroll(9); unroll(10); unroll(11);
unroll(12); unroll(13); unroll(14);
unroll(15); unroll(16); unroll(17);
unroll(18); unroll(19); unroll(20);
unroll(21); unroll(22); unroll(23);
unroll(24); unroll(25); unroll(26);
for (i = 27; i <= capa && i <= 62; i++) {
c = LL27_38[i-27];
if (c <= h) {
h -= c;
}
else {
h += c * (i - n - 1);
n = i;
if (L27_38[i-27] - h < part) return n-1;
}
}
for(i = 63; i <= capa; i++) {
c = L / (i * (i - 1));
if (c <= h) {
h -= c;
}
else {
h += c * (i - n - 1);
n = i;
if (L / i - h < part) return n - 1;
}
}
} while(false);
return n - 1;
}

View File

@ -1,28 +0,0 @@
//Copyright (c) 2011-2012 Mail.RU
//Copyright (c) 2011-2012 Maksim Kalinchenko
//Copyright (c) 2012 Sokolov Yura aka funny-falcon
//
//MIT License
//
//Permission is hereby granted, free of charge, to any person obtaining
//a copy of this software and associated documentation files (the
//"Software"), to deal in the Software without restriction, including
//without limitation the rights to use, copy, modify, merge, publish,
//distribute, sublicense, and/or sell copies of the Software, and to
//permit persons to whom the Software is furnished to do so, subject to
//the following conditions:
//
//The above copyright notice and this permission notice shall be
//included in all copies or substantial portions of the Software.
//
//THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
//MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
//LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
//OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
//WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
/// Source code: https://github.com/mailru/sumbur-ruby/blob/master/ext/sumbur/sumbur.c
unsigned int sumburConsistentHash(unsigned int hashed_int, unsigned int capacity);

View File

@ -31,7 +31,7 @@ find . -name '*.so.*' -print -exec mv '{}' /output \;
if [ "performance" == "$COMBINED_OUTPUT" ]
then
cp -r ../tests/performance /output
cp -r ../tests/config/top_level_domains /
cp -r ../tests/config/top_level_domains /output
cp -r ../docker/test/performance-comparison/config /output ||:
rm /output/unit_tests_dbms ||:
rm /output/clickhouse-odbc-bridge ||:

View File

@ -36,6 +36,22 @@ function wait_for_server # port, pid
fi
}
function left_or_right()
{
local from=$1 && shift
local basename=$1 && shift
if [ -e "$from/$basename" ]; then
echo "$from/$basename"
return
fi
case "$from" in
left) echo "right/$basename" ;;
right) echo "left/$basename" ;;
esac
}
function configure
{
# Use the new config for both servers, so that we can change it in a PR.
@ -55,7 +71,7 @@ function configure
# server *config* directives overrides
--path db0
--user_files_path db0/user_files
--top_level_domains_path /top_level_domains
--top_level_domains_path "$(left_or_right right top_level_domains)"
--tcp_port $LEFT_SERVER_PORT
)
left/clickhouse-server "${setup_left_server_opts[@]}" &> setup-server-log.log &
@ -103,7 +119,7 @@ function restart
# server *config* directives overrides
--path left/db
--user_files_path left/db/user_files
--top_level_domains_path /top_level_domains
--top_level_domains_path "$(left_or_right left top_level_domains)"
--tcp_port $LEFT_SERVER_PORT
)
left/clickhouse-server "${left_server_opts[@]}" &>> left-server-log.log &
@ -118,7 +134,7 @@ function restart
# server *config* directives overrides
--path right/db
--user_files_path right/db/user_files
--top_level_domains_path /top_level_domains
--top_level_domains_path "$(left_or_right right top_level_domains)"
--tcp_port $RIGHT_SERVER_PORT
)
right/clickhouse-server "${right_server_opts[@]}" &>> right-server-log.log &

View File

@ -23,9 +23,28 @@ $ sudo apt-get install git cmake python ninja-build
Or use cmake3 instead of cmake on older systems.
### Install clang-11 (recommended) {#install-clang-11}
On Ubuntu/Debian you can use the automatic installation script (check [official webpage](https://apt.llvm.org/))
```bash
sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)"
```
For other Linux distributions, check the availability of the [prebuilt packages](https://releases.llvm.org/download.html) or build clang [from sources](https://clang.llvm.org/get_started.html).
#### Use clang-11 for Builds {#use-clang-11-for-builds}
``` bash
$ export CC=clang-11
$ export CXX=clang++-11
```
### Install GCC 10 {#install-gcc-10}
There are several ways to do this.
We recommend building ClickHouse with clang-11; GCC-10 is also supported, but it is not used for production builds.
If you want to use GCC-10, there are several ways to install it.
#### Install from Repository {#install-from-repository}
@ -49,7 +68,7 @@ $ sudo apt-get install gcc-10 g++-10
See [utils/ci/build-gcc-from-sources.sh](https://github.com/ClickHouse/ClickHouse/blob/master/utils/ci/build-gcc-from-sources.sh)
### Use GCC 10 for Builds {#use-gcc-10-for-builds}
#### Use GCC 10 for Builds {#use-gcc-10-for-builds}
``` bash
$ export CC=gcc-10

View File

@ -576,6 +576,35 @@ For more information, see the MergeTreeSettings.h header file.
</merge_tree>
```
## metric_log {#metric_log}
It is enabled by default. If it's not, you can enable it manually.
**Enabling**
To manually turn on metrics history collection in the [`system.metric_log`](../../operations/system-tables/metric_log.md) table, create `/etc/clickhouse-server/config.d/metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log>
<database>system</database>
<table>metric_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
<collect_interval_milliseconds>1000</collect_interval_milliseconds>
</metric_log>
</yandex>
```
**Disabling**
To disable the `metric_log` setting, create `/etc/clickhouse-server/config.d/disable_metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log remove="1" />
</yandex>
```
## replicated_merge_tree {#server_configuration_parameters-replicated_merge_tree}
Fine tuning for tables in the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

View File

@ -2,19 +2,6 @@
Contains the history of metric values from the `system.metrics` and `system.events` tables, periodically flushed to disk.
To turn on metrics history collection in `system.metric_log`, create `/etc/clickhouse-server/config.d/metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log>
<database>system</database>
<table>metric_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
<collect_interval_milliseconds>1000</collect_interval_milliseconds>
</metric_log>
</yandex>
```
Columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
@ -55,6 +42,7 @@ CurrentMetric_DistributedFilesToInsert: 0
**See also**
- [metric_log setting](../../operations/server-configuration-parameters/settings.md#metric_log) — Enabling and disabling the setting.
- [system.asynchronous_metrics](../../operations/system-tables/asynchronous_metrics.md) — Contains periodically calculated metrics.
- [system.events](../../operations/system-tables/events.md#system_tables-events) — Contains a number of events that occurred.
- [system.metrics](../../operations/system-tables/metrics.md) — Contains instantly calculated metrics.

View File

@ -5,10 +5,12 @@ toc_title: mysql
# mysql {#mysql}
Allows `SELECT` queries to be performed on data that is stored on a remote MySQL server.
Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a remote MySQL server.
**Syntax**
``` sql
mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']);
mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause'])
```
**Parameters**
@ -23,13 +25,15 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_
- `password` — User password.
- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. If `replace_query=1`, the query is replaced.
- `replace_query` — Flag that converts `INSERT INTO` queries to `REPLACE INTO`. Possible values:
- `0` - The query is executed as `INSERT INTO`.
- `1` - The query is executed as `REPLACE INTO`.
- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query.
- `on_duplicate_clause` — The `ON DUPLICATE KEY on_duplicate_clause` expression that is added to the `INSERT` query. Can be specified only with `replace_query = 0` (if you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception).
Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, where `on_duplicate_clause` is `UPDATE c2 = c2 + 1`. See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause.
Example: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1;`
To specify `on_duplicate_clause` you need to pass `0` to the `replace_query` parameter. If you simultaneously pass `replace_query = 1` and `on_duplicate_clause`, ClickHouse generates an exception.
`on_duplicate_clause` here is `UPDATE c2 = c2 + 1`. See the MySQL documentation to find which `on_duplicate_clause` you can use with the `ON DUPLICATE KEY` clause.
Simple `WHERE` clauses such as `=, !=, >, >=, <, <=` are currently executed on the MySQL server.
@ -39,46 +43,59 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C
A table object with the same columns as the original MySQL table.
## Usage Example {#usage-example}
!!! info "Note"
In `INSERT` queries, to distinguish the table function `mysql(...)` from a table name with a list of column names, you must use the keywords `FUNCTION` or `TABLE FUNCTION`. See the examples below.
**Examples**
Table in MySQL:
``` text
mysql> CREATE TABLE `test`.`test` (
-> `int_id` INT NOT NULL AUTO_INCREMENT,
-> `int_nullable` INT NULL DEFAULT NULL,
-> `float` FLOAT NOT NULL,
-> `float_nullable` FLOAT NULL DEFAULT NULL,
-> PRIMARY KEY (`int_id`));
Query OK, 0 rows affected (0,09 sec)
mysql> insert into test (`int_id`, `float`) VALUES (1,2);
Query OK, 1 row affected (0,00 sec)
mysql> INSERT INTO test (`int_id`, `float`) VALUES (1,2);
mysql> select * from test;
+------+----------+-----+----------+
| int_id | int_nullable | float | float_nullable |
+------+----------+-----+----------+
| 1 | NULL | 2 | NULL |
+------+----------+-----+----------+
1 row in set (0,00 sec)
mysql> SELECT * FROM test;
+--------+-------+
| int_id | float |
+--------+-------+
| 1 | 2 |
+--------+-------+
```
Selecting data from ClickHouse:
``` sql
SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123')
SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123');
```
``` text
┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐
│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │
└────────┴──────────────┴───────┴────────────────┘
┌─int_id─┬─float─┐
│ 1 │ 2 │
└────────┴───────┘
```
## See Also {#see-also}
Replacing and inserting:
```sql
INSERT INTO FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 1) (int_id, float) VALUES (1, 3);
INSERT INTO TABLE FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 0, 'UPDATE int_id = int_id + 1') (int_id, float) VALUES (1, 4);
SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123');
```
``` text
┌─int_id─┬─float─┐
│ 1 │ 3 │
│ 2 │ 4 │
└────────┴───────┘
```
**See Also**
- [The MySQL table engine](../../engines/table-engines/integrations/mysql.md)
- [Using MySQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql)
[Original article](https://clickhouse.tech/docs/en/query_language/table_functions/mysql/) <!--hide-->
[Original article](https://clickhouse.tech/docs/en/sql-reference/table_functions/mysql/) <!--hide-->

View File

@ -577,6 +577,35 @@ ClickHouse checks the conditions for `min_part_size` and `min_part
</merge_tree>
```
## metric_log {#metric_log}
This setting is enabled by default. If it's not, you can enable it manually.
**Enabling**
To manually enable metrics history collection in the [`system.metric_log`](../../operations/system-tables/metric_log.md) table, create `/etc/clickhouse-server/config.d/metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log>
<database>system</database>
<table>metric_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
<collect_interval_milliseconds>1000</collect_interval_milliseconds>
</metric_log>
</yandex>
```
**Disabling**
To disable the `metric_log` setting, create the file `/etc/clickhouse-server/config.d/disable_metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log remove="1" />
</yandex>
```
## replicated\_merge\_tree {#server_configuration_parameters-replicated_merge_tree}
Fine tuning for tables in the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

View File

@ -2,19 +2,6 @@
Contains the history of metric values from the `system.metrics` and `system.events` tables, periodically flushed to disk.
To enable metrics history collection in the `system.metric_log` table, create `/etc/clickhouse-server/config.d/metric_log.xml` with the following content:
``` xml
<yandex>
<metric_log>
<database>system</database>
<table>metric_log</table>
<flush_interval_milliseconds>7500</flush_interval_milliseconds>
<collect_interval_milliseconds>1000</collect_interval_milliseconds>
</metric_log>
</yandex>
```
Columns:
- `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
@ -55,6 +42,7 @@ CurrentMetric_ReplicatedChecks: 0
**See also**
- [metric_log setting](../../operations/server-configuration-parameters/settings.md#metric_log) — How to enable and disable history collection.
- [system.asynchronous_metrics](#system_tables-asynchronous_metrics) — Contains periodically calculated metrics.
- [system.events](#system_tables-events) — Contains a number of events that occurred.
- [system.metrics](#system_tables-metrics) — Contains instantly calculated metrics.

View File

@ -31,9 +31,11 @@ then
git add *
git add ".nojekyll"
# Push to GitHub rewriting the existing contents.
git commit --quiet -m "Add new release at $(date)"
git push --force origin master
# Push to GitHub rewriting the existing contents.
# Sometimes it does not work with error message "! [remote rejected] master -> master (cannot lock ref 'refs/heads/master': is at 42a0f6b6b6c7be56a469441b4bf29685c1cebac3 but expected 520e9b02c0d4678a2a5f41d2f561e6532fb98cc1)"
for _ in {1..10}; do git push --force origin master && break; sleep 5; done
if [[ ! -z "${CLOUDFLARE_TOKEN}" ]]
then

View File

@ -948,7 +948,7 @@ private:
{ /// disable logs if expects errors
TestHint test_hint(test_mode, all_queries_text);
if (test_hint.clientError() || test_hint.serverError())
processTextAsSingleQuery("SET send_logs_level = 'none'");
processTextAsSingleQuery("SET send_logs_level = 'fatal'");
// Echo all queries if asked; makes for a more readable reference
// file.
@ -1934,7 +1934,12 @@ private:
if (has_vertical_output_suffix)
current_format = "Vertical";
block_out_stream = context.getOutputFormat(current_format, *out_buf, block);
/// It is not clear how to write progress with parallel formatting. It may increase code complexity significantly.
if (!need_render_progress)
block_out_stream = context.getOutputStreamParallelIfPossible(current_format, *out_buf, block);
else
block_out_stream = context.getOutputStream(current_format, *out_buf, block);
block_out_stream->writePrefix();
}
}
@ -1991,15 +1996,18 @@ private:
written_first_block = true;
}
bool clear_progess = std_out.offset() > 0;
if (clear_progess)
bool clear_progress = false;
if (need_render_progress)
clear_progress = std_out.offset() > 0;
if (clear_progress)
clearProgress();
/// Received data block is immediately displayed to the user.
block_out_stream->flush();
/// Restore progress bar after data block.
if (clear_progess)
if (clear_progress)
writeProgress();
}

View File

@ -169,11 +169,7 @@ int mainEntryClickHouseCompressor(int argc, char ** argv)
if (offset_in_compressed_file || offset_in_decompressed_block)
{
if (!options.count("input"))
{
throw DB::Exception("--offset-in-compressed-file/--offset-in-decompressed-block requires --input", DB::ErrorCodes::BAD_ARGUMENTS);
}
CompressedReadBufferFromFile compressed_file(options["input"].as<std::string>(), 0, 0, 0);
CompressedReadBufferFromFile compressed_file(std::move(rb));
compressed_file.seek(offset_in_compressed_file, offset_in_decompressed_block);
copyData(compressed_file, *wb);
}

View File

@ -1180,7 +1180,7 @@ try
file_in.seek(0, SEEK_SET);
BlockInputStreamPtr input = context.getInputFormat(input_format, file_in, header, max_block_size);
BlockOutputStreamPtr output = context.getOutputFormat(output_format, file_out, header);
BlockOutputStreamPtr output = context.getOutputStream(output_format, file_out, header);
if (processed_rows + source_rows > limit)
input = std::make_shared<LimitBlockInputStream>(input, limit - processed_rows, 0);

View File

@ -1,24 +1,26 @@
#include "MainHandler.h"
#include "validateODBCConnectionString.h"
#include <memory>
#include <DataStreams/copyData.h>
#include <DataTypes/DataTypeFactory.h>
#include "ODBCBlockInputStream.h"
#include "ODBCBlockOutputStream.h"
#include "getIdentifierQuote.h"
#include <DataStreams/copyData.h>
#include <DataTypes/DataTypeFactory.h>
#include <Formats/FormatFactory.h>
#include <IO/WriteBufferFromHTTPServerResponse.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromIStream.h>
#include <Poco/Net/HTTPServerRequest.h>
#include <Poco/Net/HTTPServerResponse.h>
#include <Poco/Net/HTMLForm.h>
#include <common/logger_useful.h>
#include <mutex>
#include <Poco/ThreadPool.h>
#include <IO/ReadBufferFromIStream.h>
#include <Columns/ColumnsNumber.h>
#include "getIdentifierQuote.h"
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <common/logger_useful.h>
#include <mutex>
#include <memory>
#if USE_ODBC
#include <Poco/Data/ODBC/SessionImpl.h>
@ -162,8 +164,9 @@ void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne
auto pool = getPool(connection_string);
ReadBufferFromIStream read_buf(request.stream());
BlockInputStreamPtr input_stream = FormatFactory::instance().getInput(format, read_buf, *sample_block,
context, max_block_size);
auto input_format = FormatFactory::instance().getInput(format, read_buf, *sample_block,
context, max_block_size);
auto input_stream = std::make_shared<InputStreamFromInputFormat>(input_format);
ODBCBlockOutputStream output_stream(pool->get(), db_name, table_name, *sample_block, quoting_style);
copyData(*input_stream, output_stream);
writeStringBinary("Ok.", out);
@ -173,7 +176,7 @@ void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne
std::string query = params.get("query");
LOG_TRACE(log, "Query: {}", query);
BlockOutputStreamPtr writer = FormatFactory::instance().getOutput(format, out, *sample_block, context);
BlockOutputStreamPtr writer = FormatFactory::instance().getOutputStream(format, out, *sample_block, context);
auto pool = getPool(connection_string);
ODBCBlockInputStream inp(pool->get(), query, *sample_block, max_block_size);
copyData(inp, *writer);

View File

@ -157,23 +157,7 @@ bool SettingsConstraints::checkImpl(const Settings & current_settings, SettingCh
const String & setting_name = change.name;
if (setting_name == "profile")
{
/// TODO Check profile settings in Context::setProfile(...), not here. It will be backward incompatible.
const String & profile_name = change.value.safeGet<String>();
const auto & profile_settings_changes = manager->getProfileSettings(profile_name);
try
{
/// NOTE We cannot use CLAMP_ON_VIOLATION here, because we cannot modify elements of profile_settings_changes
for (auto change_copy : *profile_settings_changes)
checkImpl(current_settings, change_copy, THROW_ON_VIOLATION);
}
catch (Exception & e)
{
e.addMessage(", while trying to set settings profile {}", profile_name);
throw;
}
return true;
}
bool cannot_cast;
auto cast_value = [&](const Field & x) -> Field

View File

@ -0,0 +1,39 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/HelpersMinMaxAny.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionAny(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyLastData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyHeavyData>(name, argument_types, parameters));
}
}
void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
{
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("any", { createAggregateFunctionAny, properties });
factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });
}
}

View File

@ -129,7 +129,7 @@ public:
return nested_func->allocatesMemoryInArena();
}
AggregateFunctionPtr getNestedFunction() const { return nested_func; }
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -6,6 +6,7 @@
#include <array>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnsCommon.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/assert_cast.h>
@ -42,6 +43,39 @@ public:
++data(place).count;
}
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena *, ssize_t if_argument_pos) const override
{
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
data(place).count += countBytesInFilter(flags);
}
else
{
data(place).count += batch_size;
}
}
void addBatchSinglePlaceNotNull(
size_t batch_size,
AggregateDataPtr place,
const IColumn ** columns,
const UInt8 * null_map,
Arena *,
ssize_t if_argument_pos) const override
{
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
data(place).count += countBytesInFilterWithNull(flags, null_map);
}
else
{
data(place).count += batch_size - countBytesInFilter(null_map, batch_size);
}
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).count += data(rhs).count;

View File

@ -235,6 +235,8 @@ public:
{
return true;
}
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -252,6 +252,8 @@ public:
{
return nested_func->isState();
}
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};

View File

@ -188,13 +188,13 @@ public:
if (!limit_num_elems)
{
if (rhs_elems.value.size())
cur_elems.value.insert(rhs_elems.value.begin(), rhs_elems.value.end(), arena);
cur_elems.value.insertByOffsets(rhs_elems.value, 0, rhs_elems.value.size(), arena);
}
else
{
UInt64 elems_to_insert = std::min(static_cast<size_t>(max_elems) - cur_elems.value.size(), rhs_elems.value.size());
if (elems_to_insert)
cur_elems.value.insert(rhs_elems.value.begin(), rhs_elems.value.begin() + elems_to_insert, arena);
cur_elems.value.insertByOffsets(rhs_elems.value, 0, elems_to_insert, arena);
}
}

View File

@ -80,6 +80,34 @@ public:
nested_func->add(place, columns, row_num, arena);
}
void addBatch(
size_t batch_size,
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena,
ssize_t) const override
{
nested_func->addBatch(batch_size, places, place_offset, columns, arena, num_arguments - 1);
}
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t) const override
{
nested_func->addBatchSinglePlace(batch_size, place, columns, arena, num_arguments - 1);
}
void addBatchSinglePlaceNotNull(
size_t batch_size,
AggregateDataPtr place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
ssize_t) const override
{
nested_func->addBatchSinglePlaceNotNull(batch_size, place, columns, null_map, arena, num_arguments - 1);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
nested_func->merge(place, rhs, arena);
@ -113,6 +141,8 @@ public:
AggregateFunctionPtr getOwnNullAdapter(
const AggregateFunctionPtr & nested_function, const DataTypes & arguments,
const Array & params, const AggregateFunctionProperties & properties) const override;
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -0,0 +1,34 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/HelpersMinMaxAny.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionMax(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMaxData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionArgMax(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionArgMinMax<AggregateFunctionMaxData>(name, argument_types, parameters));
}
}
void registerAggregateFunctionsMax(AggregateFunctionFactory & factory)
{
factory.registerFunction("max", createAggregateFunctionMax, AggregateFunctionFactory::CaseInsensitive);
/// The functions below depend on the order of data.
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("argMax", { createAggregateFunctionArgMax, properties });
}
}

View File

@ -102,6 +102,8 @@ public:
{
return nested_func->allocatesMemoryInArena();
}
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -0,0 +1,34 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/HelpersMinMaxAny.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionMin(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMinData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionArgMin(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionArgMinMax<AggregateFunctionMinData>(name, argument_types, parameters));
}
}
void registerAggregateFunctionsMin(AggregateFunctionFactory & factory)
{
factory.registerFunction("min", createAggregateFunctionMin, AggregateFunctionFactory::CaseInsensitive);
/// The functions below depend on the order of data.
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("argMin", { createAggregateFunctionArgMin, properties });
}
}

View File

@ -1,66 +0,0 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/HelpersMinMaxAny.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include "registerAggregateFunctions.h"
namespace DB
{
namespace
{
AggregateFunctionPtr createAggregateFunctionAny(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionAnyLast(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyLastData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionAnyHeavy(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionAnyHeavyData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionMin(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMinData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionMax(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionSingleValue<AggregateFunctionsSingleValue, AggregateFunctionMaxData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionArgMin(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionArgMinMax<AggregateFunctionMinData>(name, argument_types, parameters));
}
AggregateFunctionPtr createAggregateFunctionArgMax(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
return AggregateFunctionPtr(createAggregateFunctionArgMinMax<AggregateFunctionMaxData>(name, argument_types, parameters));
}
}
void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory & factory)
{
factory.registerFunction("min", createAggregateFunctionMin, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("max", createAggregateFunctionMax, AggregateFunctionFactory::CaseInsensitive);
/// The functions below depend on the order of data.
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("any", { createAggregateFunctionAny, properties });
factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });
factory.registerFunction("argMin", { createAggregateFunctionArgMin, properties });
factory.registerFunction("argMax", { createAggregateFunctionArgMax, properties });
}
}

View File

@ -180,6 +180,8 @@ public:
{
return nested_function->isState();
}
AggregateFunctionPtr getNestedFunction() const override { return nested_function; }
};
@ -209,13 +211,15 @@ public:
}
}
void addBatchSinglePlace(size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const override
{
const ColumnNullable * column = assert_cast<const ColumnNullable *>(columns[0]);
const IColumn * nested_column = &column->getNestedColumn();
const UInt8 * null_map = column->getNullMapData().data();
this->nested_function->addBatchSinglePlaceNotNull(batch_size, this->nestedPlace(place), &nested_column, null_map, arena);
this->nested_function->addBatchSinglePlaceNotNull(
batch_size, this->nestedPlace(place), &nested_column, null_map, arena, if_argument_pos);
if constexpr (result_is_nullable)
if (!memoryIsByte(null_map, batch_size, 1))

View File

@ -2,6 +2,7 @@
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnsCommon.h>
#include <Common/typeid_cast.h>
#include <DataTypes/DataTypeNullable.h>
#include <IO/ReadHelpers.h>
@ -96,37 +97,93 @@ public:
place[size_of_data] = 1;
}
void addBatch(size_t batch_size, AggregateDataPtr * places, size_t place_offset, const IColumn ** columns, Arena * arena) const override
void addBatch(
size_t batch_size,
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const override
{
nested_function->addBatch(batch_size, places, place_offset, columns, arena);
for (size_t i = 0; i < batch_size; ++i)
(places[i] + place_offset)[size_of_data] = 1;
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
{
if (flags[i])
add(places[i] + place_offset, columns, i, arena);
}
}
else
{
nested_function->addBatch(batch_size, places, place_offset, columns, arena, if_argument_pos);
for (size_t i = 0; i < batch_size; ++i)
(places[i] + place_offset)[size_of_data] = 1;
}
}
void addBatchSinglePlace(size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const override
{
if (batch_size)
if (if_argument_pos >= 0)
{
nested_function->addBatchSinglePlace(batch_size, place, columns, arena);
place[size_of_data] = 1;
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
nested_function->addBatchSinglePlace(batch_size, place, columns, arena, if_argument_pos);
for (size_t i = 0; i < batch_size; ++i)
{
if (flags[i])
{
place[size_of_data] = 1;
break;
}
}
}
else
{
if (batch_size)
{
nested_function->addBatchSinglePlace(batch_size, place, columns, arena, if_argument_pos);
place[size_of_data] = 1;
}
}
}
void addBatchSinglePlaceNotNull(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena * arena) const override
size_t batch_size,
AggregateDataPtr place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
ssize_t if_argument_pos = -1) const override
{
if (batch_size)
if (if_argument_pos >= 0)
{
nested_function->addBatchSinglePlaceNotNull(batch_size, place, columns, null_map, arena);
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
nested_function->addBatchSinglePlaceNotNull(batch_size, place, columns, null_map, arena, if_argument_pos);
for (size_t i = 0; i < batch_size; ++i)
{
if (!null_map[i])
if (flags[i] && !null_map[i])
{
place[size_of_data] = 1;
break;
}
}
}
else
{
if (batch_size)
{
nested_function->addBatchSinglePlaceNotNull(batch_size, place, columns, null_map, arena, if_argument_pos);
for (size_t i = 0; i < batch_size; ++i)
{
if (!null_map[i])
{
place[size_of_data] = 1;
break;
}
}
}
}
}
void merge(
@ -207,6 +264,8 @@ public:
else
to.insertDefault();
}
AggregateFunctionPtr getNestedFunction() const override { return nested_function; }
};
}

View File

@ -198,6 +198,8 @@ public:
col_offsets.getData().push_back(col.getData().size());
}
AggregateFunctionPtr getNestedFunction() const override { return nested_function; }
};
}

View File

@ -79,7 +79,7 @@ public:
bool allocatesMemoryInArena() const override { return nested_func->allocatesMemoryInArena(); }
AggregateFunctionPtr getNestedFunction() const { return nested_func; }
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -92,7 +92,7 @@ public:
return nested_func->allocatesMemoryInArena();
}
AggregateFunctionPtr getNestedFunction() const { return nested_func; }
AggregateFunctionPtr getNestedFunction() const override { return nested_func; }
};
}

View File

@ -282,17 +282,41 @@ public:
}
/// Vectorized version when there are no GROUP BY keys.
void addBatchSinglePlace(size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena *) const override
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos) const override
{
const auto & column = static_cast<const ColVecType &>(*columns[0]);
this->data(place).addMany(column.getData().data(), batch_size);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
{
if (flags[i])
add(place, columns, i, arena);
}
}
else
{
const auto & column = static_cast<const ColVecType &>(*columns[0]);
this->data(place).addMany(column.getData().data(), batch_size);
}
}
void addBatchSinglePlaceNotNull(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena *) const override
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena * arena, ssize_t if_argument_pos)
const override
{
const auto & column = static_cast<const ColVecType &>(*columns[0]);
this->data(place).addManyNotNull(column.getData().data(), null_map, batch_size);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
if (!null_map[i] && flags[i])
add(place, columns, i, arena);
}
else
{
const auto & column = static_cast<const ColVecType &>(*columns[0]);
this->data(place).addManyNotNull(column.getData().data(), null_map, batch_size);
}
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override

View File

@ -10,6 +10,7 @@
#include <Core/Block.h>
#include <Common/Exception.h>
#include <Core/Field.h>
#include <Columns/ColumnsNumber.h>
namespace DB
@ -143,19 +144,32 @@ public:
/** Contains a loop with calls to "add" function. You can collect arguments into array "places"
* and do a single call to "addBatch" for devirtualization and inlining.
*/
virtual void addBatch(size_t batch_size, AggregateDataPtr * places, size_t place_offset, const IColumn ** columns, Arena * arena) const = 0;
virtual void addBatch(
size_t batch_size,
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const = 0;
/** The same for single place.
*/
virtual void addBatchSinglePlace(size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
virtual void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const = 0;
/** The same for single place when need to aggregate only filtered data.
*/
virtual void addBatchSinglePlaceNotNull(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena * arena) const = 0;
size_t batch_size,
AggregateDataPtr place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
ssize_t if_argument_pos = -1) const = 0;
virtual void addBatchSinglePlaceFromInterval(
size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const = 0;
size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1)
const = 0;
/** In addition to addBatch, this method collects multiple rows of arguments into array "places"
* as long as they are between offsets[i-1] and offsets[i]. This is used for arrayReduce and
@ -195,6 +209,11 @@ public:
return nullptr;
}
/** Return the nested function if this is an Aggregate Function Combinator.
* Otherwise return nullptr.
*/
virtual AggregateFunctionPtr getNestedFunction() const { return {}; }
const DataTypes & getArgumentTypes() const { return argument_types; }
const Array & getParameters() const { return parameters; }
@ -220,31 +239,90 @@ public:
AddFunc getAddressOfAddFunction() const override { return &addFree; }
void addBatch(size_t batch_size, AggregateDataPtr * places, size_t place_offset, const IColumn ** columns, Arena * arena) const override
void addBatch(
size_t batch_size,
AggregateDataPtr * places,
size_t place_offset,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const override
{
for (size_t i = 0; i < batch_size; ++i)
static_cast<const Derived *>(this)->add(places[i] + place_offset, columns, i, arena);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
{
if (flags[i])
static_cast<const Derived *>(this)->add(places[i] + place_offset, columns, i, arena);
}
}
else
{
for (size_t i = 0; i < batch_size; ++i)
static_cast<const Derived *>(this)->add(places[i] + place_offset, columns, i, arena);
}
}
void addBatchSinglePlace(size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
void addBatchSinglePlace(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1) const override
{
for (size_t i = 0; i < batch_size; ++i)
static_cast<const Derived *>(this)->add(place, columns, i, arena);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
{
if (flags[i])
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
}
else
{
for (size_t i = 0; i < batch_size; ++i)
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
}
void addBatchSinglePlaceNotNull(
size_t batch_size, AggregateDataPtr place, const IColumn ** columns, const UInt8 * null_map, Arena * arena) const override
size_t batch_size,
AggregateDataPtr place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
ssize_t if_argument_pos = -1) const override
{
for (size_t i = 0; i < batch_size; ++i)
if (!null_map[i])
static_cast<const Derived *>(this)->add(place, columns, i, arena);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = 0; i < batch_size; ++i)
if (!null_map[i] && flags[i])
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
else
{
for (size_t i = 0; i < batch_size; ++i)
if (!null_map[i])
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
}
void addBatchSinglePlaceFromInterval(
size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena) const override
size_t batch_begin, size_t batch_end, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos = -1)
const override
{
for (size_t i = batch_begin; i < batch_end; ++i)
static_cast<const Derived *>(this)->add(place, columns, i, arena);
if (if_argument_pos >= 0)
{
const auto & flags = assert_cast<const ColumnUInt8 &>(*columns[if_argument_pos]).getData();
for (size_t i = batch_begin; i < batch_end; ++i)
{
if (flags[i])
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
}
else
{
for (size_t i = batch_begin; i < batch_end; ++i)
static_cast<const Derived *>(this)->add(place, columns, i, arena);
}
}
void addBatchArray(

View File

@ -18,7 +18,9 @@ void registerAggregateFunctionsQuantile(AggregateFunctionFactory &);
void registerAggregateFunctionsSequenceMatch(AggregateFunctionFactory &);
void registerAggregateFunctionWindowFunnel(AggregateFunctionFactory &);
void registerAggregateFunctionRate(AggregateFunctionFactory &);
void registerAggregateFunctionsMinMaxAny(AggregateFunctionFactory &);
void registerAggregateFunctionsMin(AggregateFunctionFactory &);
void registerAggregateFunctionsMax(AggregateFunctionFactory &);
void registerAggregateFunctionsAny(AggregateFunctionFactory &);
void registerAggregateFunctionsStatisticsStable(AggregateFunctionFactory &);
void registerAggregateFunctionsStatisticsSimple(AggregateFunctionFactory &);
void registerAggregateFunctionSum(AggregateFunctionFactory &);
@ -71,7 +73,9 @@ void registerAggregateFunctions()
registerAggregateFunctionsSequenceMatch(factory);
registerAggregateFunctionWindowFunnel(factory);
registerAggregateFunctionRate(factory);
registerAggregateFunctionsMinMaxAny(factory);
registerAggregateFunctionsMin(factory);
registerAggregateFunctionsMax(factory);
registerAggregateFunctionsAny(factory);
registerAggregateFunctionsStatisticsStable(factory);
registerAggregateFunctionsStatisticsSimple(factory);
registerAggregateFunctionSum(factory);

View File

@ -10,6 +10,7 @@ PEERDIR(
SRCS(
AggregateFunctionAggThrow.cpp
AggregateFunctionAny.cpp
AggregateFunctionArray.cpp
AggregateFunctionAvg.cpp
AggregateFunctionAvgWeighted.cpp
@ -30,9 +31,10 @@ SRCS(
AggregateFunctionIf.cpp
AggregateFunctionMLMethod.cpp
AggregateFunctionMannWhitney.cpp
AggregateFunctionMax.cpp
AggregateFunctionMaxIntersections.cpp
AggregateFunctionMerge.cpp
AggregateFunctionMinMaxAny.cpp
AggregateFunctionMin.cpp
AggregateFunctionNull.cpp
AggregateFunctionOrFill.cpp
AggregateFunctionQuantile.cpp

View File

@ -12,7 +12,54 @@
namespace DB
{
#if defined(__SSE2__) && defined(__POPCNT__)
/// Transform 64-byte mask to 64-bit mask.
static UInt64 toBits64(const Int8 * bytes64)
{
static const __m128i zero16 = _mm_setzero_si128();
return static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64)), zero16)))
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 16)), zero16)))
<< 16)
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 32)), zero16)))
<< 32)
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(bytes64 + 48)), zero16)))
<< 48);
}
#endif
size_t countBytesInFilter(const UInt8 * filt, size_t sz)
{
size_t count = 0;
/** NOTE: In theory, `filt` should only contain zeros and ones.
* But, just in case, the condition > 0 (on signed bytes) is used here.
* It would be better to use != 0, but that would not allow the SSE2 implementation.
*/
const Int8 * pos = reinterpret_cast<const Int8 *>(filt);
const Int8 * end = pos + sz;
#if defined(__SSE2__) && defined(__POPCNT__)
const Int8 * end64 = pos + sz / 64 * 64;
for (; pos < end64; pos += 64)
count += __builtin_popcountll(toBits64(pos));
/// TODO Add duff device for tail?
#endif
for (; pos < end; ++pos)
count += *pos > 0;
return count;
}
size_t countBytesInFilter(const IColumn::Filter & filt)
{
return countBytesInFilter(filt.data(), filt.size());
}
size_t countBytesInFilterWithNull(const IColumn::Filter & filt, const UInt8 * null_map)
{
size_t count = 0;
@ -22,32 +69,20 @@ size_t countBytesInFilter(const IColumn::Filter & filt)
*/
const Int8 * pos = reinterpret_cast<const Int8 *>(filt.data());
const Int8 * pos2 = reinterpret_cast<const Int8 *>(null_map);
const Int8 * end = pos + filt.size();
#if defined(__SSE2__) && defined(__POPCNT__)
const __m128i zero16 = _mm_setzero_si128();
const Int8 * end64 = pos + filt.size() / 64 * 64;
for (; pos < end64; pos += 64)
count += __builtin_popcountll(
static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos)),
zero16)))
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 16)),
zero16))) << 16)
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 32)),
zero16))) << 32)
| (static_cast<UInt64>(_mm_movemask_epi8(_mm_cmpgt_epi8(
_mm_loadu_si128(reinterpret_cast<const __m128i *>(pos + 48)),
zero16))) << 48));
for (; pos < end64; pos += 64, pos2 += 64)
count += __builtin_popcountll(toBits64(pos) & ~toBits64(pos2));
/// TODO Add duff device for tail?
#endif
for (; pos < end; ++pos)
count += *pos > 0;
count += (*pos & ~*pos2) > 0;
return count;
}

View File

@ -15,7 +15,9 @@ namespace ErrorCodes
}
/// Counts how many bytes of `filt` are greater than zero.
size_t countBytesInFilter(const UInt8 * filt, size_t sz);
size_t countBytesInFilter(const IColumn::Filter & filt);
size_t countBytesInFilterWithNull(const IColumn::Filter & filt, const UInt8 * null_map);
/// Returns vector with num_columns elements. vector[i] is the count of i values in selector.
/// Selector must contain values from 0 to num_columns - 1. NOTE: this is not checked.

View File

@ -28,7 +28,7 @@
#include <common/mremap.h>
#include <common/getPageSize.h>
#include <Common/MemoryTracker.h>
#include <Common/CurrentMemoryTracker.h>
#include <Common/Exception.h>
#include <Common/formatReadable.h>

View File

@ -28,7 +28,7 @@ namespace DB
* - put lot of strings inside pool, keep their addresses;
* - addresses remain valid during lifetime of pool;
* - at destruction of pool, all memory is freed;
* - memory is allocated and freed by large chunks;
* - memory is allocated and freed by large MemoryChunks;
* - freeing parts of data is not possible (but look at ArenaWithFreeLists if you need);
*/
class Arena : private boost::noncopyable
@ -37,16 +37,16 @@ private:
/// Padding allows to use 'memcpySmallAllowReadWriteOverflow15' instead of 'memcpy'.
static constexpr size_t pad_right = 15;
/// Contiguous chunk of memory and pointer to free space inside it. Member of single-linked list.
struct alignas(16) Chunk : private Allocator<false> /// empty base optimization
/// Contiguous MemoryChunk and pointer to free space inside it. Member of single-linked list.
struct alignas(16) MemoryChunk : private Allocator<false> /// empty base optimization
{
char * begin;
char * pos;
char * end; /// does not include padding.
Chunk * prev;
MemoryChunk * prev;
Chunk(size_t size_, Chunk * prev_)
MemoryChunk(size_t size_, MemoryChunk * prev_)
{
ProfileEvents::increment(ProfileEvents::ArenaAllocChunks);
ProfileEvents::increment(ProfileEvents::ArenaAllocBytes, size_);
@ -59,7 +59,7 @@ private:
ASAN_POISON_MEMORY_REGION(begin, size_);
}
~Chunk()
~MemoryChunk()
{
/// We must unpoison the memory before returning to the allocator,
/// because the allocator might not have asan integration, and the
@ -80,8 +80,8 @@ private:
size_t growth_factor;
size_t linear_growth_threshold;
/// Last contiguous chunk of memory.
Chunk * head;
/// Last contiguous MemoryChunk of memory.
MemoryChunk * head;
size_t size_in_bytes;
size_t page_size;
@ -90,7 +90,7 @@ private:
return (s + page_size - 1) / page_size * page_size;
}
/// If chunks size is less than 'linear_growth_threshold', then use exponential growth, otherwise - linear growth
/// If MemoryChunks size is less than 'linear_growth_threshold', then use exponential growth, otherwise - linear growth
/// (to not allocate too much excessive memory).
size_t nextSize(size_t min_next_size) const
{
@ -104,7 +104,7 @@ private:
{
// allocContinue() combined with linear growth results in quadratic
// behavior: we append the data by small amounts, and when it
// doesn't fit, we create a new chunk and copy all the previous data
// doesn't fit, we create a new MemoryChunk and copy all the previous data
// into it. The number of times we do this is directly proportional
// to the total size of data that is going to be serialized. To make
// the copying happen less often, round the next size up to the
@ -117,10 +117,10 @@ private:
return roundUpToPageSize(size_after_grow, page_size);
}
/// Add next contiguous chunk of memory with size not less than specified.
void NO_INLINE addChunk(size_t min_size)
/// Add next contiguous MemoryChunk of memory with size not less than specified.
void NO_INLINE addMemoryChunk(size_t min_size)
{
head = new Chunk(nextSize(min_size + pad_right), head);
head = new MemoryChunk(nextSize(min_size + pad_right), head);
size_in_bytes += head->size();
}
@ -130,7 +130,7 @@ private:
public:
Arena(size_t initial_size_ = 4096, size_t growth_factor_ = 2, size_t linear_growth_threshold_ = 128 * 1024 * 1024)
: growth_factor(growth_factor_), linear_growth_threshold(linear_growth_threshold_),
head(new Chunk(initial_size_, nullptr)), size_in_bytes(head->size()),
head(new MemoryChunk(initial_size_, nullptr)), size_in_bytes(head->size()),
page_size(static_cast<size_t>(::getPageSize()))
{
}
@ -144,7 +144,7 @@ public:
char * alloc(size_t size)
{
if (unlikely(head->pos + size > head->end))
addChunk(size);
addMemoryChunk(size);
char * res = head->pos;
head->pos += size;
@ -169,7 +169,7 @@ public:
return res;
}
addChunk(size + alignment);
addMemoryChunk(size + alignment);
} while (true);
}
@ -194,8 +194,8 @@ public:
/** Begin or expand a contiguous range of memory.
* 'range_start' is the start of range. If nullptr, a new range is
* allocated.
* If there is no space in the current chunk to expand the range,
* the entire range is copied to a new, bigger memory chunk, and the value
* If there is no space in the current MemoryChunk to expand the range,
* the entire range is copied to a new, bigger memory MemoryChunk, and the value
* of 'range_start' is updated.
* If the optional 'start_alignment' is specified, the start of range is
* kept aligned to this value.
@ -209,7 +209,7 @@ public:
/*
* Allocating zero bytes doesn't make much sense. Also, a zero-sized
* range might break the invariant that the range begins at least before
* the current chunk end.
* the current MemoryChunk end.
*/
assert(additional_bytes > 0);
@ -228,19 +228,19 @@ public:
// This method only works for extending the last allocation. For lack of
// original size, check a weaker condition: that 'begin' is at least in
// the current Chunk.
// the current MemoryChunk.
assert(range_start >= head->begin);
assert(range_start < head->end);
if (head->pos + additional_bytes <= head->end)
{
// The new size fits into the last chunk, so just alloc the
// The new size fits into the last MemoryChunk, so just alloc the
// additional size. We can alloc without alignment here, because it
// only applies to the start of the range, and we don't change it.
return alloc(additional_bytes);
}
// New range doesn't fit into this chunk, will copy to a new one.
// New range doesn't fit into this MemoryChunk, will copy to a new one.
//
// Note: among other things, this method is used to provide a hack-ish
// implementation of realloc over Arenas in ArenaAllocators. It wastes a
@ -301,16 +301,16 @@ public:
return res;
}
/// Size of chunks in bytes.
/// Size of MemoryChunks in bytes.
size_t size() const
{
return size_in_bytes;
}
/// Bad method, don't use it -- the chunks are not your business, the entire
/// Bad method, don't use it -- the MemoryChunks are not your business, the entire
/// purpose of the arena code is to manage them for you, so if you find
/// yourself having to use this method, probably you're doing something wrong.
size_t remainingSpaceInCurrentChunk() const
size_t remainingSpaceInCurrentMemoryChunk() const
{
return head->remaining();
}
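/// A minimal usage sketch (illustration, not part of the patch), matching the
/// contract described in the class comment: strings are copied into the arena,
/// their addresses stay valid, and all MemoryChunks are freed at destruction.
///
///     Arena arena;
///     const char * s = "hello";
///     char * copy = arena.alloc(strlen(s) + 1);     /// served from the current MemoryChunk
///     memcpy(copy, s, strlen(s) + 1);
///     char * aligned = arena.alignedAlloc(32, 16);  /// start address aligned to 16 bytes
///     /// 'copy' and 'aligned' remain valid until 'arena' goes out of scope.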

View File

@ -0,0 +1,81 @@
#include <Common/MemoryTracker.h>
#include <Common/CurrentThread.h>
#include <Common/CurrentMemoryTracker.h>
namespace
{
MemoryTracker * getMemoryTracker()
{
if (auto * thread_memory_tracker = DB::CurrentThread::getMemoryTracker())
return thread_memory_tracker;
/// Once the main thread is initialized,
/// total_memory_tracker is initialized too.
/// And can be used, since MainThreadStatus is required for profiling.
if (DB::MainThreadStatus::get())
return &total_memory_tracker;
return nullptr;
}
}
namespace CurrentMemoryTracker
{
using DB::current_thread;
void alloc(Int64 size)
{
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory += size;
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
{
/// Zero untracked before tracking. If the tracker throws an out-of-limit exception, we will still be able to alloc up to untracked_memory_limit
/// more bytes. This can be useful for enlarging the Exception message in rethrow logic.
Int64 tmp = current_thread->untracked_memory;
current_thread->untracked_memory = 0;
memory_tracker->alloc(tmp);
}
}
/// total_memory_tracker only, ignore untracked_memory
else
{
memory_tracker->alloc(size);
}
}
}
void realloc(Int64 old_size, Int64 new_size)
{
Int64 addition = new_size - old_size;
addition > 0 ? alloc(addition) : free(-addition);
}
void free(Int64 size)
{
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory -= size;
if (current_thread->untracked_memory < -current_thread->untracked_memory_limit)
{
memory_tracker->free(-current_thread->untracked_memory);
current_thread->untracked_memory = 0;
}
}
/// total_memory_tracker only, ignore untracked_memory
else
{
memory_tracker->free(size);
}
}
}
}
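/// Illustration (not part of the patch) of the batching above: assuming an
/// untracked_memory_limit of 4 MiB, a thread doing many small allocations only
/// touches the shared atomic counters about once per 4 MiB of churn:
///
///     CurrentMemoryTracker::alloc(1 << 20); /// untracked_memory = 1 MiB, tracker untouched
///     CurrentMemoryTracker::alloc(3 << 20); /// untracked_memory = 4 MiB, still not above the limit
///     CurrentMemoryTracker::alloc(1);       /// crosses the limit: 4 MiB + 1 byte is passed to
///                                           /// memory_tracker->alloc() and the counter resets to 0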

View File

@ -0,0 +1,11 @@
#pragma once
#include <common/types.h>
/// Convenience methods, that use current thread's memory_tracker if it is available.
namespace CurrentMemoryTracker
{
void alloc(Int64 size);
void realloc(Int64 old_size, Int64 new_size);
void free(Int64 size);
}

View File

@ -80,13 +80,7 @@ static void splitHostAndPort(const std::string & host_and_port, std::string & ou
out_port = static_cast<UInt16>(port);
}
else
{
struct servent * se = getservbyname(port_str.c_str(), nullptr);
if (se)
out_port = ntohs(static_cast<UInt16>(se->s_port));
else
throw Exception("Service not found", ErrorCodes::BAD_ARGUMENTS);
}
throw Exception("Port must be numeric", ErrorCodes::BAD_ARGUMENTS);
}
static DNSResolver::IPAddresses resolveIPAddressImpl(const std::string & host)

View File

@ -2,7 +2,7 @@
#include <common/defines.h>
#include <boost/context/stack_context.hpp>
#include <Common/formatReadable.h>
#include <Common/MemoryTracker.h>
#include <Common/CurrentMemoryTracker.h>
#include <sys/time.h>
#include <sys/resource.h>

View File

@ -2,12 +2,20 @@
#include <Poco/Net/IPAddress.h>
#include <Poco/ByteOrder.h>
#include <Common/formatIPv6.h>
#include <cstring>
namespace DB
{
/// Result array could be indexed with all possible uint8 values without extra check.
/// For values greater than 128 we will store the same value as for 128 (all bits set).
constexpr size_t IPV6_MASKS_COUNT = 256;
using RawMaskArray = std::array<uint8_t, IPV6_BINARY_LENGTH>;
void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res)
{
if (Poco::Net::IPAddress::IPv6 == address.family())
@ -33,4 +41,33 @@ std::array<char, 16> IPv6ToBinary(const Poco::Net::IPAddress & address)
return res;
}
static constexpr RawMaskArray generateBitMask(size_t prefix)
{
if (prefix >= 128)
prefix = 128;
RawMaskArray arr{0};
size_t i = 0;
for (; prefix >= 8; ++i, prefix -= 8)
arr[i] = 0xff;
if (prefix > 0)
arr[i++] = ~(0xff >> prefix);
while (i < 16)
arr[i++] = 0x00;
return arr;
}
static constexpr std::array<RawMaskArray, IPV6_MASKS_COUNT> generateBitMasks()
{
std::array<RawMaskArray, IPV6_MASKS_COUNT> arr{};
for (size_t i = 0; i < IPV6_MASKS_COUNT; ++i)
arr[i] = generateBitMask(i);
return arr;
}
const uint8_t * getCIDRMaskIPv6(UInt8 prefix_len)
{
static constexpr std::array<RawMaskArray, IPV6_MASKS_COUNT> IPV6_RAW_MASK_ARRAY = generateBitMasks();
return IPV6_RAW_MASK_ARRAY[prefix_len].data();
}
}
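/// Worked example (illustration only): generateBitMask(20) produces
/// ff ff f0 00 00 00 00 00 00 00 00 00 00 00 00 00: two full bytes for the
/// first 16 bits, then ~(0xff >> 4) == 0xf0 for the remaining four. Prefixes
/// above 128 index the same entry as 128, so any unchecked UInt8 is safe:
///
///     const uint8_t * mask = DB::getCIDRMaskIPv6(20);
///     assert(mask[0] == 0xff && mask[1] == 0xff && mask[2] == 0xf0 && mask[3] == 0x00);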

View File

@ -14,4 +14,9 @@ void IPv6ToRawBinary(const Poco::Net::IPAddress & address, char * res);
/// Convert IP address to 16-byte array with IPv6 data (big endian). If it's an IPv4, map it to IPv6.
std::array<char, 16> IPv6ToBinary(const Poco::Net::IPAddress & address);
/// Returns pointer to 16-byte array containing mask with first `prefix_len` bits set to `1` and `128 - prefix_len` to `0`.
/// Pointer is valid during all program execution time and doesn't require freeing.
/// Values of prefix_len greater than 128 are interpreted as exactly 128.
const uint8_t * getCIDRMaskIPv6(UInt8 prefix_len);
}

View File

@ -2,7 +2,6 @@
#include <IO/WriteHelpers.h>
#include "Common/TraceCollector.h"
#include <Common/CurrentThread.h>
#include <Common/Exception.h>
#include <Common/formatReadable.h>
#include <common/logger_useful.h>
@ -16,20 +15,6 @@
namespace
{
MemoryTracker * getMemoryTracker()
{
if (auto * thread_memory_tracker = DB::CurrentThread::getMemoryTracker())
return thread_memory_tracker;
/// Once the main thread is initialized,
/// total_memory_tracker is initialized too.
/// And can be used, since MainThreadStatus is required for profiling.
if (DB::MainThreadStatus::get())
return &total_memory_tracker;
return nullptr;
}
/// MemoryTracker cannot throw MEMORY_LIMIT_EXCEEDED (either configured memory
/// limit reached or fault injected), in the following cases:
///
@ -41,9 +26,9 @@ MemoryTracker * getMemoryTracker()
/// NOTE: that since C++11 destructor marked with noexcept by default, and
/// this means that any throw from destructor (that is not marked with
/// noexcept(false)) will cause std::terminate()
bool inline memoryTrackerCanThrow()
bool inline memoryTrackerCanThrow(VariableContext level, bool fault_injection)
{
return !MemoryTracker::LockExceptionInThread::isBlocked() && !std::uncaught_exceptions();
return !MemoryTracker::LockExceptionInThread::isBlocked(level, fault_injection) && !std::uncaught_exceptions();
}
}
@ -64,8 +49,40 @@ namespace ProfileEvents
static constexpr size_t log_peak_memory_usage_every = 1ULL << 30;
// BlockerInThread
thread_local uint64_t MemoryTracker::BlockerInThread::counter = 0;
thread_local VariableContext MemoryTracker::BlockerInThread::level = VariableContext::Global;
MemoryTracker::BlockerInThread::BlockerInThread(VariableContext level_)
: previous_level(level)
{
++counter;
level = level_;
}
MemoryTracker::BlockerInThread::~BlockerInThread()
{
--counter;
level = previous_level;
}
/// LockExceptionInThread
thread_local uint64_t MemoryTracker::LockExceptionInThread::counter = 0;
thread_local VariableContext MemoryTracker::LockExceptionInThread::level = VariableContext::Global;
thread_local bool MemoryTracker::LockExceptionInThread::block_fault_injections = false;
MemoryTracker::LockExceptionInThread::LockExceptionInThread(VariableContext level_, bool block_fault_injections_)
: previous_level(level)
, previous_block_fault_injections(block_fault_injections)
{
++counter;
level = level_;
block_fault_injections = block_fault_injections_;
}
MemoryTracker::LockExceptionInThread::~LockExceptionInThread()
{
--counter;
level = previous_level;
block_fault_injections = previous_block_fault_injections;
}
MemoryTracker total_memory_tracker(nullptr, VariableContext::Global);
@ -110,8 +127,13 @@ void MemoryTracker::alloc(Int64 size)
if (size < 0)
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Negative size ({}) is passed to MemoryTracker. It is a bug.", size);
if (BlockerInThread::isBlocked())
if (BlockerInThread::isBlocked(level))
{
/// Since the BlockerInThread should respect the level, we should go to the next parent.
if (auto * loaded_next = parent.load(std::memory_order_relaxed))
loaded_next->alloc(size);
return;
}
/** Using memory_order_relaxed means that if allocations are done simultaneously,
* we allow exception about memory limit exceeded to be thrown only on next allocation.
@ -144,7 +166,7 @@ void MemoryTracker::alloc(Int64 size)
}
std::bernoulli_distribution fault(fault_probability);
if (unlikely(fault_probability && fault(thread_local_rng)) && memoryTrackerCanThrow())
if (unlikely(fault_probability && fault(thread_local_rng)) && memoryTrackerCanThrow(level, true))
{
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
BlockerInThread untrack_lock;
@ -173,7 +195,7 @@ void MemoryTracker::alloc(Int64 size)
DB::TraceCollector::collect(DB::TraceType::MemorySample, StackTrace(), size);
}
if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow())
if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow(level, false))
{
/// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc
BlockerInThread untrack_lock;
@ -211,7 +233,7 @@ void MemoryTracker::updatePeak(Int64 will_be)
void MemoryTracker::free(Int64 size)
{
if (BlockerInThread::isBlocked())
if (BlockerInThread::isBlocked(level))
return;
std::bernoulli_distribution sample(sample_probability);
@ -292,60 +314,3 @@ void MemoryTracker::setOrRaiseProfilerLimit(Int64 value)
while (old_value < value && !profiler_limit.compare_exchange_weak(old_value, value))
;
}
namespace CurrentMemoryTracker
{
using DB::current_thread;
void alloc(Int64 size)
{
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory += size;
if (current_thread->untracked_memory > current_thread->untracked_memory_limit)
{
/// Zero untracked before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes
/// more. It could be useful to enlarge Exception message in rethrow logic.
Int64 tmp = current_thread->untracked_memory;
current_thread->untracked_memory = 0;
memory_tracker->alloc(tmp);
}
}
/// total_memory_tracker only, ignore untracked_memory
else
{
memory_tracker->alloc(size);
}
}
}
void realloc(Int64 old_size, Int64 new_size)
{
Int64 addition = new_size - old_size;
addition > 0 ? alloc(addition) : free(-addition);
}
void free(Int64 size)
{
if (auto * memory_tracker = getMemoryTracker())
{
if (current_thread)
{
current_thread->untracked_memory -= size;
if (current_thread->untracked_memory < -current_thread->untracked_memory_limit)
{
memory_tracker->free(-current_thread->untracked_memory);
current_thread->untracked_memory = 0;
}
}
/// total_memory_tracker only, ignore untracked_memory
else
{
memory_tracker->free(size);
}
}
}
}

View File

@ -136,11 +136,20 @@ public:
private:
BlockerInThread(const BlockerInThread &) = delete;
BlockerInThread & operator=(const BlockerInThread &) = delete;
static thread_local uint64_t counter;
static thread_local VariableContext level;
VariableContext previous_level;
public:
BlockerInThread() { ++counter; }
~BlockerInThread() { --counter; }
static bool isBlocked() { return counter > 0; }
/// level_ - block in level and above
BlockerInThread(VariableContext level_ = VariableContext::Global);
~BlockerInThread();
static bool isBlocked(VariableContext current_level)
{
return counter > 0 && current_level >= level;
}
};
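/// Usage sketch (illustration only; assumes the VariableContext ordering
/// Global < User < Process < Thread from Common/VariableContext.h): blocking at
/// VariableContext::User suppresses accounting in trackers at User level and
/// finer, while MemoryTracker::alloc() still forwards to the global parent:
///
///     {
///         MemoryTracker::BlockerInThread blocker(VariableContext::User);
///         do_allocating_work();  /// hypothetical allocating code: user level and below skip, global still counts
///     }  /// the destructor restores the previous blocking level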
/// To be able to avoid MEMORY_LIMIT_EXCEEDED Exception in destructors:
@ -160,21 +169,24 @@ public:
private:
LockExceptionInThread(const LockExceptionInThread &) = delete;
LockExceptionInThread & operator=(const LockExceptionInThread &) = delete;
static thread_local uint64_t counter;
static thread_local VariableContext level;
static thread_local bool block_fault_injections;
VariableContext previous_level;
bool previous_block_fault_injections;
public:
LockExceptionInThread() { ++counter; }
~LockExceptionInThread() { --counter; }
static bool isBlocked() { return counter > 0; }
/// level_ - block in level and above
/// block_fault_injections_ - block in fault injection too
LockExceptionInThread(VariableContext level_ = VariableContext::Global, bool block_fault_injections_ = true);
~LockExceptionInThread();
static bool isBlocked(VariableContext current_level, bool fault_injection)
{
return counter > 0 && current_level >= level && (!fault_injection || block_fault_injections);
}
};
};
extern MemoryTracker total_memory_tracker;
/// Convenience methods, that use current thread's memory_tracker if it is available.
namespace CurrentMemoryTracker
{
void alloc(Int64 size);
void realloc(Int64 old_size, Int64 new_size);
void free(Int64 size);
}

View File

@ -89,8 +89,8 @@ protected:
static constexpr size_t pad_right = integerRoundUp(pad_right_, ELEMENT_SIZE);
/// pad_left is also rounded up to 16 bytes to maintain alignment of allocated memory.
static constexpr size_t pad_left = integerRoundUp(integerRoundUp(pad_left_, ELEMENT_SIZE), 16);
/// Empty array will point to this static memory as padding.
static constexpr char * null = pad_left ? const_cast<char *>(empty_pod_array) + empty_pod_array_size : nullptr;
/// Empty array will point to this static memory as its padding and as begin/end.
static constexpr char * null = const_cast<char *>(empty_pod_array) + pad_left;
static_assert(pad_left <= empty_pod_array_size && "Left Padding exceeds empty_pod_array_size. Is the element size too large?");
@ -268,8 +268,11 @@ public:
reserve(required_capacity, std::forward<TAllocatorParams>(allocator_params)...);
size_t items_byte_size = byte_size(number_of_items);
memcpy(c_end, ptr, items_byte_size);
c_end += items_byte_size;
if (items_byte_size)
{
memcpy(c_end, ptr, items_byte_size);
c_end += items_byte_size;
}
}
void protect()
@ -289,6 +292,18 @@ public:
#endif
}
template <typename It1, typename It2>
inline void assertNotIntersects(It1 from_begin [[maybe_unused]], It2 from_end [[maybe_unused]])
{
#if !defined(NDEBUG)
const char * ptr_begin = reinterpret_cast<const char *>(&*from_begin);
const char * ptr_end = reinterpret_cast<const char *>(&*from_end);
/// Also it's safe if the range is empty.
assert(!((ptr_begin >= c_start && ptr_begin <= c_end) || (ptr_end >= c_start && ptr_end <= c_end)) || (ptr_begin == ptr_end));
#endif
}
~PODArrayBase()
{
dealloc();
@ -444,6 +459,7 @@ public:
template <typename It1, typename It2, typename ... TAllocatorParams>
void insertPrepare(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
{
this->assertNotIntersects(from_begin, from_end);
size_t required_capacity = this->size() + (from_end - from_begin);
if (required_capacity > this->capacity())
this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
@ -457,6 +473,28 @@ public:
insert_assume_reserved(from_begin, from_end);
}
/// In contrast to 'insert' this method is Ok even for inserting from itself.
/// Because we obtain iterators after reserving memory.
template <typename Container, typename ... TAllocatorParams>
void insertByOffsets(Container && rhs, size_t from_begin, size_t from_end, TAllocatorParams &&... allocator_params)
{
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(rhs.front())>>);
assert(from_end >= from_begin);
assert(from_end <= rhs.size());
size_t required_capacity = this->size() + (from_end - from_begin);
if (required_capacity > this->capacity())
this->reserve(roundUpToPowerOfTwoOrZero(required_capacity), std::forward<TAllocatorParams>(allocator_params)...);
size_t bytes_to_copy = this->byte_size(from_end - from_begin);
if (bytes_to_copy)
{
memcpy(this->c_end, reinterpret_cast<const void *>(rhs.begin() + from_begin), bytes_to_copy);
this->c_end += bytes_to_copy;
}
}
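/// Usage sketch (illustration only): unlike insert(), which now asserts that the
/// source range does not alias this array, insertByOffsets() takes offsets and
/// re-derives pointers only after reserve(), so self-append is well-defined:
///
///     PODArray<UInt32> arr = {1, 2, 3, 4};
///     arr.insertByOffsets(arr, 1, 3);  /// arr becomes {1, 2, 3, 4, 2, 3}
///     /// arr.insert(arr.begin() + 1, arr.begin() + 3) would trip assertNotIntersects.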
/// Works under assumption, that it's possible to read up to 15 excessive bytes after `from_end` and this PODArray is padded.
template <typename It1, typename It2, typename ... TAllocatorParams>
void insertSmallAllowReadWriteOverflow15(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
@ -476,6 +514,9 @@ public:
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
size_t bytes_to_copy = this->byte_size(from_end - from_begin);
if (!bytes_to_copy)
return;
size_t bytes_to_move = this->byte_size(end() - it);
insertPrepare(from_begin, from_end);
@ -492,10 +533,14 @@ public:
void insert_assume_reserved(It1 from_begin, It2 from_end)
{
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
this->assertNotIntersects(from_begin, from_end);
size_t bytes_to_copy = this->byte_size(from_end - from_begin);
memcpy(this->c_end, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
this->c_end += bytes_to_copy;
if (bytes_to_copy)
{
memcpy(this->c_end, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
this->c_end += bytes_to_copy;
}
}
template <typename... TAllocatorParams>
@ -626,15 +671,18 @@ public:
void assign(It1 from_begin, It2 from_end, TAllocatorParams &&... allocator_params)
{
static_assert(memcpy_can_be_used_for_assignment<std::decay_t<T>, std::decay_t<decltype(*from_begin)>>);
this->assertNotIntersects(from_begin, from_end);
size_t required_capacity = from_end - from_begin;
if (required_capacity > this->capacity())
this->reserve_exact(required_capacity, std::forward<TAllocatorParams>(allocator_params)...);
size_t bytes_to_copy = this->byte_size(required_capacity);
memcpy(this->c_start, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
this->c_end = this->c_start + bytes_to_copy;
if (bytes_to_copy)
{
memcpy(this->c_start, reinterpret_cast<const void *>(&*from_begin), bytes_to_copy);
this->c_end = this->c_start + bytes_to_copy;
}
}
// ISO C++ has strict ambiguity rules, thus we cannot apply TAllocatorParams here.

View File

@ -13,7 +13,6 @@
#include <Common/ThreadStatus.h>
#include <ext/scope_guard.h>
/** Very simple thread pool similar to boost::threadpool.
* Advantages:
* - catches exceptions and rethrows on wait.
@ -188,7 +187,7 @@ public:
ThreadFromGlobalPool & operator=(ThreadFromGlobalPool && rhs)
{
if (joinable())
std::terminate();
abort();
state = std::move(rhs.state);
return *this;
}
@ -196,13 +195,13 @@ public:
~ThreadFromGlobalPool()
{
if (joinable())
std::terminate();
abort();
}
void join()
{
if (!joinable())
std::terminate();
abort();
state->wait();
state.reset();
@ -211,7 +210,7 @@ public:
void detach()
{
if (!joinable())
std::terminate();
abort();
state.reset();
}

View File

@ -5,33 +5,25 @@
#include <common/arithmeticOverflow.h>
#include <Common/Exception.h>
#include <Common/UnicodeBar.h>
#include <Common/NaNUtils.h>
namespace DB
{
namespace ErrorCodes
{
extern const int PARAMETER_OUT_OF_BOUND;
}
}
#include <iostream>
namespace UnicodeBar
{
double getWidth(Int64 x, Int64 min, Int64 max, double max_width)
double getWidth(double x, double min, double max, double max_width)
{
if (isNaN(x))
return 0;
if (x <= min)
return 0;
if (x >= max)
return max_width;
/// The case when max - min overflows
Int64 max_difference;
if (common::subOverflow(max, min, max_difference))
throw DB::Exception(DB::ErrorCodes::PARAMETER_OUT_OF_BOUND, "The arguments to render unicode bar will lead to arithmetic overflow");
return (x - min) * max_width / max_difference;
return (x - min) / (max - min) * max_width;
}
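/// Worked example (illustration only) for the simplified formula above: with
/// min = 0, max = 10 and max_width = 80, the value 2.5 maps to
/// (2.5 - 0) / (10 - 0) * 80 == 20 character cells; NaN yields 0 and values
/// outside [min, max] clamp to the ends:
///
///     assert(UnicodeBar::getWidth(2.5, 0, 10, 80) == 20);
///     assert(UnicodeBar::getWidth(-1.0, 0, 10, 80) == 0);
///     assert(UnicodeBar::getWidth(99.0, 0, 10, 80) == 80);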
size_t getWidthInBytes(double width)

View File

@ -10,7 +10,7 @@
*/
namespace UnicodeBar
{
double getWidth(Int64 x, Int64 min, Int64 max, double max_width);
double getWidth(double x, double min, double max, double max_width);
size_t getWidthInBytes(double width);
/// In `dst` there must be a space for barWidthInBytes(width) characters and a trailing zero.

View File

@ -1,5 +1,5 @@
#include <common/memory.h>
#include <Common/MemoryTracker.h>
#include <Common/CurrentMemoryTracker.h>
#include <iostream>
#include <new>

View File

@ -33,6 +33,7 @@ SRCS(
Config/ConfigProcessor.cpp
Config/ConfigReloader.cpp
Config/configReadClient.cpp
CurrentMemoryTracker.cpp
CurrentMetrics.cpp
CurrentThread.cpp
DNSResolver.cpp

View File

@ -121,6 +121,7 @@ class IColumn;
\
M(Bool, input_format_parallel_parsing, true, "Enable parallel parsing for some data formats.", 0) \
M(UInt64, min_chunk_bytes_for_parallel_parsing, (10 * 1024 * 1024), "The minimum chunk size in bytes, which each thread will parse in parallel.", 0) \
M(Bool, output_format_parallel_formatting, true, "Enable parallel formatting for some data formats.", 0) \
\
M(UInt64, merge_tree_min_rows_for_concurrent_read, (20 * 8192), "If at least as many lines are read from one file, the reading can be parallelized.", 0) \
M(UInt64, merge_tree_min_bytes_for_concurrent_read, (24 * 10 * 1024 * 1024), "If at least as many bytes are read from one file, the reading can be parallelized.", 0) \

View File

@ -1,313 +0,0 @@
#include <DataStreams/ParallelParsingBlockInputStream.h>
#include <IO/ReadBuffer.h>
#include <Common/CurrentThread.h>
#include <Common/setThreadName.h>
#include <ext/scope_guard.h>
namespace DB
{
ParallelParsingBlockInputStream::ParallelParsingBlockInputStream(const Params & params)
: header(params.input_creator_params.sample),
row_input_format_params(params.input_creator_params.row_input_format_params),
format_settings(params.input_creator_params.settings),
input_processor_creator(params.input_processor_creator),
min_chunk_bytes(params.min_chunk_bytes),
original_buffer(params.read_buffer),
// Subtract one thread that we use for segmentation and one for
// reading. After that, we must have at least two threads left for
// parsing. See the assertion below.
pool(std::max(2, static_cast<int>(params.max_threads) - 2)),
file_segmentation_engine(params.file_segmentation_engine)
{
// See comment above.
assert(params.max_threads >= 4);
// One unit for each thread, including segmentator and reader, plus a
// couple more units so that the segmentation thread doesn't spuriously
// bump into reader thread on wraparound.
processing_units.resize(params.max_threads + 2);
segmentator_thread = ThreadFromGlobalPool(
&ParallelParsingBlockInputStream::segmentatorThreadFunction, this, CurrentThread::getGroup());
}
ParallelParsingBlockInputStream::~ParallelParsingBlockInputStream()
{
finishAndWait();
}
void ParallelParsingBlockInputStream::cancel(bool kill)
{
/**
* Can be called multiple times, from different threads. Saturate the
* kill flag with OR.
*/
if (kill)
is_killed = true;
is_cancelled = true;
/*
* The format parsers themselves are not being cancelled here, so we'll
* have to wait until they process the current block. Given that the
* chunk size is on the order of megabytes, this shouldn't be too long.
* We can't call IInputFormat->cancel here, because the parser object is
* local to the parser thread, and we don't want to introduce any
* synchronization between parser threads and the other threads to get
* better performance. An ideal solution would be to add a callback to
* IInputFormat that checks whether it was cancelled.
*/
finishAndWait();
}
void ParallelParsingBlockInputStream::scheduleParserThreadForUnitWithNumber(size_t ticket_number)
{
pool.scheduleOrThrowOnError([this, ticket_number, group = CurrentThread::getGroup()]()
{
parserThreadFunction(group, ticket_number);
});
}
void ParallelParsingBlockInputStream::finishAndWait()
{
finished = true;
{
std::unique_lock<std::mutex> lock(mutex);
segmentator_condvar.notify_all();
reader_condvar.notify_all();
}
if (segmentator_thread.joinable())
segmentator_thread.join();
try
{
pool.wait();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void ParallelParsingBlockInputStream::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
{
SCOPE_EXIT(
if (thread_group)
CurrentThread::detachQueryIfNotDetached();
);
if (thread_group)
CurrentThread::attachTo(thread_group);
setThreadName("Segmentator");
try
{
while (!finished)
{
const auto current_unit_number = segmentator_ticket_number % processing_units.size();
auto & unit = processing_units[current_unit_number];
{
std::unique_lock<std::mutex> lock(mutex);
segmentator_condvar.wait(lock,
[&]{ return unit.status == READY_TO_INSERT || finished; });
}
if (finished)
{
break;
}
assert(unit.status == READY_TO_INSERT);
// Segmentating the original input.
unit.segment.resize(0);
auto [have_more_data, currently_read_rows] = file_segmentation_engine(
original_buffer, unit.segment, min_chunk_bytes);
unit.offset = successfully_read_rows_count;
successfully_read_rows_count += currently_read_rows;
unit.is_last = !have_more_data;
unit.status = READY_TO_PARSE;
scheduleParserThreadForUnitWithNumber(segmentator_ticket_number);
++segmentator_ticket_number;
if (!have_more_data)
{
break;
}
}
}
catch (...)
{
onBackgroundException(successfully_read_rows_count);
}
}
void ParallelParsingBlockInputStream::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
{
SCOPE_EXIT(
if (thread_group)
CurrentThread::detachQueryIfNotDetached();
);
if (thread_group)
CurrentThread::attachTo(thread_group);
setThreadName("ChunkParser");
const auto current_unit_number = current_ticket_number % processing_units.size();
auto & unit = processing_units[current_unit_number];
try
{
/*
* This is kind of suspicious -- the input_process_creator contract with
* respect to multithreaded use is not clear, but we hope that it is
* just a 'normal' factory class that doesn't have any state, and so we
* can use it from multiple threads simultaneously.
*/
ReadBuffer read_buffer(unit.segment.data(), unit.segment.size(), 0);
auto format = input_processor_creator(read_buffer, header, row_input_format_params, format_settings);
format->setCurrentUnitNumber(current_ticket_number);
auto parser = std::make_unique<InputStreamFromInputFormat>(std::move(format));
unit.block_ext.block.clear();
unit.block_ext.block_missing_values.clear();
// We don't know how many blocks there will be, so we have to read them all
// until an empty block occurs.
Block block;
while (!finished && (block = parser->read()) != Block())
{
unit.block_ext.block.emplace_back(block);
unit.block_ext.block_missing_values.emplace_back(parser->getMissingValues());
}
// We suppose we will get at least some blocks for a non-empty buffer,
// except at the end of file. Also see a matching assert in readImpl().
assert(unit.is_last || !unit.block_ext.block.empty());
std::unique_lock<std::mutex> lock(mutex);
unit.status = READY_TO_READ;
reader_condvar.notify_all();
}
catch (...)
{
onBackgroundException(unit.offset);
}
}
void ParallelParsingBlockInputStream::onBackgroundException(size_t offset)
{
std::unique_lock<std::mutex> lock(mutex);
if (!background_exception)
{
background_exception = std::current_exception();
if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
if (e->getLineNumber() != -1)
e->setLineNumber(e->getLineNumber() + offset);
}
tryLogCurrentException(__PRETTY_FUNCTION__);
finished = true;
reader_condvar.notify_all();
segmentator_condvar.notify_all();
}
Block ParallelParsingBlockInputStream::readImpl()
{
if (isCancelledOrThrowIfKilled() || finished)
{
/**
* Check for background exception and rethrow it before we return.
*/
std::unique_lock<std::mutex> lock(mutex);
if (background_exception)
{
lock.unlock();
cancel(false);
std::rethrow_exception(background_exception);
}
return Block{};
}
const auto current_unit_number = reader_ticket_number % processing_units.size();
auto & unit = processing_units[current_unit_number];
if (!next_block_in_current_unit.has_value())
{
// We have read out all the Blocks from the previous Processing Unit,
// wait for the current one to become ready.
std::unique_lock<std::mutex> lock(mutex);
reader_condvar.wait(lock, [&](){ return unit.status == READY_TO_READ || finished; });
if (finished)
{
/**
* Check for background exception and rethrow it before we return.
*/
if (background_exception)
{
lock.unlock();
cancel(false);
std::rethrow_exception(background_exception);
}
return Block{};
}
assert(unit.status == READY_TO_READ);
next_block_in_current_unit = 0;
}
if (unit.block_ext.block.empty())
{
/*
* Can we get zero blocks for an entire segment, when the format parser
* skips its entire content and does not create any blocks? Probably not,
* but if we ever do, we should add a loop around the above if, to skip
* these. Also see a matching assert in the parser thread.
*/
assert(unit.is_last);
finished = true;
return Block{};
}
assert(next_block_in_current_unit.value() < unit.block_ext.block.size());
Block res = std::move(unit.block_ext.block.at(*next_block_in_current_unit));
last_block_missing_values = std::move(unit.block_ext.block_missing_values[*next_block_in_current_unit]);
next_block_in_current_unit.value() += 1;
if (*next_block_in_current_unit == unit.block_ext.block.size())
{
// Finished reading this Processing Unit, move to the next one.
next_block_in_current_unit.reset();
++reader_ticket_number;
if (unit.is_last)
{
// If it was the last unit, we're finished.
finished = true;
}
else
{
// Pass the unit back to the segmentator.
std::unique_lock<std::mutex> lock(mutex);
unit.status = READY_TO_INSERT;
segmentator_condvar.notify_all();
}
}
return res;
}
}

View File

@ -1,181 +0,0 @@
#pragma once
#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <Common/ThreadPool.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
namespace DB
{
class ReadBuffer;
/**
* ORDER-PRESERVING parallel parsing of data formats.
* It splits the original data into chunks. Then each chunk is parsed by a different thread.
* The number of chunks equals the number of parser threads.
* The size of a chunk is equal to the min_chunk_bytes_for_parallel_parsing setting.
*
* This stream has three kinds of threads: one segmentator, multiple parsers,
* and one reader thread -- that is, the one from which readImpl() is called.
* They operate one after another on parts of data called "processing units".
* One unit consists of a buffer with raw data from the file, filled by the segmentator
* thread. This raw data is then parsed by a parser thread to form a number of
* Blocks. These Blocks are returned to the parent stream from readImpl().
* After being read out, a processing unit is reused, to save on allocating
* memory for the raw buffer. The processing units are organized into a circular
* array to facilitate reuse and to apply backpressure on the segmentator thread
* -- after it runs out of processing units, it has to wait for the reader to
* read out the previous blocks.
* The outline of what the threads do is as follows:
* segmentator thread:
* 1) wait for the next processing unit to become empty
* 2) fill it with a part of input file
* 3) start a parser thread
* 4) repeat until eof
* parser thread:
* 1) parse the given raw buffer without any synchronization
* 2) signal that the given unit is ready to read
* 3) finish
* readImpl():
* 1) wait for the next processing unit to become ready to read
* 2) take the blocks from the processing unit to return them to the caller
* 3) signal that the processing unit is empty
* 4) repeat until it encounters unit that is marked as "past_the_end"
* All threads must also check for cancel/eof/exception flags.
*/
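/// A condensed sketch (illustration only) of the ring indexing the threads share:
/// each role keeps its own monotonically increasing ticket and maps it onto the
/// fixed array of processing units, so backpressure falls out of the modulo and
/// the segmentator cannot lap the reader:
///
///     auto & seg_unit  = processing_units[segmentator_ticket_number % processing_units.size()];
///     auto & read_unit = processing_units[reader_ticket_number % processing_units.size()];
///     /// segmentator waits for READY_TO_INSERT, fills the unit, marks READY_TO_PARSE;
///     /// a parser turns READY_TO_PARSE into READY_TO_READ;
///     /// readImpl() consumes READY_TO_READ and hands the unit back as READY_TO_INSERT.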
class ParallelParsingBlockInputStream : public IBlockInputStream
{
private:
using ReadCallback = std::function<void()>;
using InputProcessorCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,
const RowInputFormatParams & params,
const FormatSettings & settings)>;
public:
struct InputCreatorParams
{
const Block & sample;
const RowInputFormatParams & row_input_format_params;
const FormatSettings &settings;
};
struct Params
{
ReadBuffer & read_buffer;
const InputProcessorCreator & input_processor_creator;
const InputCreatorParams & input_creator_params;
FormatFactory::FileSegmentationEngine file_segmentation_engine;
size_t max_threads;
size_t min_chunk_bytes;
};
explicit ParallelParsingBlockInputStream(const Params & params);
~ParallelParsingBlockInputStream() override;
String getName() const override { return "ParallelParsing"; }
Block getHeader() const override { return header; }
void cancel(bool kill) override;
protected:
// Reader routine
Block readImpl() override;
const BlockMissingValues & getMissingValues() const override
{
return last_block_missing_values;
}
private:
const Block header;
const RowInputFormatParams row_input_format_params;
const FormatSettings format_settings;
const InputProcessorCreator input_processor_creator;
const size_t min_chunk_bytes;
/*
* This is declared as atomic to avoid UB, because parser threads access it
* without synchronization.
*/
std::atomic<bool> finished{false};
BlockMissingValues last_block_missing_values;
// Original ReadBuffer to read from.
ReadBuffer & original_buffer;
// Non-atomic because it is used in one thread.
std::optional<size_t> next_block_in_current_unit;
size_t segmentator_ticket_number{0};
size_t reader_ticket_number{0};
std::mutex mutex;
std::condition_variable reader_condvar;
std::condition_variable segmentator_condvar;
// There are multiple "parsers", that's why we use thread pool.
ThreadPool pool;
// Reading and segmentating the file
ThreadFromGlobalPool segmentator_thread;
// Function to segment the file. Then "parsers" will parse that segments.
FormatFactory::FileSegmentationEngine file_segmentation_engine;
enum ProcessingUnitStatus
{
READY_TO_INSERT,
READY_TO_PARSE,
READY_TO_READ
};
struct BlockExt
{
std::vector<Block> block;
std::vector<BlockMissingValues> block_missing_values;
};
struct ProcessingUnit
{
explicit ProcessingUnit()
: status(ProcessingUnitStatus::READY_TO_INSERT)
{
}
BlockExt block_ext;
Memory<> segment;
std::atomic<ProcessingUnitStatus> status;
/// Needed for better exception message.
size_t offset = 0;
bool is_last{false};
};
std::exception_ptr background_exception = nullptr;
// We use deque instead of vector, because it does not require a move
// constructor, which is absent for atomics that are inside ProcessingUnit.
std::deque<ProcessingUnit> processing_units;
/// Compute it to have a more understandable error message.
size_t successfully_read_rows_count{0};
void scheduleParserThreadForUnitWithNumber(size_t ticket_number);
void finishAndWait();
void segmentatorThreadFunction(ThreadGroupStatusPtr thread_group);
void parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number);
// Save/log a background exception, set termination flag, wake up all
// threads. This function is used by the segmentator and parser threads.
// readImpl() is called from the main thread, so the exception handling
// is different.
void onBackgroundException(size_t offset);
};
}

View File

@ -35,7 +35,6 @@ SRCS(
MongoDBBlockInputStream.cpp
NativeBlockInputStream.cpp
NativeBlockOutputStream.cpp
ParallelParsingBlockInputStream.cpp
PushingToViewsBlockOutputStream.cpp
RemoteBlockInputStream.cpp
RemoteBlockOutputStream.cpp

View File

@ -184,7 +184,7 @@ BlockInputStreamPtr ExecutableDictionarySource::loadIds(const std::vector<UInt64
context, format, sample_block, command, log,
[&ids, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputFormat(format, out, sample_block);
auto output_stream = context.getOutputStream(format, out, sample_block);
formatIDs(output_stream, ids);
out.close();
});
@ -198,7 +198,7 @@ BlockInputStreamPtr ExecutableDictionarySource::loadKeys(const Columns & key_col
context, format, sample_block, command, log,
[key_columns, &requested_rows, this](WriteBufferFromFile & out) mutable
{
auto output_stream = context.getOutputFormat(format, out, sample_block);
auto output_stream = context.getOutputStream(format, out, sample_block);
formatKeys(dict_struct, output_stream, key_columns, requested_rows);
out.close();
});

View File

@ -134,7 +134,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadIds(const std::vector<UInt64> & id
ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = [&](std::ostream & ostr)
{
WriteBufferFromOStream out_buffer(ostr);
auto output_stream = context.getOutputFormat(format, out_buffer, sample_block);
auto output_stream = context.getOutputStream(format, out_buffer, sample_block);
formatIDs(output_stream, ids);
};
@ -153,7 +153,7 @@ BlockInputStreamPtr HTTPDictionarySource::loadKeys(const Columns & key_columns,
ReadWriteBufferFromHTTP::OutStreamCallback out_stream_callback = [&](std::ostream & ostr)
{
WriteBufferFromOStream out_buffer(ostr);
auto output_stream = context.getOutputFormat(format, out_buffer, sample_block);
auto output_stream = context.getOutputStream(format, out_buffer, sample_block);
formatKeys(dict_struct, output_stream, key_columns, requested_rows);
};

View File

@ -4,6 +4,7 @@
#include <DataStreams/IBlockInputStream.h>
#include <DataTypes/DataTypeString.h>
#include <Formats/FormatFactory.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <IO/ReadWriteBufferFromHTTP.h>
#include <IO/WriteHelpers.h>
#include <IO/ConnectionTimeoutsContext.h>
@ -47,8 +48,8 @@ namespace
: name(name_)
{
read_buf = std::make_unique<ReadWriteBufferFromHTTP>(uri, Poco::Net::HTTPRequest::HTTP_POST, callback, timeouts);
reader
= FormatFactory::instance().getInput(IXDBCBridgeHelper::DEFAULT_FORMAT, *read_buf, sample_block, context, max_block_size);
auto format = FormatFactory::instance().getInput(IXDBCBridgeHelper::DEFAULT_FORMAT, *read_buf, sample_block, context, max_block_size);
reader = std::make_shared<InputStreamFromInputFormat>(format);
}
Block getHeader() const override { return reader->getHeader(); }

View File

@ -5,18 +5,22 @@
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#include <DataStreams/MaterializingBlockOutputStream.h>
#include <DataStreams/ParallelParsingBlockInputStream.h>
#include <DataStreams/SquashingBlockOutputStream.h>
#include <DataStreams/NativeBlockInputStream.h>
#include <Formats/FormatSettings.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Processors/Formats/IRowOutputFormat.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <Processors/Formats/OutputStreamToOutputFormat.h>
#include <DataStreams/NativeBlockInputStream.h>
#include <Processors/Formats/Impl/ValuesBlockInputFormat.h>
#include <Processors/Formats/Impl/MySQLOutputFormat.h>
#include <Processors/Formats/Impl/PostgreSQLOutputFormat.h>
#include <Processors/Formats/Impl/NativeFormat.h>
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
#include <Poco/URI.h>
#include <IO/ReadHelpers.h>
#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
#endif
@ -132,7 +136,7 @@ FormatSettings getFormatSettings<Settings>(const Context & context,
const Settings & settings);
BlockInputStreamPtr FormatFactory::getInput(
InputFormatPtr FormatFactory::getInput(
const String & name,
ReadBuffer & buf,
const Block & sample,
@ -141,19 +145,14 @@ BlockInputStreamPtr FormatFactory::getInput(
const std::optional<FormatSettings> & _format_settings) const
{
if (name == "Native")
return std::make_shared<NativeBlockInputStream>(buf, sample, 0);
return std::make_shared<NativeInputFormatFromNativeBlockInputStream>(sample, buf);
auto format_settings = _format_settings
? *_format_settings : getFormatSettings(context);
if (!getCreators(name).input_processor_creator)
{
const auto & input_getter = getCreators(name).input_creator;
if (!input_getter)
throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
return input_getter(buf, sample, max_block_size, {}, format_settings);
throw Exception("Format " + name + " is not suitable for input (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
}
const Settings & settings = context.getSettingsRef();
@ -166,6 +165,9 @@ BlockInputStreamPtr FormatFactory::getInput(
if (settings.max_memory_usage && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage)
parallel_parsing = false;
if (settings.max_memory_usage_for_user && settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2 > settings.max_memory_usage_for_user)
parallel_parsing = false;
if (parallel_parsing && name == "JSONEachRow")
{
/// FIXME ParallelParsingBlockInputStream doesn't support formats with non-trivial readPrefix() and readSuffix()
@ -179,8 +181,6 @@ BlockInputStreamPtr FormatFactory::getInput(
if (parallel_parsing)
{
const auto & input_getter = getCreators(name).input_processor_creator;
if (!input_getter)
throw Exception("Format " + name + " is not suitable for input", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_INPUT);
RowInputFormatParams row_input_format_params;
row_input_format_params.max_block_size = max_block_size;
@ -189,23 +189,56 @@ BlockInputStreamPtr FormatFactory::getInput(
row_input_format_params.max_execution_time = settings.max_execution_time;
row_input_format_params.timeout_overflow_mode = settings.timeout_overflow_mode;
auto input_creator_params =
ParallelParsingBlockInputStream::InputCreatorParams{sample,
row_input_format_params, format_settings};
ParallelParsingBlockInputStream::Params params{buf, input_getter,
input_creator_params, file_segmentation_engine,
settings.max_threads,
settings.min_chunk_bytes_for_parallel_parsing};
return std::make_shared<ParallelParsingBlockInputStream>(params);
/// Const reference is copied to lambda.
auto parser_creator = [input_getter, sample, row_input_format_params, format_settings]
(ReadBuffer & input) -> InputFormatPtr
{ return input_getter(input, sample, row_input_format_params, format_settings); };
ParallelParsingInputFormat::Params params{
buf, sample, parser_creator, file_segmentation_engine, name, settings.max_threads, settings.min_chunk_bytes_for_parallel_parsing};
return std::make_shared<ParallelParsingInputFormat>(params);
}
auto format = getInputFormat(name, buf, sample, context, max_block_size,
format_settings);
return std::make_shared<InputStreamFromInputFormat>(std::move(format));
auto format = getInputFormat(name, buf, sample, context, max_block_size, format_settings);
return format;
}
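/// Illustration (not part of the patch) of the memory guard above: parallel
/// parsing keeps roughly two chunks of min_chunk_bytes in flight per thread, so
/// with the default 10 MiB chunks and, say, 16 threads it needs about
/// 10 MiB * 16 * 2 == 320 MiB. If max_memory_usage (or the per-user limit) is
/// below that, the factory silently falls back to single-threaded parsing:
///
///     size_t needed = settings.min_chunk_bytes_for_parallel_parsing * settings.max_threads * 2;
///     bool fits = !settings.max_memory_usage || needed <= settings.max_memory_usage;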
BlockOutputStreamPtr FormatFactory::getOutputStreamParallelIfPossible(const String & name,
WriteBuffer & buf, const Block & sample, const Context & context,
WriteCallback callback, const std::optional<FormatSettings> & _format_settings) const
{
const auto & output_getter = getCreators(name).output_processor_creator;
const Settings & settings = context.getSettingsRef();
bool parallel_formatting = settings.output_format_parallel_formatting;
if (output_getter && parallel_formatting && getCreators(name).supports_parallel_formatting
&& !settings.output_format_json_array_of_rows)
{
auto format_settings = _format_settings
? *_format_settings : getFormatSettings(context);
auto formatter_creator = [output_getter, sample, callback, format_settings]
(WriteBuffer & output) -> OutputFormatPtr
{ return output_getter(output, sample, {std::move(callback)}, format_settings);};
ParallelFormattingOutputFormat::Params params{buf, sample, formatter_creator, settings.max_threads};
auto format = std::make_shared<ParallelFormattingOutputFormat>(params);
/// Enable auto-flush for streaming mode. Currently it is needed by INSERT WATCH query.
if (format_settings.enable_streaming)
format->setAutoFlush();
return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
}
return getOutputStream(name, buf, sample, context, callback, _format_settings);
}
BlockOutputStreamPtr FormatFactory::getOutput(const String & name,
BlockOutputStreamPtr FormatFactory::getOutputStream(const String & name,
WriteBuffer & buf, const Block & sample, const Context & context,
WriteCallback callback, const std::optional<FormatSettings> & _format_settings) const
{
@ -226,10 +259,8 @@ BlockOutputStreamPtr FormatFactory::getOutput(const String & name,
sample);
}
auto format = getOutputFormat(name, buf, sample, context, std::move(callback),
format_settings);
return std::make_shared<MaterializingBlockOutputStream>(
std::make_shared<OutputStreamToOutputFormat>(format), sample);
auto format = getOutputFormat(name, buf, sample, context, std::move(callback), _format_settings);
return std::make_shared<MaterializingBlockOutputStream>(std::make_shared<OutputStreamToOutputFormat>(format), sample);
}
@ -266,6 +297,35 @@ InputFormatPtr FormatFactory::getInputFormat(
return format;
}
OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible(
const String & name, WriteBuffer & buf, const Block & sample,
const Context & context, WriteCallback callback,
const std::optional<FormatSettings> & _format_settings) const
{
const auto & output_getter = getCreators(name).output_processor_creator;
if (!output_getter)
throw Exception("Format " + name + " is not suitable for output (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
auto format_settings = _format_settings
? *_format_settings : getFormatSettings(context);
const Settings & settings = context.getSettingsRef();
if (settings.output_format_parallel_formatting && getCreators(name).supports_parallel_formatting
&& !settings.output_format_json_array_of_rows)
{
auto formatter_creator = [output_getter, sample, callback, format_settings]
(WriteBuffer & output) -> OutputFormatPtr
{ return output_getter(output, sample, {std::move(callback)}, format_settings);};
ParallelFormattingOutputFormat::Params builder{buf, sample, formatter_creator, settings.max_threads};
return std::make_shared<ParallelFormattingOutputFormat>(builder);
}
return getOutputFormat(name, buf, sample, context, callback, _format_settings);
}
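/// Usage sketch (illustration only): callers that can accept either formatter
/// just ask for the parallel variant and let the factory decide:
///
///     auto out = FormatFactory::instance().getOutputFormatParallelIfPossible(
///         "CSV", write_buffer, header, context);
///     /// yields a ParallelFormattingOutputFormat when the format was registered with
///     /// markOutputFormatSupportsParallelFormatting() and the setting is enabled,
///     /// otherwise the ordinary single-threaded output format.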
OutputFormatPtr FormatFactory::getOutputFormat(
const String & name, WriteBuffer & buf, const Block & sample,
@ -274,7 +334,7 @@ OutputFormatPtr FormatFactory::getOutputFormat(
{
const auto & output_getter = getCreators(name).output_processor_creator;
if (!output_getter)
throw Exception("Format " + name + " is not suitable for output", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
throw Exception("Format " + name + " is not suitable for output (with processors)", ErrorCodes::FORMAT_IS_NOT_SUITABLE_FOR_OUTPUT);
RowOutputFormatParams params;
params.callback = std::move(callback);
@ -339,6 +399,16 @@ void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegm
target = std::move(file_segmentation_engine);
}
void FormatFactory::markOutputFormatSupportsParallelFormatting(const String & name)
{
auto & target = dict[name].supports_parallel_formatting;
if (target)
throw Exception("FormatFactory: Output format " + name + " is already marked as supporting parallel formatting.", ErrorCodes::LOGICAL_ERROR);
target = true;
}
FormatFactory & FormatFactory::instance()
{
static FormatFactory ret;

View File

@ -79,11 +79,13 @@ private:
WriteCallback callback,
const FormatSettings & settings)>;
using InputProcessorCreator = std::function<InputFormatPtr(
ReadBuffer & buf,
const Block & header,
const RowInputFormatParams & params,
const FormatSettings & settings)>;
using InputProcessorCreatorFunc = InputFormatPtr(
ReadBuffer & buf,
const Block & header,
const RowInputFormatParams & params,
const FormatSettings & settings);
using InputProcessorCreator = std::function<InputProcessorCreatorFunc>;
using OutputProcessorCreator = std::function<OutputFormatPtr(
WriteBuffer & buf,
@ -98,6 +100,7 @@ private:
InputProcessorCreator input_processor_creator;
OutputProcessorCreator output_processor_creator;
FileSegmentationEngine file_segmentation_engine;
bool supports_parallel_formatting{false};
};
using FormatsDictionary = std::unordered_map<String, Creators>;
@ -105,7 +108,7 @@ private:
public:
static FormatFactory & instance();
BlockInputStreamPtr getInput(
InputFormatPtr getInput(
const String & name,
ReadBuffer & buf,
const Block & sample,
@ -113,7 +116,14 @@ public:
UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
BlockOutputStreamPtr getOutput(const String & name, WriteBuffer & buf,
/// Checks all preconditions. Returns ordinary stream if parallel formatting cannot be done.
/// Currently used only in Client. Don't use it for anything else! Better look at getOutputFormatParallelIfPossible.
BlockOutputStreamPtr getOutputStreamParallelIfPossible(const String & name, WriteBuffer & buf,
const Block & sample, const Context & context, WriteCallback callback = {},
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
/// Currently used only in Client. Don't use it for anything else! Better look at getOutputFormat.
BlockOutputStreamPtr getOutputStream(const String & name, WriteBuffer & buf,
const Block & sample, const Context & context, WriteCallback callback = {},
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
@ -125,6 +135,12 @@ public:
UInt64 max_block_size,
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
/// Checks all preconditions. Returns ordinary format if parallel formatting cannot be done.
OutputFormatPtr getOutputFormatParallelIfPossible(
const String & name, WriteBuffer & buf, const Block & sample,
const Context & context, WriteCallback callback = {},
const std::optional<FormatSettings> & format_settings = std::nullopt) const;
OutputFormatPtr getOutputFormat(
const String & name, WriteBuffer & buf, const Block & sample,
const Context & context, WriteCallback callback = {},
@ -138,6 +154,8 @@ public:
void registerInputFormatProcessor(const String & name, InputProcessorCreator input_creator);
void registerOutputFormatProcessor(const String & name, OutputProcessorCreator output_creator);
void markOutputFormatSupportsParallelFormatting(const String & name);
const FormatsDictionary & getAllFormats() const
{
return dict;

View File

@ -18,7 +18,6 @@ target_link_libraries(clickhouse_functions
clickhouse_dictionaries_embedded
clickhouse_parsers
consistent-hashing
consistent-hashing-sumbur
dbms
metrohash
murmurhash

View File

@ -416,8 +416,8 @@ private:
DivideIntegralImpl<NativeResultType, NativeResultType>, /// substitute divide by intDiv (throw on division by zero)
Operation<NativeResultType, NativeResultType>>;
template <OpCase op_case, OpCase target>
static auto unwrap(const auto& elem, size_t i)
template <OpCase op_case, OpCase target, class E>
static auto unwrap(const E& elem, size_t i)
{
if constexpr (op_case == target)
return undec(elem);
@ -744,8 +744,8 @@ class FunctionBinaryArithmetic : public IFunction
return function->execute(new_arguments, result_type, input_rows_count);
}
template <class T, class ResultDataType>
static auto helperGetOrConvert(const auto & col_const, const auto & col)
template <class T, class ResultDataType, class CC, class C>
static auto helperGetOrConvert(const CC & col_const, const C & col)
{
using ResultType = typename ResultDataType::FieldType;
using NativeResultType = typename NativeType<ResultType>::Type;
@ -756,8 +756,9 @@ class FunctionBinaryArithmetic : public IFunction
return col_const->template getValue<T>();
}
template <OpCase op_case, bool left_decimal, bool right_decimal, class OpImpl, class OpImplCheck>
void helperInvokeEither(const auto& left, const auto& right, auto& vec_res, auto scale_a, auto scale_b) const
template <OpCase op_case, bool left_decimal, bool right_decimal, class OpImpl, class OpImplCheck,
class L, class R, class VR, class SA, class SB>
void helperInvokeEither(const L& left, const R& right, VR& vec_res, SA scale_a, SB scale_b) const
{
if (check_decimal_overflow)
OpImplCheck::template process<op_case, left_decimal, right_decimal>(left, right, vec_res, scale_a, scale_b);
@ -765,11 +766,12 @@ class FunctionBinaryArithmetic : public IFunction
OpImpl::template process<op_case, left_decimal, right_decimal>(left, right, vec_res, scale_a, scale_b);
}
template <class LeftDataType, class RightDataType, class ResultDataType>
template <class LeftDataType, class RightDataType, class ResultDataType,
class L, class R, class CL, class CR>
ColumnPtr executeNumericWithDecimal(
const auto & left, const auto & right,
const L & left, const R & right,
const ColumnConst * const col_left_const, const ColumnConst * const col_right_const,
const auto * const col_left, const auto * const col_right,
const CL * const col_left, const CR * const col_right,
size_t col_left_size) const
{
using T0 = typename LeftDataType::FieldType;

View File

@ -1,7 +1,8 @@
#pragma once
#include <Common/hex.h>
#include <Common/formatIPv6.h>
#include <Common/hex.h>
#include <Common/IPv6ToBinary.h>
#include <Common/typeid_cast.h>
#include <IO/WriteHelpers.h>
#include <DataTypes/DataTypeFactory.h>
@ -1617,20 +1618,28 @@ public:
class FunctionIPv6CIDRToRange : public IFunction
{
private:
/// TODO Inefficient.
#if defined(__SSE2__)
#include <emmintrin.h>
static inline void applyCIDRMask(const UInt8 * __restrict src, UInt8 * __restrict dst_lower, UInt8 * __restrict dst_upper, UInt8 bits_to_keep)
{
__m128i mask = _mm_loadu_si128(reinterpret_cast<const __m128i *>(getCIDRMaskIPv6(bits_to_keep)));
__m128i lower = _mm_and_si128(_mm_loadu_si128(reinterpret_cast<const __m128i *>(src)), mask);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst_lower), lower);
__m128i inv_mask = _mm_xor_si128(mask, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
__m128i upper = _mm_or_si128(lower, inv_mask);
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst_upper), upper);
}
#else
/// NOTE IPv6 is stored in memory in big-endian format, which creates some difficulties.
static void applyCIDRMask(const UInt8 * __restrict src, UInt8 * __restrict dst_lower, UInt8 * __restrict dst_upper, UInt8 bits_to_keep)
{
UInt8 mask[16]{};
UInt8 bytes_to_keep = bits_to_keep / 8;
UInt8 bits_to_keep_in_last_byte = bits_to_keep % 8;
for (size_t i = 0; i < bits_to_keep / 8; ++i)
mask[i] = 0xFFU;
if (bits_to_keep_in_last_byte)
mask[bytes_to_keep] = 0xFFU << (8 - bits_to_keep_in_last_byte);
const auto * mask = getCIDRMaskIPv6(bits_to_keep);
for (size_t i = 0; i < 16; ++i)
{
@ -1639,6 +1648,8 @@ private:
}
}
#endif
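For reference, a minimal standalone sketch of the same masking idea (a hypothetical helper, independent of getCIDRMaskIPv6 above): the lower bound keeps the prefix bits and zeroes the host bits; the upper bound sets all host bits to one.

#include <cstdint>
#include <cstddef>

/// Hypothetical standalone sketch: derive the [lower, upper] range of an IPv6 CIDR block.
static void cidrRangeIPv6(const uint8_t * addr, uint8_t bits_to_keep, uint8_t * lower, uint8_t * upper)
{
    for (size_t i = 0; i < 16; ++i)
    {
        uint8_t mask;
        if (bits_to_keep >= 8 * (i + 1))
            mask = 0xFF; /// byte lies fully inside the prefix
        else if (bits_to_keep <= 8 * i)
            mask = 0x00; /// byte lies fully outside the prefix
        else
            mask = static_cast<uint8_t>(0xFF << (8 * (i + 1) - bits_to_keep)); /// partially covered byte
        lower[i] = addr[i] & mask;                         /// prefix kept, host bits zeroed
        upper[i] = lower[i] | static_cast<uint8_t>(~mask); /// host bits set to one
    }
}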
public:
static constexpr auto name = "IPv6CIDRToRange";
static FunctionPtr create(const Context &) { return std::make_shared<FunctionIPv6CIDRToRange>(); }

View File

@ -2,7 +2,6 @@
#include <Functions/FunctionFactory.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Common/UnicodeBar.h>
#include <Common/FieldVisitors.h>
#include <IO/WriteHelpers.h>
@ -57,23 +56,30 @@ public:
+ ".",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isNativeNumber(arguments[0]) || !isNativeNumber(arguments[1]) || !isNativeNumber(arguments[2])
|| (arguments.size() == 4 && !isNativeNumber(arguments[3])))
if (!isNumber(arguments[0]) || !isNumber(arguments[1]) || !isNumber(arguments[2])
|| (arguments.size() == 4 && !isNumber(arguments[3])))
throw Exception("All arguments for function " + getName() + " must be numeric.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2, 3}; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {3}; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
Int64 min = extractConstant<Int64>(arguments, 1, "Second"); /// The level at which the line has zero length.
Int64 max = extractConstant<Int64>(arguments, 2, "Third"); /// The level at which the line has the maximum length.
/// The maximum width of the bar in characters.
Float64 max_width = 80; /// Motivated by old-school terminal size.
/// The maximum width of the bar in characters, by default.
Float64 max_width = arguments.size() == 4 ? extractConstant<Float64>(arguments, 3, "Fourth") : 80;
if (arguments.size() == 4)
{
const auto & max_width_column = *arguments[3].column;
if (!isColumnConst(max_width_column))
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Fourth argument for function {} must be constant", getName());
max_width = max_width_column.getFloat64(0);
}
if (isNaN(max_width))
throw Exception("Argument 'max_width' must not be NaN", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -86,83 +92,32 @@ public:
const auto & src = *arguments[0].column;
auto res_column = ColumnString::create();
if (executeNumber<UInt8>(src, *res_column, min, max, max_width)
|| executeNumber<UInt16>(src, *res_column, min, max, max_width)
|| executeNumber<UInt32>(src, *res_column, min, max, max_width)
|| executeNumber<UInt64>(src, *res_column, min, max, max_width)
|| executeNumber<Int8>(src, *res_column, min, max, max_width)
|| executeNumber<Int16>(src, *res_column, min, max, max_width)
|| executeNumber<Int32>(src, *res_column, min, max, max_width)
|| executeNumber<Int64>(src, *res_column, min, max, max_width)
|| executeNumber<Float32>(src, *res_column, min, max, max_width)
|| executeNumber<Float64>(src, *res_column, min, max, max_width))
{
return res_column;
}
else
throw Exception(
"Illegal column " + arguments[0].column->getName() + " of argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
private:
template <typename T>
T extractConstant(const ColumnsWithTypeAndName & arguments, size_t argument_pos, const char * which_argument) const
{
const auto & column = *arguments[argument_pos].column;
if (!isColumnConst(column))
throw Exception(
which_argument + String(" argument for function ") + getName() + " must be constant.", ErrorCodes::ILLEGAL_COLUMN);
return applyVisitor(FieldVisitorConvertToNumber<T>(), column[0]);
}
template <typename T>
static void fill(const PaddedPODArray<T> & src,
ColumnString::Chars & dst_chars,
ColumnString::Offsets & dst_offsets,
Int64 min,
Int64 max,
Float64 max_width)
{
size_t size = src.size();
size_t current_offset = 0;
dst_offsets.resize(size);
dst_chars.reserve(size * (UnicodeBar::getWidthInBytes(max_width) + 1)); /// lines 0-terminated.
auto res_column = ColumnString::create();
for (size_t i = 0; i < size; ++i)
ColumnString::Chars & dst_chars = res_column->getChars();
ColumnString::Offsets & dst_offsets = res_column->getOffsets();
dst_offsets.resize(input_rows_count);
dst_chars.reserve(input_rows_count * (UnicodeBar::getWidthInBytes(max_width) + 1)); /// strings are 0-terminated.
for (size_t i = 0; i < input_rows_count; ++i)
{
Float64 width = UnicodeBar::getWidth(src[i], min, max, max_width);
Float64 width = UnicodeBar::getWidth(
src.getFloat64(i),
arguments[1].column->getFloat64(i),
arguments[2].column->getFloat64(i),
max_width);
size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1;
dst_chars.resize(next_size);
UnicodeBar::render(width, reinterpret_cast<char *>(&dst_chars[current_offset]));
current_offset = next_size;
dst_offsets[i] = current_offset;
}
}
template <typename T>
static void fill(T src, String & dst_chars, Int64 min, Int64 max, Float64 max_width)
{
Float64 width = UnicodeBar::getWidth(src, min, max, max_width);
dst_chars.resize(UnicodeBar::getWidthInBytes(width));
UnicodeBar::render(width, dst_chars.data());
}
template <typename T>
static bool executeNumber(const IColumn & src, ColumnString & dst, Int64 min, Int64 max, Float64 max_width)
{
if (const ColumnVector<T> * col = checkAndGetColumn<ColumnVector<T>>(&src))
{
fill(col->getData(), dst.getChars(), dst.getOffsets(), min, max, max_width);
return true;
}
else
return false;
return res_column;
}
};
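For clarity, a small sketch of the width computation the code above delegates to UnicodeBar::getWidth (assumed semantics: linear scaling from [min, max] onto [0, max_width], clamped at both ends):

/// Assumed semantics of UnicodeBar::getWidth, as a standalone sketch.
static double barWidth(double x, double min, double max, double max_width)
{
    if (max <= min || x <= min)
        return 0;
    if (x >= max)
        return max_width;
    return (x - min) / (max - min) * max_width; /// proportional part of the bar
}

With min = 0, max = 100 and max_width = 80, a value of 25 would render a 20-character bar.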

View File

@ -1,22 +1,14 @@
namespace DB
{
class FunctionFactory;
void registerFunctionYandexConsistentHash(FunctionFactory & factory);
void registerFunctionJumpConsistentHash(FunctionFactory & factory);
#if !defined(ARCADIA_BUILD)
void registerFunctionSumburConsistentHash(FunctionFactory & factory);
#endif
void registerFunctionsConsistentHashing(FunctionFactory & factory)
{
registerFunctionYandexConsistentHash(factory);
registerFunctionJumpConsistentHash(factory);
#if !defined(ARCADIA_BUILD)
registerFunctionSumburConsistentHash(factory);
#endif
}
}

View File

@ -1,38 +0,0 @@
#include "FunctionsConsistentHashing.h"
#include <Functions/FunctionFactory.h>
#include <sumbur.h>
namespace DB
{
namespace
{
struct SumburConsistentHashImpl
{
static constexpr auto name = "sumburConsistentHash";
using HashType = UInt32;
using ResultType = UInt16;
using BucketsType = ResultType;
static constexpr auto max_buckets = static_cast<UInt64>(std::numeric_limits<BucketsType>::max());
static inline ResultType apply(HashType hash, BucketsType n)
{
return static_cast<ResultType>(sumburConsistentHash(hash, n));
}
};
using FunctionSumburConsistentHash = FunctionConsistentHashImpl<SumburConsistentHashImpl>;
}
void registerFunctionSumburConsistentHash(FunctionFactory & factory)
{
factory.registerFunction<FunctionSumburConsistentHash>();
}
}

View File

@ -35,7 +35,7 @@ PEERDIR(
# "Arcadia" build is slightly deficient. It lacks many libraries that we need.
SRCS(
<? find . -name '*.cpp' | grep -i -v -P 'tests|Bitmap|sumbur|abtesting' | sed 's/^\.\// /' | sort ?>
<? find . -name '*.cpp' | grep -i -v -P 'tests|Bitmap|abtesting' | sed 's/^\.\// /' | sort ?>
)
END()

View File

@ -149,4 +149,35 @@ public:
};
/** Buffer that could write data to external memory which came from outside
* Template parameter: ReadBuffer or WriteBuffer
*/
template <typename Base>
class BufferWithOutsideMemory : public Base
{
protected:
Memory<> & memory;
public:
explicit BufferWithOutsideMemory(Memory<> & memory_)
: Base(memory_.data(), memory_.size()), memory(memory_)
{
Base::set(memory.data(), memory.size(), 0);
Base::padded = false;
}
size_t getActualSize()
{
return Base::count();
}
private:
void nextImpl() override final
{
const size_t prev_size = Base::position() - memory.data();
memory.resize(2 * prev_size + 1);
Base::set(memory.data() + prev_size, memory.size() - prev_size, 0);
}
};
}
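A hypothetical usage sketch of the class above: the caller keeps ownership of the Memory<>, and nextImpl() grows it geometrically (2 * prev_size + 1) while the already written prefix stays in place.

#include <IO/BufferWithOwnMemory.h>
#include <cstddef>

void formatIntoCallerMemory()
{
    DB::Memory<> memory;                                      /// owned by the caller, not the buffer
    DB::BufferWithOutsideMemory<DB::WriteBuffer> out(memory);
    out.write("hello", 5);                                    /// may trigger nextImpl() and resize `memory`
    size_t written = out.getActualSize();                     /// number of bytes actually written
    (void)written;
}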

View File

@ -27,6 +27,8 @@ namespace ErrorCodes
class WriteBuffer : public BufferBase
{
public:
using BufferBase::set;
using BufferBase::position;
WriteBuffer(Position ptr, size_t size) : BufferBase(ptr, size, 0) {}
void set(Position ptr, size_t size) { BufferBase::set(ptr, size, 0); }

View File

@ -35,7 +35,7 @@ private:
/// tear down the entire WriteBuffer thing and implement it again,
/// properly.
size_t continuation_size = std::max(size_t(1),
std::max(count(), arena.remainingSpaceInCurrentChunk()));
std::max(count(), arena.remainingSpaceInCurrentMemoryChunk()));
/// The allocContinue method may move the memory region to a new place and modify the "begin" pointer.

View File

@ -16,6 +16,7 @@
#include <Compression/ICompressionCodec.h>
#include <Core/BackgroundSchedulePool.h>
#include <Formats/FormatFactory.h>
#include <Processors/Formats/InputStreamFromInputFormat.h>
#include <Databases/IDatabase.h>
#include <Storages/IStorage.h>
#include <Storages/MarkCache.h>
@ -849,7 +850,17 @@ std::optional<QuotaUsage> Context::getQuotaUsage() const
void Context::setProfile(const String & profile_name)
{
applySettingsChanges(*getAccessControlManager().getProfileSettings(profile_name));
SettingsChanges profile_settings_changes = *getAccessControlManager().getProfileSettings(profile_name);
try
{
checkSettingsConstraints(profile_settings_changes);
}
catch (Exception & e)
{
e.addMessage(", while trying to set settings profile {}", profile_name);
throw;
}
applySettingsChanges(profile_settings_changes);
}
@ -2085,15 +2096,25 @@ void Context::checkPartitionCanBeDropped(const String & database, const String &
BlockInputStreamPtr Context::getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size) const
{
return FormatFactory::instance().getInput(name, buf, sample, *this, max_block_size);
return std::make_shared<InputStreamFromInputFormat>(FormatFactory::instance().getInput(name, buf, sample, *this, max_block_size));
}
BlockOutputStreamPtr Context::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const
BlockOutputStreamPtr Context::getOutputStreamParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const
{
return FormatFactory::instance().getOutput(name, buf, sample, *this);
return FormatFactory::instance().getOutputStreamParallelIfPossible(name, buf, sample, *this);
}
OutputFormatPtr Context::getOutputFormatProcessor(const String & name, WriteBuffer & buf, const Block & sample) const
BlockOutputStreamPtr Context::getOutputStream(const String & name, WriteBuffer & buf, const Block & sample) const
{
return FormatFactory::instance().getOutputStream(name, buf, sample, *this);
}
OutputFormatPtr Context::getOutputFormatParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const
{
return FormatFactory::instance().getOutputFormatParallelIfPossible(name, buf, sample, *this);
}
OutputFormatPtr Context::getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const
{
return FormatFactory::instance().getOutputFormat(name, buf, sample, *this);
}

View File

@ -425,9 +425,13 @@ public:
/// I/O formats.
BlockInputStreamPtr getInputFormat(const String & name, ReadBuffer & buf, const Block & sample, UInt64 max_block_size) const;
BlockOutputStreamPtr getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const;
OutputFormatPtr getOutputFormatProcessor(const String & name, WriteBuffer & buf, const Block & sample) const;
/// Don't use streams. Better look at getOutputFormat...
BlockOutputStreamPtr getOutputStreamParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const;
BlockOutputStreamPtr getOutputStream(const String & name, WriteBuffer & buf, const Block & sample) const;
OutputFormatPtr getOutputFormatParallelIfPossible(const String & name, WriteBuffer & buf, const Block & sample) const;
OutputFormatPtr getOutputFormat(const String & name, WriteBuffer & buf, const Block & sample) const;
InterserverIOHandler & getInterserverIOHandler();

View File

@ -12,6 +12,7 @@
#include <Interpreters/PartLog.h>
#include <Interpreters/Context.h>
#include <Common/CurrentThread.h>
namespace DB
{
@ -32,6 +33,7 @@ Block PartLogElement::createBlock()
return
{
{ColumnString::create(), std::make_shared<DataTypeString>(), "query_id"},
{ColumnInt8::create(), std::move(event_type_datatype), "event_type"},
{ColumnUInt16::create(), std::make_shared<DataTypeDate>(), "event_date"},
{ColumnUInt32::create(), std::make_shared<DataTypeDateTime>(), "event_time"},
@ -63,6 +65,7 @@ void PartLogElement::appendToBlock(MutableColumns & columns) const
{
size_t i = 0;
columns[i++]->insert(query_id);
columns[i++]->insert(event_type);
columns[i++]->insert(DateLUT::instance().toDayNum(event_time));
columns[i++]->insert(event_time);
@ -114,10 +117,15 @@ bool PartLog::addNewParts(Context & current_context, const PartLog::MutableDataP
if (!part_log)
return false;
auto query_id = CurrentThread::getQueryId();
for (const auto & part : parts)
{
PartLogElement elem;
if (query_id.data && query_id.size)
elem.query_id.insert(0, query_id.data, query_id.size);
elem.event_type = PartLogElement::NEW_PART;
elem.event_time = time(nullptr);
elem.duration_ms = elapsed_ns / 1000000;

View File

@ -18,6 +18,8 @@ struct PartLogElement
MOVE_PART = 6,
};
String query_id;
Type event_type = NEW_PART;
time_t event_time = 0;

View File

@ -974,7 +974,7 @@ void executeQuery(
? getIdentifierName(ast_query_with_output->format)
: context.getDefaultFormat();
BlockOutputStreamPtr out = context.getOutputFormat(format_name, *out_buf, streams.in->getHeader());
auto out = context.getOutputStream(format_name, *out_buf, streams.in->getHeader());
/// Save previous progress callback if any. TODO Do it more conveniently.
auto previous_progress_callback = context.getProgressCallback();
@ -1019,7 +1019,7 @@ void executeQuery(
return std::make_shared<MaterializingTransform>(header);
});
auto out = context.getOutputFormatProcessor(format_name, *out_buf, pipeline.getHeader());
auto out = context.getOutputFormat(format_name, *out_buf, pipeline.getHeader());
out->setAutoFlush();
/// Save previous progress callback if any. TODO Do it more conveniently.

View File

@ -5,6 +5,11 @@
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
IOutputFormat::IOutputFormat(const Block & header_, WriteBuffer & out_)
: IProcessor({header_, header_, header_}, {}), out(out_)
{
@ -30,7 +35,7 @@ IOutputFormat::Status IOutputFormat::prepare()
if (!input.hasData())
return Status::NeedData;
current_chunk = input.pull(true);
current_chunk = input.pullData(true);
current_block_kind = kind;
has_input = true;
return Status::Ready;
@ -44,23 +49,31 @@ IOutputFormat::Status IOutputFormat::prepare()
return Status::Finished;
}
static Chunk prepareTotals(Chunk chunk)
static Port::Data prepareTotals(Port::Data data)
{
if (!chunk.hasRows())
if (data.exception)
return data;
if (!data.chunk.hasRows())
return {};
if (chunk.getNumRows() > 1)
if (data.chunk.getNumRows() > 1)
{
/// This may happen if something like ARRAY JOIN was executed on totals.
/// Skip rows except the first one.
auto columns = chunk.detachColumns();
auto columns = data.chunk.detachColumns();
for (auto & column : columns)
column = column->cut(0, 1);
chunk.setColumns(std::move(columns), 1);
data.chunk.setColumns(std::move(columns), 1);
}
return chunk;
return data;
}
void IOutputFormat::consume(Chunk)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method consume is not implemented for {}", getName());
}
void IOutputFormat::work()
@ -84,17 +97,24 @@ void IOutputFormat::work()
switch (current_block_kind)
{
case Main:
result_rows += current_chunk.getNumRows();
result_bytes += current_chunk.allocatedBytes();
{
result_rows += current_chunk.chunk.getNumRows();
result_bytes += current_chunk.chunk.allocatedBytes();
consume(std::move(current_chunk));
break;
}
case Totals:
if (auto totals = prepareTotals(std::move(current_chunk)))
{
auto totals = prepareTotals(std::move(current_chunk));
if (totals.exception || totals.chunk)
consumeTotals(std::move(totals));
break;
}
case Extremes:
{
consumeExtremes(std::move(current_chunk));
break;
}
}
if (auto_flush)

View File

@ -28,7 +28,7 @@ public:
protected:
WriteBuffer & out;
Chunk current_chunk;
Port::Data current_chunk;
PortKind current_block_kind = PortKind::Main;
bool has_input = false;
bool finished = false;
@ -39,9 +39,16 @@ protected:
RowsBeforeLimitCounterPtr rows_before_limit_counter;
virtual void consume(Chunk) = 0;
friend class ParallelFormattingOutputFormat;
virtual void consume(Chunk);
virtual void consumeTotals(Chunk) {}
virtual void consumeExtremes(Chunk) {}
virtual void consume(Port::Data data) { consume(data.getChunkOrThrow()); }
virtual void consumeTotals(Port::Data data) { consumeTotals(data.getChunkOrThrow()); }
virtual void consumeExtremes(Port::Data data) { consumeExtremes(data.getChunkOrThrow()); }
virtual void finalize() {}
public:
@ -77,8 +84,19 @@ public:
virtual void doWritePrefix() {}
virtual void doWriteSuffix() { finalize(); }
void setTotals(const Block & totals) { consumeTotals(Chunk(totals.getColumns(), totals.rows())); }
void setExtremes(const Block & extremes) { consumeExtremes(Chunk(extremes.getColumns(), extremes.rows())); }
void setTotals(const Block & totals)
{
Port::Data data;
data.chunk = Chunk(totals.getColumns(), totals.rows());
consumeTotals(std::move(data));
}
void setExtremes(const Block & extremes)
{
Port::Data data;
data.chunk = Chunk(extremes.getColumns(), extremes.rows());
consumeExtremes(std::move(data));
}
size_t getResultRows() const { return result_rows; }
size_t getResultBytes() const { return result_bytes; }

View File

@ -82,6 +82,7 @@ void registerOutputFormatProcessorCSV(FormatFactory & factory)
{
return std::make_shared<CSVRowOutputFormat>(buf, sample, with_names, params, format_settings);
});
factory.markOutputFormatSupportsParallelFormatting(with_names ? "CSVWithNames" : "CSV");
}
}

View File

@ -68,7 +68,7 @@ void JSONCompactEachRowRowOutputFormat::writeTotals(const Columns & columns, siz
writeCString("]\n", out);
}
void JSONCompactEachRowRowOutputFormat::writePrefix()
void JSONCompactEachRowRowOutputFormat::doWritePrefix()
{
if (with_names)
{
@ -108,6 +108,7 @@ void registerOutputFormatProcessorJSONCompactEachRow(FormatFactory & factory)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, false);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRow");
factory.registerOutputFormatProcessor("JSONCompactEachRowWithNamesAndTypes", [](
WriteBuffer &buf,
@ -117,6 +118,7 @@ void registerOutputFormatProcessorJSONCompactEachRow(FormatFactory & factory)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, false);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactEachRowWithNamesAndTypes");
factory.registerOutputFormatProcessor("JSONCompactStringsEachRow", [](
WriteBuffer & buf,
@ -126,6 +128,7 @@ void registerOutputFormatProcessorJSONCompactEachRow(FormatFactory & factory)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, false, true);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRow");
factory.registerOutputFormatProcessor("JSONCompactStringsEachRowWithNamesAndTypes", [](
WriteBuffer &buf,
@ -135,6 +138,7 @@ void registerOutputFormatProcessorJSONCompactEachRow(FormatFactory & factory)
{
return std::make_shared<JSONCompactEachRowRowOutputFormat>(buf, sample, params, format_settings, true, true);
});
factory.markOutputFormatSupportsParallelFormatting("JSONCompactStringsEachRowWithNamesAndTypes");
}

View File

@ -25,7 +25,7 @@ public:
String getName() const override { return "JSONCompactEachRowRowOutputFormat"; }
void writePrefix() override;
void doWritePrefix() override;
void writeBeforeTotals() override {}
void writeTotals(const Columns & columns, size_t row_num) override;

View File

@ -138,6 +138,7 @@ void registerOutputFormatProcessorJSONEachRow(FormatFactory & factory)
return std::make_shared<JSONEachRowRowOutputFormat>(buf, sample, params,
settings);
});
factory.markOutputFormatSupportsParallelFormatting("JSONEachRow");
factory.registerOutputFormatProcessor("JSONStringsEachRow", [](
WriteBuffer & buf,
@ -150,6 +151,7 @@ void registerOutputFormatProcessorJSONEachRow(FormatFactory & factory)
return std::make_shared<JSONEachRowRowOutputFormat>(buf, sample, params,
settings);
});
factory.markOutputFormatSupportsParallelFormatting("JSONStringEachRow");
}
}

View File

@ -1,3 +1,4 @@
#pragma once
#include <DataStreams/NativeBlockInputStream.h>
#include <DataStreams/NativeBlockOutputStream.h>
#include <Formats/FormatFactory.h>

View File

@ -0,0 +1,201 @@
#include <Processors/Formats/Impl/ParallelFormattingOutputFormat.h>
#include <Common/setThreadName.h>
namespace DB
{
void ParallelFormattingOutputFormat::finalize()
{
need_flush = true;
IOutputFormat::finalized = true;
/// Don't throw any background_exception here, because we want to finalize the execution.
/// The exception will be checked after the main thread has finished.
addChunk(Chunk{}, ProcessingUnitType::FINALIZE, /*can_throw_exception*/ false);
collector_finished.wait();
if (collector_thread.joinable())
collector_thread.join();
{
std::unique_lock<std::mutex> lock(mutex);
if (background_exception)
std::rethrow_exception(background_exception);
}
}
void ParallelFormattingOutputFormat::addChunk(Chunk chunk, ProcessingUnitType type, bool can_throw_exception)
{
{
std::unique_lock<std::mutex> lock(mutex);
if (background_exception && can_throw_exception)
std::rethrow_exception(background_exception);
}
const auto current_unit_number = writer_unit_number % processing_units.size();
auto & unit = processing_units[current_unit_number];
{
std::unique_lock<std::mutex> lock(mutex);
writer_condvar.wait(lock,
[&]{ return unit.status == READY_TO_INSERT || emergency_stop; });
}
if (emergency_stop)
return;
assert(unit.status == READY_TO_INSERT);
unit.chunk = std::move(chunk);
/// Resize memory without deallocation.
unit.segment.resize(0);
unit.status = READY_TO_FORMAT;
unit.type = type;
scheduleFormatterThreadForUnitWithNumber(current_unit_number);
++writer_unit_number;
}
void ParallelFormattingOutputFormat::finishAndWait()
{
emergency_stop = true;
{
std::unique_lock<std::mutex> lock(mutex);
collector_condvar.notify_all();
writer_condvar.notify_all();
}
if (collector_thread.joinable())
collector_thread.join();
try
{
pool.wait();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void ParallelFormattingOutputFormat::collectorThreadFunction()
{
setThreadName("Collector");
try
{
while (!emergency_stop)
{
const auto current_unit_number = collector_unit_number % processing_units.size();
auto & unit = processing_units[current_unit_number];
{
std::unique_lock<std::mutex> lock(mutex);
collector_condvar.wait(lock,
[&]{ return unit.status == READY_TO_READ || emergency_stop; });
}
if (emergency_stop)
break;
assert(unit.status == READY_TO_READ);
/// Save a copy of the unit type: it is used after the notification below to decide whether to stop the execution.
auto copy_if_unit_type = unit.type;
/// Do main work here.
out.write(unit.segment.data(), unit.actual_memory_size);
if (need_flush.exchange(false) || auto_flush)
IOutputFormat::flush();
++collector_unit_number;
{
/// Notify other threads.
std::lock_guard<std::mutex> lock(mutex);
unit.status = READY_TO_INSERT;
writer_condvar.notify_all();
}
/// We can exit only after writing the last piece of data to the out buffer.
if (copy_if_unit_type == ProcessingUnitType::FINALIZE)
{
break;
}
}
collector_finished.set();
}
catch (...)
{
collector_finished.set();
onBackgroundException();
}
}
void ParallelFormattingOutputFormat::formatterThreadFunction(size_t current_unit_number)
{
setThreadName("Formatter");
try
{
auto & unit = processing_units[current_unit_number];
assert(unit.status == READY_TO_FORMAT);
/// We want to preallocate the memory buffer (increase its capacity)
/// and put the pointer at the beginning of the buffer.
unit.segment.resize(DBMS_DEFAULT_BUFFER_SIZE);
/// The second invocation won't release memory; it only sets the size to 0.
unit.segment.resize(0);
unit.actual_memory_size = 0;
BufferWithOutsideMemory<WriteBuffer> out_buffer(unit.segment);
auto formatter = internal_formatter_creator(out_buffer);
switch (unit.type)
{
case ProcessingUnitType::START :
{
formatter->doWritePrefix();
break;
}
case ProcessingUnitType::PLAIN :
{
formatter->consume(std::move(unit.chunk));
break;
}
case ProcessingUnitType::TOTALS :
{
formatter->consumeTotals(std::move(unit.chunk));
break;
}
case ProcessingUnitType::EXTREMES :
{
formatter->consumeExtremes(std::move(unit.chunk));
break;
}
case ProcessingUnitType::FINALIZE :
{
formatter->doWriteSuffix();
break;
}
}
/// Flush all the data to the hand-made buffer.
formatter->flush();
unit.actual_memory_size = out_buffer.getActualSize();
{
std::lock_guard<std::mutex> lock(mutex);
unit.status = READY_TO_READ;
collector_condvar.notify_all();
}
}
catch (...)
{
onBackgroundException();
}
}
}

View File

@ -0,0 +1,203 @@
#pragma once
#include <Processors/Formats/IOutputFormat.h>
#include <Common/Arena.h>
#include <Common/ThreadPool.h>
#include <common/logger_useful.h>
#include <Common/Exception.h>
#include <Formats/FormatFactory.h>
#include <Poco/Event.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/WriteBuffer.h>
#include <IO/WriteBufferFromArena.h>
#include <deque>
#include <atomic>
namespace DB
{
/**
* ORDER-PRESERVING parallel formatting of data formats.
* The idea is similar to ParallelParsingInputFormat.
* You add Chunks through the consume() method; each Chunk is formatted by some thread
* in the ThreadPool into a temporary buffer. (Formatting is done in parallel.)
* Then another thread appends the temporary buffers to the "real" WriteBuffer.
*
* Formatters
* | | | | | | | | | |
* v v v v v v v v v v
* |---|---|---|---|---|---|---|---|---|---|
* | 1 | 2 | 3 | 4 | 5 | . | . | . | . | N | <-- Processing units
* |---|---|---|---|---|---|---|---|---|---|
* ^ ^
* | |
* Collector addChunk
*
* There is a container of ProcessingUnits - internal entity, storing a Chunk to format,
* a continuous memory buffer to store the formatted Chunk and some flags for synchronization needs.
* Each ProcessingUnit has a unique number - its place in the container.
* So, a Chunk is added through the addChunk() method, which waits until the current ProcessingUnit is ready for insertion
* (ProcessingUnitStatus == READY_TO_INSERT), changes its status to READY_TO_FORMAT and spawns a new task in the ThreadPool to format it.
* The other thread, which we call the Collector, waits until the ProcessingUnit it points to becomes READY_TO_READ.
* Then it appends that unit's temporary buffer to the real WriteBuffer.
* Both the Collector and the thread which adds Chunks keep a unit_number - an index of the ProcessingUnit they aim to work with.
*
* Note that collector_unit_number is always less than or equal to writer_unit_number; that is why the formatting is order-preserving.
*
* To stop the execution, a fake Chunk is added (ProcessingUnitType = FINALIZE), and the finalize()
* function blocks until the Collector thread is done.
*/
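A condensed, hypothetical sketch of the handoff protocol described above (simplified: single writer, single collector, no exception or emergency-stop handling):

#include <condition_variable>
#include <mutex>
#include <vector>

struct UnitRing
{
    enum Status { READY_TO_INSERT, READY_TO_FORMAT, READY_TO_READ };

    std::vector<Status> units = std::vector<Status>(8, READY_TO_INSERT);
    std::mutex mutex;
    std::condition_variable writer_condvar, collector_condvar;
    size_t writer_unit_number = 0, collector_unit_number = 0;

    size_t add() /// writer thread: claim the next unit in ticket order
    {
        size_t n = writer_unit_number++ % units.size();
        std::unique_lock<std::mutex> lock(mutex);
        writer_condvar.wait(lock, [&] { return units[n] == READY_TO_INSERT; });
        units[n] = READY_TO_FORMAT; /// a formatter task is scheduled for unit n
        return n;
    }

    void formatted(size_t n) /// formatter thread: unit n is fully formatted
    {
        std::lock_guard<std::mutex> lock(mutex);
        units[n] = READY_TO_READ;
        collector_condvar.notify_all();
    }

    void collect() /// collector thread: consume units strictly in ticket order
    {
        size_t n = collector_unit_number++ % units.size();
        std::unique_lock<std::mutex> lock(mutex);
        collector_condvar.wait(lock, [&] { return units[n] == READY_TO_READ; });
        units[n] = READY_TO_INSERT; /// the writer may now reuse this unit
        writer_condvar.notify_all();
    }
};

Because the collector advances strictly in ticket order, buffers reach the real WriteBuffer in the order the Chunks arrived, no matter which formatter finishes first.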
class ParallelFormattingOutputFormat : public IOutputFormat
{
public:
/// Used to recreate formatter on every new data piece.
using InternalFormatterCreator = std::function<OutputFormatPtr(WriteBuffer & buf)>;
/// Struct to simplify constructor.
struct Params
{
WriteBuffer & out;
const Block & header;
InternalFormatterCreator internal_formatter_creator;
const size_t max_threads_for_parallel_formatting;
};
ParallelFormattingOutputFormat() = delete;
explicit ParallelFormattingOutputFormat(Params params)
: IOutputFormat(params.header, params.out)
, internal_formatter_creator(params.internal_formatter_creator)
, pool(params.max_threads_for_parallel_formatting)
{
/// Just a heuristic. We need one thread for collecting, one thread for receiving chunks,
/// and n threads for formatting.
processing_units.resize(params.max_threads_for_parallel_formatting + 2);
collector_thread = ThreadFromGlobalPool([&] { collectorThreadFunction(); });
LOG_TRACE(&Poco::Logger::get("ParallelFormattingOutputFormat"), "Parallel formatting is being used");
}
~ParallelFormattingOutputFormat() override
{
finishAndWait();
}
String getName() const override { return "ParallelFormattingOutputFormat"; }
void flush() override
{
need_flush = true;
}
void doWritePrefix() override
{
addChunk(Chunk{}, ProcessingUnitType::START, /*can_throw_exception*/ true);
}
void onCancel() override
{
finishAndWait();
}
protected:
void consume(Chunk chunk) override final
{
addChunk(std::move(chunk), ProcessingUnitType::PLAIN, /*can_throw_exception*/ true);
}
void consumeTotals(Chunk totals) override
{
addChunk(std::move(totals), ProcessingUnitType::TOTALS, /*can_throw_exception*/ true);
}
void consumeExtremes(Chunk extremes) override
{
addChunk(std::move(extremes), ProcessingUnitType::EXTREMES, /*can_throw_exception*/ true);
}
void finalize() override;
private:
InternalFormatterCreator internal_formatter_creator;
/// Status to synchronize multiple threads.
enum ProcessingUnitStatus
{
READY_TO_INSERT,
READY_TO_FORMAT,
READY_TO_READ
};
/// Some information about what methods to call from internal parser.
enum class ProcessingUnitType
{
START,
PLAIN,
TOTALS,
EXTREMES,
FINALIZE
};
void addChunk(Chunk chunk, ProcessingUnitType type, bool can_throw_exception);
struct ProcessingUnit
{
std::atomic<ProcessingUnitStatus> status{ProcessingUnitStatus::READY_TO_INSERT};
ProcessingUnitType type{ProcessingUnitType::START};
Chunk chunk;
Memory<> segment;
size_t actual_memory_size{0};
};
Poco::Event collector_finished{};
std::atomic_bool need_flush{false};
// There are multiple "formatters"; that's why we use a thread pool.
ThreadPool pool;
// Collects all temporary buffers into the original WriteBuffer.
ThreadFromGlobalPool collector_thread;
std::exception_ptr background_exception = nullptr;
/// We use a deque, because ProcessingUnit doesn't have a move or copy constructor.
std::deque<ProcessingUnit> processing_units;
std::mutex mutex;
std::atomic_bool emergency_stop{false};
std::atomic_size_t collector_unit_number{0};
std::atomic_size_t writer_unit_number{0};
std::condition_variable collector_condvar;
std::condition_variable writer_condvar;
void finishAndWait();
void onBackgroundException()
{
std::unique_lock<std::mutex> lock(mutex);
if (!background_exception)
{
background_exception = std::current_exception();
}
emergency_stop = true;
writer_condvar.notify_all();
collector_condvar.notify_all();
}
void scheduleFormatterThreadForUnitWithNumber(size_t ticket_number)
{
pool.scheduleOrThrowOnError([this, ticket_number] { formatterThreadFunction(ticket_number); });
}
/// Collects all temporary buffers into main WriteBuffer.
void collectorThreadFunction();
/// This function is executed in ThreadPool and the only purpose of it is to format one Chunk into a continuous buffer in memory.
void formatterThreadFunction(size_t current_unit_number);
};
}

View File

@ -0,0 +1,227 @@
#include <Processors/Formats/Impl/ParallelParsingInputFormat.h>
#include <IO/ReadHelpers.h>
#include <Common/CurrentThread.h>
#include <Common/setThreadName.h>
#include <ext/scope_guard.h>
namespace DB
{
void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
{
SCOPE_EXIT(
if (thread_group)
CurrentThread::detachQueryIfNotDetached();
);
if (thread_group)
CurrentThread::attachTo(thread_group);
setThreadName("Segmentator");
try
{
while (!parsing_finished)
{
const auto segmentator_unit_number = segmentator_ticket_number % processing_units.size();
auto & unit = processing_units[segmentator_unit_number];
{
std::unique_lock<std::mutex> lock(mutex);
segmentator_condvar.wait(lock,
[&]{ return unit.status == READY_TO_INSERT || parsing_finished; });
}
if (parsing_finished)
break;
assert(unit.status == READY_TO_INSERT);
// Segmenting the original input.
unit.segment.resize(0);
auto [have_more_data, currently_read_rows] = file_segmentation_engine(in, unit.segment, min_chunk_bytes);
unit.offset = successfully_read_rows_count;
successfully_read_rows_count += currently_read_rows;
unit.is_last = !have_more_data;
unit.status = READY_TO_PARSE;
scheduleParserThreadForUnitWithNumber(segmentator_ticket_number);
++segmentator_ticket_number;
if (!have_more_data)
break;
}
}
catch (...)
{
onBackgroundException(successfully_read_rows_count);
}
}
void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
{
SCOPE_EXIT(
if (thread_group)
CurrentThread::detachQueryIfNotDetached();
);
if (thread_group)
CurrentThread::attachTo(thread_group);
const auto parser_unit_number = current_ticket_number % processing_units.size();
auto & unit = processing_units[parser_unit_number];
try
{
setThreadName("ChunkParser");
/*
* This is kind of suspicious -- the internal_parser_creator contract with
* respect to multithreaded use is not clear, but we hope that it is
* just a 'normal' factory class that doesn't have any state, and so we
* can use it from multiple threads simultaneously.
*/
ReadBuffer read_buffer(unit.segment.data(), unit.segment.size(), 0);
InputFormatPtr input_format = internal_parser_creator(read_buffer);
input_format->setCurrentUnitNumber(current_ticket_number);
InternalParser parser(input_format);
unit.chunk_ext.chunk.clear();
unit.chunk_ext.block_missing_values.clear();
// We don't know how many blocks there will be, so we have to read them all
// until an empty block occurs.
Chunk chunk;
while (!parsing_finished && (chunk = parser.getChunk()) != Chunk())
{
/// Variable chunk is moved, but it is not really used in the next iteration.
/// NOLINTNEXTLINE(bugprone-use-after-move)
unit.chunk_ext.chunk.emplace_back(std::move(chunk));
unit.chunk_ext.block_missing_values.emplace_back(parser.getMissingValues());
}
// We suppose we will get at least some blocks for a non-empty buffer,
// except at the end of file. Also see a matching assert in readImpl().
assert(unit.is_last || !unit.chunk_ext.chunk.empty() || parsing_finished);
std::lock_guard<std::mutex> lock(mutex);
unit.status = READY_TO_READ;
reader_condvar.notify_all();
}
catch (...)
{
onBackgroundException(unit.offset);
}
}
void ParallelParsingInputFormat::onBackgroundException(size_t offset)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
std::unique_lock<std::mutex> lock(mutex);
if (!background_exception)
{
background_exception = std::current_exception();
if (ParsingException * e = exception_cast<ParsingException *>(background_exception))
if (e->getLineNumber() != -1)
e->setLineNumber(e->getLineNumber() + offset);
}
tryLogCurrentException(__PRETTY_FUNCTION__);
parsing_finished = true;
reader_condvar.notify_all();
segmentator_condvar.notify_all();
}
Chunk ParallelParsingInputFormat::generate()
{
if (isCancelled() || parsing_finished)
{
/**
* Check for background exception and rethrow it before we return.
*/
std::unique_lock<std::mutex> lock(mutex);
if (background_exception)
{
lock.unlock();
onCancel();
std::rethrow_exception(background_exception);
}
return {};
}
const auto inserter_unit_number = reader_ticket_number % processing_units.size();
auto & unit = processing_units[inserter_unit_number];
if (!next_block_in_current_unit.has_value())
{
// We have read out all the Blocks from the previous Processing Unit,
// wait for the current one to become ready.
std::unique_lock<std::mutex> lock(mutex);
reader_condvar.wait(lock, [&](){ return unit.status == READY_TO_READ || parsing_finished; });
if (parsing_finished)
{
/**
* Check for background exception and rethrow it before we return.
*/
if (background_exception)
{
lock.unlock();
cancel();
std::rethrow_exception(background_exception);
}
return {};
}
assert(unit.status == READY_TO_READ);
next_block_in_current_unit = 0;
}
if (unit.chunk_ext.chunk.empty())
{
/*
* Can we get zero blocks for an entire segment, when the format parser
* skips its entire content and does not create any blocks? Probably not,
* but if we ever do, we should add a loop around the above if, to skip
* these. Also see a matching assert in the parser thread.
*/
assert(unit.is_last);
parsing_finished = true;
return {};
}
assert(next_block_in_current_unit.value() < unit.chunk_ext.chunk.size());
Chunk res = std::move(unit.chunk_ext.chunk.at(*next_block_in_current_unit));
last_block_missing_values = std::move(unit.chunk_ext.block_missing_values[*next_block_in_current_unit]);
next_block_in_current_unit.value() += 1;
if (*next_block_in_current_unit == unit.chunk_ext.chunk.size())
{
// We have finished reading this Processing Unit; move to the next one.
next_block_in_current_unit.reset();
++reader_ticket_number;
if (unit.is_last)
{
// If it was the last unit, we are finished.
parsing_finished = true;
}
else
{
// Pass the unit back to the segmentator.
std::unique_lock<std::mutex> lock(mutex);
unit.status = READY_TO_INSERT;
segmentator_condvar.notify_all();
}
}
return res;
}
}

View File

@ -0,0 +1,288 @@
#pragma once
#include <Processors/Formats/IInputFormat.h>
#include <DataStreams/IBlockInputStream.h>
#include <Formats/FormatFactory.h>
#include <Common/CurrentThread.h>
#include <Common/ThreadPool.h>
#include <Common/setThreadName.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadBuffer.h>
#include <Processors/Formats/IRowInputFormat.h>
#include <Interpreters/Context.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class Context;
/**
* ORDER-PRESERVING parallel parsing of data formats.
* It splits the original data into chunks. Then each chunk is parsed by a different thread.
* The number of chunks equals the number of parser threads.
* The size of a chunk is equal to the min_chunk_bytes_for_parallel_parsing setting.
*
* Parsers
* | | | | | | | | | |
* v v v v v v v v v v
* |---|---|---|---|---|---|---|---|---|---|
* | 1 | 2 | 3 | 4 | 5 | . | . | . | . | N | <-- Processing units
* |---|---|---|---|---|---|---|---|---|---|
* ^ ^
* | |
* readImpl Segmentator
*
* This stream has three kinds of threads: one segmentator, multiple parsers,
* and one reader thread -- that is, the one from which readImpl() is called.
* They operate one after another on parts of data called "processing units".
* One unit consists of buffer with raw data from file, filled by segmentator
* thread. This raw data is then parsed by a parser thread to form a number of
* Blocks. These Blocks are returned to the parent stream from readImpl().
* After being read out, a processing unit is reused, to save on allocating
* memory for the raw buffer. The processing units are organized into a circular
* array to facilitate reuse and to apply backpressure on the segmentator thread
* -- after it runs out of processing units, it has to wait for the reader to
* read out the previous blocks.
* The outline of what the threads do is as follows:
* segmentator thread:
* 1) wait for the next processing unit to become empty
* 2) fill it with a part of input file
* 3) start a parser thread
* 4) repeat until eof
* parser thread:
* 1) parse the given raw buffer without any synchronization
* 2) signal that the given unit is ready to read
* 3) finish
* readImpl():
* 1) wait for the next processing unit to become ready to read
* 2) take the blocks from the processing unit to return them to the caller
* 3) signal that the processing unit is empty
* 4) repeat until it encounters a unit that is marked as "past_the_end"
* All threads must also check for cancel/eof/exception flags.
*/
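To make the segmentator's contract concrete, here is a hypothetical file_segmentation_engine sketch for a newline-delimited format (simplified to std::istream and std::string instead of ReadBuffer and Memory<>): it must return {have_more_data, rows_read} and never split a row across segments.

#include <istream>
#include <string>
#include <utility>

static std::pair<bool, size_t> segmentByLines(std::istream & in, std::string & segment, size_t min_chunk_bytes)
{
    size_t rows = 0;
    std::string line;
    while (segment.size() < min_chunk_bytes && std::getline(in, line))
    {
        segment += line;
        segment += '\n'; /// whole rows only: a row never spans two segments
        ++rows;
    }
    return {static_cast<bool>(in), rows}; /// first element is false once the input is exhausted
}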
class ParallelParsingInputFormat : public IInputFormat
{
public:
/* Used to recreate parser on every new data piece.*/
using InternalParserCreator = std::function<InputFormatPtr(ReadBuffer & buf)>;
struct Params
{
ReadBuffer & in;
Block header;
InternalParserCreator internal_parser_creator;
FormatFactory::FileSegmentationEngine file_segmentation_engine;
String format_name;
size_t max_threads;
size_t min_chunk_bytes;
};
explicit ParallelParsingInputFormat(Params params)
: IInputFormat(std::move(params.header), params.in)
, internal_parser_creator(params.internal_parser_creator)
, file_segmentation_engine(params.file_segmentation_engine)
, format_name(params.format_name)
, min_chunk_bytes(params.min_chunk_bytes)
, pool(params.max_threads)
{
// One unit for each thread, including segmentator and reader, plus a
// couple more units so that the segmentation thread doesn't spuriously
// bump into reader thread on wraparound.
processing_units.resize(params.max_threads + 2);
segmentator_thread = ThreadFromGlobalPool(
&ParallelParsingInputFormat::segmentatorThreadFunction, this, CurrentThread::getGroup());
}
~ParallelParsingInputFormat() override
{
finishAndWait();
}
void resetParser() override final
{
throw Exception("resetParser() is not allowed for " + getName(), ErrorCodes::LOGICAL_ERROR);
}
const BlockMissingValues & getMissingValues() const override final
{
return last_block_missing_values;
}
String getName() const override final { return "ParallelParsingBlockInputFormat"; }
protected:
Chunk generate() override final;
void onCancel() override final
{
/*
* The format parsers themselves are not being cancelled here, so we'll
* have to wait until they process the current block. Given that the
* chunk size is on the order of megabytes, this shouldn't be too long.
* We can't call IInputFormat->cancel here, because the parser object is
* local to the parser thread, and we don't want to introduce any
* synchronization between parser threads and the other threads to get
* better performance. An ideal solution would be to add a callback to
* IInputFormat that checks whether it was cancelled.
*/
finishAndWait();
}
private:
class InternalParser
{
public:
explicit InternalParser(const InputFormatPtr & input_format_)
: input_format(input_format_)
, port(input_format->getPort().getHeader(), input_format.get())
{
connect(input_format->getPort(), port);
port.setNeeded();
}
Chunk getChunk()
{
while (true)
{
IProcessor::Status status = input_format->prepare();
switch (status)
{
case IProcessor::Status::Ready:
input_format->work();
break;
case IProcessor::Status::Finished:
return {};
case IProcessor::Status::PortFull:
return port.pull();
case IProcessor::Status::NeedData: break;
case IProcessor::Status::Async: break;
case IProcessor::Status::ExpandPipeline:
throw Exception("One of the parsers returned status " + IProcessor::statusToName(status) +
" during parallel parsing", ErrorCodes::LOGICAL_ERROR);
}
}
}
const BlockMissingValues & getMissingValues() const { return input_format->getMissingValues(); }
private:
const InputFormatPtr & input_format;
InputPort port;
};
const InternalParserCreator internal_parser_creator;
/// Function to segment the file. The "parsers" will then parse those segments.
FormatFactory::FileSegmentationEngine file_segmentation_engine;
const String format_name;
const size_t min_chunk_bytes;
BlockMissingValues last_block_missing_values;
/// Non-atomic because it is used in one thread.
std::optional<size_t> next_block_in_current_unit;
size_t segmentator_ticket_number{0};
size_t reader_ticket_number{0};
std::mutex mutex;
std::condition_variable reader_condvar;
std::condition_variable segmentator_condvar;
std::atomic<bool> parsing_finished{false};
/// There are multiple "parsers"; that's why we use a thread pool.
ThreadPool pool;
/// Reads and segments the file.
ThreadFromGlobalPool segmentator_thread;
enum ProcessingUnitStatus
{
READY_TO_INSERT,
READY_TO_PARSE,
READY_TO_READ
};
struct ChunkExt
{
std::vector<Chunk> chunk;
std::vector<BlockMissingValues> block_missing_values;
};
struct ProcessingUnit
{
explicit ProcessingUnit()
: status(ProcessingUnitStatus::READY_TO_INSERT)
{
}
ChunkExt chunk_ext;
Memory<> segment;
std::atomic<ProcessingUnitStatus> status;
/// Needed for a better exception message.
size_t offset = 0;
bool is_last{false};
};
std::exception_ptr background_exception = nullptr;
/// We use deque instead of vector, because it does not require a move
/// constructor, which is absent for atomics that are inside ProcessingUnit.
std::deque<ProcessingUnit> processing_units;
/// Compute it to have a more understandable error message.
size_t successfully_read_rows_count{0};
void scheduleParserThreadForUnitWithNumber(size_t ticket_number)
{
pool.scheduleOrThrowOnError([this, ticket_number, group = CurrentThread::getGroup()]()
{
parserThreadFunction(group, ticket_number);
});
}
void finishAndWait()
{
parsing_finished = true;
{
std::unique_lock<std::mutex> lock(mutex);
segmentator_condvar.notify_all();
reader_condvar.notify_all();
}
if (segmentator_thread.joinable())
segmentator_thread.join();
try
{
pool.wait();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
void segmentatorThreadFunction(ThreadGroupStatusPtr thread_group);
void parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number);
/// Save/log a background exception, set termination flag, wake up all
/// threads. This function is used by the segmentator and parser threads.
/// readImpl() is called from the main thread, so the exception handling
/// is different.
void onBackgroundException(size_t offset);
};
}

View File

@ -49,6 +49,7 @@ void registerOutputFormatProcessorTSKV(FormatFactory & factory)
{
return std::make_shared<TSKVRowOutputFormat>(buf, sample, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting("TSKV");
}
}

View File

@ -85,6 +85,7 @@ void registerOutputFormatProcessorTabSeparated(FormatFactory & factory)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, false, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
for (const auto * name : {"TabSeparatedRaw", "TSVRaw"})
@ -97,6 +98,7 @@ void registerOutputFormatProcessorTabSeparated(FormatFactory & factory)
{
return std::make_shared<TabSeparatedRawRowOutputFormat>(buf, sample, false, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
for (const auto * name : {"TabSeparatedWithNames", "TSVWithNames"})
@ -109,6 +111,7 @@ void registerOutputFormatProcessorTabSeparated(FormatFactory & factory)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, true, false, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
for (const auto * name : {"TabSeparatedWithNamesAndTypes", "TSVWithNamesAndTypes"})
@ -121,6 +124,7 @@ void registerOutputFormatProcessorTabSeparated(FormatFactory & factory)
{
return std::make_shared<TabSeparatedRowOutputFormat>(buf, sample, true, true, params, settings);
});
factory.markOutputFormatSupportsParallelFormatting(name);
}
}

View File

@ -15,24 +15,24 @@ Chunk LazyOutputFormat::getChunk(UInt64 milliseconds)
return {};
}
Chunk chunk;
if (!queue.tryPop(chunk, milliseconds))
Port::Data data;
if (!queue.tryPop(data, milliseconds))
return {};
if (chunk)
info.update(chunk.getNumRows(), chunk.allocatedBytes());
if (!data.exception)
info.update(data.chunk.getNumRows(), data.chunk.allocatedBytes());
return chunk;
return data.getChunkOrThrow();
}
Chunk LazyOutputFormat::getTotals()
{
return std::move(totals);
return totals.getChunkOrThrow();
}
Chunk LazyOutputFormat::getExtremes()
{
return std::move(extremes);
return extremes.getChunkOrThrow();
}
void LazyOutputFormat::setRowsBeforeLimit(size_t rows_before_limit)

View File

@ -37,28 +37,28 @@ public:
}
protected:
void consume(Chunk chunk) override
void consume(Port::Data data) override
{
if (!finished_processing)
queue.emplace(std::move(chunk));
queue.emplace(std::move(data));
}
void consumeTotals(Chunk chunk) override { totals = std::move(chunk); }
void consumeExtremes(Chunk chunk) override { extremes = std::move(chunk); }
void consumeTotals(Port::Data data) override { totals = std::move(data); }
void consumeExtremes(Port::Data data) override { extremes = std::move(data); }
void finalize() override
{
finished_processing = true;
/// In case we are waiting for result.
queue.emplace(Chunk());
queue.emplace(Port::Data{});
}
private:
ConcurrentBoundedQueue<Chunk> queue;
Chunk totals;
Chunk extremes;
ConcurrentBoundedQueue<Port::Data> queue;
Port::Data totals;
Port::Data extremes;
/// Is not used.
static WriteBuffer out;

View File

@ -4,9 +4,11 @@
#include <Columns/ColumnAggregateFunction.h>
#include <Columns/ColumnTuple.h>
#include <Common/AlignedBuffer.h>
#include <Common/Arena.h>
#include <Common/FieldVisitors.h>
#include <Common/StringUtils/StringUtils.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeCustomSimpleAggregateFunction.h>
#include <DataTypes/NestedUtils.h>
#include <IO/WriteHelpers.h>
@ -44,13 +46,20 @@ struct SummingSortedAlgorithm::AggregateDescription
/// In case when column has type AggregateFunction:
/// use the aggregate function from itself instead of 'function' above.
bool is_agg_func_type = false;
bool is_simple_agg_func_type = false;
void init(const char * function_name, const DataTypes & argument_types)
{
AggregateFunctionProperties properties;
function = AggregateFunctionFactory::instance().get(function_name, argument_types, {}, properties);
init(AggregateFunctionFactory::instance().get(function_name, argument_types, {}, properties));
}
void init(AggregateFunctionPtr function_, bool is_simple_agg_func_type_ = false)
{
function = std::move(function_);
add_function = function->getAddressOfAddFunction();
state.reset(function->sizeOfData(), function->alignOfData());
is_simple_agg_func_type = is_simple_agg_func_type_;
}
void createState()
@ -206,8 +215,9 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns(
{
const ColumnWithTypeAndName & column = header.safeGetByPosition(i);
const auto * simple = dynamic_cast<const DataTypeCustomSimpleAggregateFunction *>(column.type->getCustomName());
/// Discover nested Maps and find columns for summation
if (typeid_cast<const DataTypeArray *>(column.type.get()))
if (typeid_cast<const DataTypeArray *>(column.type.get()) && !simple)
{
const auto map_name = Nested::extractTableName(column.name);
/// if nested table name ends with `Map` it is a possible candidate for special handling
@ -224,7 +234,7 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns(
bool is_agg_func = WhichDataType(column.type).isAggregateFunction();
/// There are special const columns for example after prewhere sections.
if ((!column.type->isSummable() && !is_agg_func) || isColumnConst(*column.column))
if ((!column.type->isSummable() && !is_agg_func && !simple) || isColumnConst(*column.column))
{
def.column_numbers_not_to_aggregate.push_back(i);
continue;
@ -246,7 +256,14 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns(
desc.is_agg_func_type = is_agg_func;
desc.column_numbers = {i};
if (!is_agg_func)
if (simple)
{
// simple aggregate function
desc.init(simple->getFunction(), true);
if (desc.function->allocatesMemoryInArena())
def.allocates_memory_in_arena = true;
}
else if (!is_agg_func)
{
desc.init("sumWithOverflow", {column.type});
}
@ -354,7 +371,7 @@ static MutableColumns getMergedDataColumns(
for (const auto & desc : def.columns_to_aggregate)
{
// Wrap aggregated columns in a tuple to match function signature
if (!desc.is_agg_func_type && isTuple(desc.function->getReturnType()))
if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getReturnType()))
{
size_t tuple_size = desc.column_numbers.size();
MutableColumns tuple_columns(tuple_size);
@ -399,7 +416,7 @@ static void postprocessChunk(
auto column = std::move(columns[next_column]);
++next_column;
if (!desc.is_agg_func_type && isTuple(desc.function->getReturnType()))
if (!desc.is_agg_func_type && !desc.is_simple_agg_func_type && isTuple(desc.function->getReturnType()))
{
/// Unpack tuple into block.
size_t tuple_size = desc.column_numbers.size();
@ -455,6 +472,13 @@ SummingSortedAlgorithm::SummingMergedData::SummingMergedData(
{
current_row.resize(def.column_names.size());
initAggregateDescription();
/// Just to make startGroup() simpler.
if (def.allocates_memory_in_arena)
{
arena = std::make_unique<Arena>();
arena_size = arena->size();
}
}
void SummingSortedAlgorithm::SummingMergedData::startGroup(ColumnRawPtrs & raw_columns, size_t row)
@ -467,6 +491,12 @@ void SummingSortedAlgorithm::SummingMergedData::startGroup(ColumnRawPtrs & raw_c
for (auto & desc : def.columns_to_aggregate)
desc.createState();
if (def.allocates_memory_in_arena && arena->size() > arena_size)
{
arena = std::make_unique<Arena>();
arena_size = arena->size();
}
if (def.maps_to_sum.empty())
{
/// We have only columns_to_aggregate. The status of current row will be determined
@ -505,10 +535,10 @@ void SummingSortedAlgorithm::SummingMergedData::finishGroup()
{
try
{
desc.function->insertResultInto(desc.state.data(), *desc.merged_column, nullptr);
desc.function->insertResultInto(desc.state.data(), *desc.merged_column, arena.get());
/// Update zero status of current row
if (desc.column_numbers.size() == 1)
if (!desc.is_simple_agg_func_type && desc.column_numbers.size() == 1)
{
// Flag row as non-empty if at least one column number if non-zero
current_row_is_zero = current_row_is_zero
@ -586,7 +616,7 @@ void SummingSortedAlgorithm::SummingMergedData::addRowImpl(ColumnRawPtrs & raw_c
if (desc.column_numbers.size() == 1)
{
auto & col = raw_columns[desc.column_numbers[0]];
desc.add_function(desc.function.get(), desc.state.data(), &col, row, nullptr);
desc.add_function(desc.function.get(), desc.state.data(), &col, row, arena.get());
}
else
{
@ -595,7 +625,7 @@ void SummingSortedAlgorithm::SummingMergedData::addRowImpl(ColumnRawPtrs & raw_c
for (size_t i = 0; i < desc.column_numbers.size(); ++i)
column_ptrs[i] = raw_columns[desc.column_numbers[i]];
desc.add_function(desc.function.get(), desc.state.data(), column_ptrs.data(), row, nullptr);
desc.add_function(desc.function.get(), desc.state.data(), column_ptrs.data(), row, arena.get());
}
}
}
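The startGroup() logic above bounds arena growth: if the previous group made the arena allocate extra chunks, the next group starts with a fresh one. A minimal sketch of the pattern (ArenaLike is a hypothetical stand-in for Common/Arena):

#include <cstddef>
#include <memory>

struct ArenaLike { size_t allocated = 4096; size_t size() const { return allocated; } };

struct MergedDataSketch
{
    std::unique_ptr<ArenaLike> arena = std::make_unique<ArenaLike>();
    size_t arena_size = arena->size();

    void startGroup()
    {
        if (arena->size() > arena_size) /// the previous group allocated extra memory
        {
            arena = std::make_unique<ArenaLike>(); /// recycle: start from a fresh arena
            arena_size = arena->size();
        }
    }
};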

View File

@ -50,6 +50,9 @@ public:
/// Names of columns from header.
Names column_names;
/// Does the SimpleAggregateFunction allocate memory in an arena?
bool allocates_memory_in_arena = false;
};
/// Specialization for SummingSortedTransform. Inserts only data for non-aggregated columns.
@ -73,6 +76,11 @@ public:
private:
ColumnsDefinition & def;
/// Memory pool for SimpleAggregateFunction
/// (only when allocates_memory_in_arena == true).
std::unique_ptr<Arena> arena;
size_t arena_size = 0;
bool is_group_started = false;
Row current_row;

View File

@ -60,6 +60,14 @@ protected:
/// Note: std::variant could be used, but its move constructor can't be inlined.
Chunk chunk;
std::exception_ptr exception;
Chunk getChunkOrThrow()
{
if (exception)
std::rethrow_exception(std::move(exception));
return std::move(chunk);
}
};
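The same pattern in miniature: an exception captured on the producer side travels with the payload and is rethrown exactly once when the consumer unwraps it (a hypothetical generic sketch, not the Port API itself):

#include <exception>
#include <utility>

template <typename Payload>
struct DataOrException
{
    Payload payload;
    std::exception_ptr exception;

    Payload getOrThrow()
    {
        if (exception)
            std::rethrow_exception(std::move(exception)); /// rethrow on the consumer side
        return std::move(payload);
    }
};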
private:
@ -303,12 +311,7 @@ public:
Chunk ALWAYS_INLINE pull(bool set_not_needed = false)
{
auto data_ = pullData(set_not_needed);
if (data_.exception)
std::rethrow_exception(data_.exception);
return std::move(data_.chunk);
return pullData(set_not_needed).getChunkOrThrow();
}
bool ALWAYS_INLINE isFinished() const

View File

@ -43,9 +43,10 @@ SRCS(
Formats/Impl/MsgPackRowInputFormat.cpp
Formats/Impl/MsgPackRowOutputFormat.cpp
Formats/Impl/MySQLOutputFormat.cpp
Formats/Impl/NativeFormat.cpp
Formats/Impl/NullFormat.cpp
Formats/Impl/ODBCDriver2BlockOutputFormat.cpp
Formats/Impl/ParallelFormattingOutputFormat.cpp
Formats/Impl/ParallelParsingInputFormat.cpp
Formats/Impl/PostgreSQLOutputFormat.cpp
Formats/Impl/PrettyBlockOutputFormat.cpp
Formats/Impl/PrettyCompactBlockOutputFormat.cpp

Some files were not shown because too many files have changed in this diff.