diff --git a/.gitignore b/.gitignore
index 1e9765dca9e..d33dbf0600d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,3 +137,9 @@ website/package-lock.json
 /prof
 *.iml
+
+# data store
+/programs/server/data
+/programs/server/metadata
+/programs/server/store
+
diff --git a/.gitmodules b/.gitmodules
index 519ba082304..7a2c5600e65 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -220,4 +220,4 @@
 	url = https://github.com/ClickHouse-Extras/boringssl.git
 [submodule "contrib/NuRaft"]
 	path = contrib/NuRaft
-	url = https://github.com/eBay/NuRaft.git
+	url = https://github.com/ClickHouse-Extras/NuRaft.git
diff --git a/README.md b/README.md
index 1c6a021c00c..3329a98877f 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,3 @@ ClickHouse® is an open-source column-oriented database management system that a
 * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation.
 * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any.
 * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.
-
-## Upcoming Events
-* [Chinese ClickHouse Meetup (online)](http://hdxu.cn/8KxZE) on 6 February 2021.
diff --git a/base/common/arithmeticOverflow.h b/base/common/arithmeticOverflow.h
index 8df037a14af..fd557fd5b2d 100644
--- a/base/common/arithmeticOverflow.h
+++ b/base/common/arithmeticOverflow.h
@@ -1,6 +1,8 @@
 #pragma once
 
 #include <common/types.h>
+#include <common/defines.h>
+
 
 namespace common
 {
@@ -156,4 +158,11 @@ namespace common
             return false;
         return (x * y) / y != x;
     }
+
+    /// Multiply and ignore overflow.
+    template <typename T1, typename T2>
+    inline auto NO_SANITIZE_UNDEFINED mulIgnoreOverflow(T1 x, T2 y)
+    {
+        return x * y;
+    }
 }
diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp
index 18449dad855..db7019d3572 100644
--- a/base/daemon/BaseDaemon.cpp
+++ b/base/daemon/BaseDaemon.cpp
@@ -152,7 +152,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context)
     if (sig != SIGTSTP) /// This signal is used for debugging.
     {
         /// The time that is usually enough for separate thread to print info into log.
-        sleepForSeconds(10);
+        sleepForSeconds(20);  /// FIXME: use some feedback from threads that process stacktrace
         call_default_signal_handler(sig);
     }
@@ -311,7 +311,8 @@ private:
         if (stack_trace.getSize())
         {
             /// Write bare stack trace (addresses) just in case if we will fail to print symbolized stack trace.
-            /// NOTE This still require memory allocations and mutex lock inside logger. BTW we can also print it to stderr using write syscalls.
+            /// NOTE: This still require memory allocations and mutex lock inside logger.
+            /// BTW we can also print it to stderr using write syscalls.
 
             std::stringstream bare_stacktrace;
             bare_stacktrace << "Stack trace:";
@@ -324,7 +325,7 @@ private:
         /// Write symbolized stack trace line by line for better grep-ability.
         stack_trace.toStringEveryLine([&](const std::string & s) { LOG_FATAL(log, s); });
 
-#if defined(__linux__)
+#if defined(OS_LINUX)
         /// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace.
         String calculated_binary_hash = getHashOfLoadedBinaryHex();
         if (daemon.stored_binary_hash.empty())
@@ -561,6 +562,7 @@ void debugIncreaseOOMScore()
     {
         DB::WriteBufferFromFile buf("/proc/self/oom_score_adj");
         buf.write(new_score.c_str(), new_score.size());
+        buf.close();
     }
     catch (const Poco::Exception & e)
     {
@@ -783,7 +785,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
     /// Setup signal handlers.
     /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime.
-    addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
+    addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, &handled_signals);
     addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
     addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);
diff --git a/cmake/find/nuraft.cmake b/cmake/find/nuraft.cmake
index d31fe9c1de8..7fa5251946e 100644
--- a/cmake/find/nuraft.cmake
+++ b/cmake/find/nuraft.cmake
@@ -11,7 +11,7 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/NuRaft/CMakeLists.txt")
     return()
 endif ()
 
-if (NOT OS_FREEBSD)
+if (NOT OS_FREEBSD AND NOT OS_DARWIN)
     set (USE_NURAFT 1)
     set (NURAFT_LIBRARY nuraft)
 
@@ -20,5 +20,5 @@ if (NOT OS_FREEBSD)
     message (STATUS "Using NuRaft=${USE_NURAFT}: ${NURAFT_INCLUDE_DIR} : ${NURAFT_LIBRARY}")
 else()
     set (USE_NURAFT 0)
-    message (STATUS "Using internal NuRaft library on FreeBSD is not supported")
+    message (STATUS "Using internal NuRaft library on FreeBSD and Darwin is not supported")
 endif()
diff --git a/contrib/NuRaft b/contrib/NuRaft
index 410bd149da8..7adf7ae33e7 160000
--- a/contrib/NuRaft
+++ b/contrib/NuRaft
@@ -1 +1 @@
-Subproject commit 410bd149da84cdde60b4436b02b738749f4e87e1
+Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793
diff --git a/contrib/boost b/contrib/boost
index 8e259cd2a6b..48f40ebb539 160000
--- a/contrib/boost
+++ b/contrib/boost
@@ -1 +1 @@
-Subproject commit 8e259cd2a6b60d75dd17e73432f11bb7b9351bb1
+Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79
diff --git a/contrib/nuraft-cmake/CMakeLists.txt b/contrib/nuraft-cmake/CMakeLists.txt
index e5bb7f7d11b..83137fe73bf 100644
--- a/contrib/nuraft-cmake/CMakeLists.txt
+++ b/contrib/nuraft-cmake/CMakeLists.txt
@@ -30,7 +30,12 @@ set(SRCS
 
 add_library(nuraft ${SRCS})
 
-target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
+
+if (NOT OPENSSL_SSL_LIBRARY OR NOT OPENSSL_CRYPTO_LIBRARY)
+    target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1 SSL_LIBRARY_NOT_FOUND=1)
+else()
+    target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
+endif()
 
 target_include_directories (nuraft SYSTEM PRIVATE ${LIBRARY_DIR}/include/libnuraft)
 # for some reason include "asio.h" directly without "boost/" prefix.
diff --git a/docker/server/README.md b/docker/server/README.md
index d8e9204dffa..6f799d68185 100644
--- a/docker/server/README.md
+++ b/docker/server/README.md
@@ -56,7 +56,7 @@ $ echo 'SELECT version()' | curl 'http://localhost:8123/' --data-binary @-
 20.12.3.3
 ```
 
-### Volumes 
+### Volumes
 
 Typically you may want to mount the following folders inside your container to achieve persistence:
 
@@ -76,7 +76,7 @@ You may also want to mount:
 * `/etc/clickhouse-server/users.d/*.xml` - files with user settings adjustments
 * `/docker-entrypoint-initdb.d/` - folder with database initialization scripts (see below).
 
-### Linux capabilities 
+### Linux capabilities
 
 ClickHouse has some advanced functionality which requires enabling several [linux capabilities](https://man7.org/linux/man-pages/man7/capabilities.7.html).
@@ -113,10 +113,10 @@ $ docker run --rm -e CLICKHOUSE_UID=0 -e CLICKHOUSE_GID=0 --name clickhouse-serv
 
 ### How to create default database and user on starting
 
-Sometimes you may want to create user (user named `default` is used by default) and database on image starting. You can do it using environment variables `CLICKHOUSE_DB`, `CLICKHOUSE_USER` and `CLICKHOUSE_PASSWORD`:
+Sometimes you may want to create a user (the user named `default` is used by default) and a database when the image starts. You can do it using the environment variables `CLICKHOUSE_DB`, `CLICKHOUSE_USER`, `CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT` and `CLICKHOUSE_PASSWORD`:
 
 ```
-$ docker run --rm -e CLICKHOUSE_DB=my_database -e CLICKHOUSE_USER=username -e CLICKHOUSE_PASSWORD=password -p 9000:9000/tcp yandex/clickhouse-server
+$ docker run --rm -e CLICKHOUSE_DB=my_database -e CLICKHOUSE_USER=username -e CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 -e CLICKHOUSE_PASSWORD=password -p 9000:9000/tcp yandex/clickhouse-server
 ```
 
 ## How to extend this image
diff --git a/docker/server/alpine-build.sh b/docker/server/alpine-build.sh
index 0142149b5bd..329888f2fcb 100755
--- a/docker/server/alpine-build.sh
+++ b/docker/server/alpine-build.sh
@@ -54,8 +54,10 @@ docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libm.so.6      "${CONTAIN
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libpthread.so.0 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/librt.so.1      "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_dns.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
+docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_files.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libresolv.so.2  "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib64/ld-linux-x86-64.so.2           "${CONTAINER_ROOT_FOLDER}/lib64"
+docker cp -L "${ubuntu20image}":/etc/nsswitch.conf                    "${CONTAINER_ROOT_FOLDER}/etc"
 
 docker build "$DOCKER_BUILD_FOLDER" -f Dockerfile.alpine -t "${DOCKER_IMAGE}:${VERSION}-alpine" --pull
 rm -rf "$CONTAINER_ROOT_FOLDER"
diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh
index 549ff601c59..0138a165505 100755
--- a/docker/server/entrypoint.sh
+++ b/docker/server/entrypoint.sh
@@ -54,6 +54,7 @@ FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_
 CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}"
 CLICKHOUSE_PASSWORD="${CLICKHOUSE_PASSWORD:-}"
 CLICKHOUSE_DB="${CLICKHOUSE_DB:-}"
+CLICKHOUSE_ACCESS_MANAGEMENT="${CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT:-0}"
 
 for dir in "$DATA_DIR" \
   "$ERROR_LOG_DIR" \
@@ -97,6 +98,7 @@ if [ -n "$CLICKHOUSE_USER" ] && [ "$CLICKHOUSE_USER" != "default" ] || [ -n "$CL
             <password>${CLICKHOUSE_PASSWORD}</password>
             <quota>default</quota>
+            <access_management>${CLICKHOUSE_ACCESS_MANAGEMENT}</access_management>
         </${CLICKHOUSE_USER}>
     </users>
 </yandex>
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index a8aa852ca5b..e6294b5d74d 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -163,6 +163,7 @@ function clone_submodules
         contrib/xz
         contrib/dragonbox
         contrib/fast_float
+        contrib/NuRaft
     )
 
     git submodule sync
@@ -182,6 +183,7 @@ function run_cmake
         "-DENABLE_EMBEDDED_COMPILER=0"
         "-DENABLE_THINLTO=0"
         "-DUSE_UNWIND=1"
+        "-DENABLE_NURAFT=1"
     )
 
     # TODO remove this? we don't use ccache anyway. An option would be to download it
diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh
index 9da2f3d3ada..88a633ac488 100755
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@@ -64,7 +64,7 @@ clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
 clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
 clickhouse-client --query "SHOW TABLES FROM test"
 
-./stress --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION"
+./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" && echo "OK" > /test_output/script_exit_code.txt || echo "FAIL" > /test_output/script_exit_code.txt
 
 stop
 start
diff --git a/docker/test/stress/stress b/docker/test/stress/stress
index 458f78fcdb4..d2ec86b4421 100755
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 from multiprocessing import cpu_count
-from subprocess import Popen, check_call
+from subprocess import Popen, call, STDOUT
 import os
+import sys
 import shutil
 import argparse
 import logging
@@ -64,7 +65,8 @@ if __name__ == "__main__":
     parser.add_argument("--server-log-folder", default='/var/log/clickhouse-server')
     parser.add_argument("--output-folder")
     parser.add_argument("--global-time-limit", type=int, default=3600)
-    parser.add_argument("--num-parallel", default=cpu_count());
+    parser.add_argument("--num-parallel", default=cpu_count())
+    parser.add_argument('--hung-check', action='store_true', default=False)
 
     args = parser.parse_args()
     func_pipes = []
@@ -81,4 +83,13 @@ if __name__ == "__main__":
         logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes))
         time.sleep(5)
 
+    logging.info("All processes finished")
+    if args.hung_check:
+        logging.info("Checking if some queries hung")
+        cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1")
+        res = call(cmd, shell=True, stderr=STDOUT)
+        if res != 0:
+            logging.info("Hung check failed with exit code {}".format(res))
+            sys.exit(1)
+
     logging.info("Stress test finished")
diff --git a/docs/_description_templates/template-data-type.md b/docs/_description_templates/template-data-type.md
new file mode 100644
index 00000000000..edb6586ee7d
--- /dev/null
+++ b/docs/_description_templates/template-data-type.md
@@ -0,0 +1,29 @@
+---
+toc_priority:
+toc_title:
+---
+
+# data_type_name {#data_type-name}
+
+Description.
+
+**Parameters** (Optional)
+
+- `x` — Description. [Type name](relative/path/to/type/dscr.md#type).
+- `y` — Description. [Type name](relative/path/to/type/dscr.md#type).
+
+**Examples**
+
+```sql
+
+```
+
+## Additional Info {#additional-info} (Optional)
+
+The name of an additional section can be any, for example, **Usage**.
+
+**See Also** (Optional)
+
+- [link](#)
+
+[Original article](https://clickhouse.tech/docs/en/data_types//)
diff --git a/docs/_description_templates/template-function.md b/docs/_description_templates/template-function.md
index b69d7ed5309..a0074a76ef6 100644
--- a/docs/_description_templates/template-function.md
+++ b/docs/_description_templates/template-function.md
@@ -12,16 +12,20 @@ Alias: ``. (Optional)
 
 More text (Optional).
 
-**Parameters** (Optional)
+**Arguments** (Optional)
 
 - `x` — Description. [Type name](relative/path/to/type/dscr.md#type).
 - `y` — Description. [Type name](relative/path/to/type/dscr.md#type).
 
+**Parameters** (Optional, only for parametric aggregate functions)
+
+- `z` — Description. [Type name](relative/path/to/type/dscr.md#type).
+
 **Returned value(s)**
 
-- Returned values list. 
+- Returned values list.
 
-Type: [Type](relative/path/to/type/dscr.md#type).
+Type: [Type name](relative/path/to/type/dscr.md#type).
 
 **Example**
diff --git a/docs/_description_templates/template-system-table.md b/docs/_description_templates/template-system-table.md
index 3fdf9788d79..f2decc4bb6d 100644
--- a/docs/_description_templates/template-system-table.md
+++ b/docs/_description_templates/template-system-table.md
@@ -8,10 +8,14 @@ Columns:
 
 **Example**
 
+Query:
+
 ``` sql
 SELECT * FROM system.table_name
 ```
 
+Result:
+
 ``` text
 Some output. It shouldn't be too long.
 ```
diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md
index b0901ee6f6e..c73876fdebe 100644
--- a/docs/en/engines/table-engines/integrations/rabbitmq.md
+++ b/docs/en/engines/table-engines/integrations/rabbitmq.md
@@ -59,10 +59,11 @@ Optional parameters:
 - `rabbitmq_max_block_size`
 - `rabbitmq_flush_interval_ms`
 
-Required configuration: The RabbitMQ server configuration should be added using the ClickHouse config file.
+Required configuration:
+
+The RabbitMQ server configuration should be added using the ClickHouse config file.
 
 ``` xml
 <rabbitmq>
    <username>root</username>
@@ -70,6 +71,14 @@ The RabbitMQ server configuration should be added using the ClickHouse config fi
 </rabbitmq>
 ```
 
+Additional configuration:
+
+``` xml
+<rabbitmq>
+    <vhost>clickhouse</vhost>
+</rabbitmq>
+```
+
 Example:
 
 ``` sql
diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md
index d8cceb4d511..5858a0803e6 100644
--- a/docs/en/engines/table-engines/integrations/s3.md
+++ b/docs/en/engines/table-engines/integrations/s3.md
@@ -136,8 +136,7 @@ The following settings can be specified in configuration file for given endpoint
 - `access_key_id` and `secret_access_key` — Optional. Specifies credentials to use with given endpoint.
 - `use_environment_credentials` — Optional, default value is `false`. If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint.
 - `header` — Optional, can be specified multiple times. Adds specified HTTP header to a request to given endpoint.
-
-This configuration also applies to S3 disks in `MergeTree` table engine family.
+- `server_side_encryption_customer_key_base64` — Optional. If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
 
 Example:
 
@@ -149,6 +148,7 @@ Example:
         <!-- <use_environment_credentials>false</use_environment_credentials> -->
         <!-- <header>Authorization: Bearer SOME-TOKEN</header> -->
+        <!-- <server_side_encryption_customer_key_base64>BASE64-ENCODED-KEY</server_side_encryption_customer_key_base64> -->
     </endpoint-name>
 </s3>
 ```
diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md
index c69da4197b8..753859b46d2 100644
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@@ -715,6 +715,7 @@ Configuration markup:
             <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
             <access_key_id>your_access_key_id</access_key_id>
             <secret_access_key>your_secret_access_key</secret_access_key>
+            <server_side_encryption_customer_key_base64>your_base64_encoded_customer_key</server_side_encryption_customer_key_base64>
             <proxy>
                 <uri>http://proxy1</uri>
                 <uri>http://proxy2</uri>
@@ -750,7 +751,8 @@ Optional parameters:
 - `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
 - `cache_enabled` — Allows to cache mark and index files on local FS. Default value is `true`.
 - `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`.
-- `skip_access_check` — If true disk access checks will not be performed on disk start-up. Default value is `false`.
+- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
+- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
 
 S3 disk can be configured as `main` or `cold` storage:
diff --git a/docs/en/faq/operations/delete-old-data.md b/docs/en/faq/operations/delete-old-data.md
index 5addc455602..fdf1f1f290e 100644
--- a/docs/en/faq/operations/delete-old-data.md
+++ b/docs/en/faq/operations/delete-old-data.md
@@ -39,4 +39,4 @@ More details on [manipulating partitions](../../sql-reference/statements/alter/p
 
 It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need.
 
-More details on [table truncation](../../sql-reference/statements/alter/partition.md#alter_drop-partition).
+More details on [table truncation](../../sql-reference/statements/truncate.md).
diff --git a/docs/en/getting-started/tutorial.md b/docs/en/getting-started/tutorial.md
index 64363c963c5..fe697972dff 100644
--- a/docs/en/getting-started/tutorial.md
+++ b/docs/en/getting-started/tutorial.md
@@ -644,7 +644,7 @@ If there are no replicas at the moment on replicated table creation, a new first
 
 ``` sql
 CREATE TABLE tutorial.hits_replica (...)
-ENGINE = ReplcatedMergeTree(
+ENGINE = ReplicatedMergeTree(
     '/clickhouse_perftest/tables/{shard}/hits',
     '{replica}'
 )
diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index 11291d61300..33bf90a8b52 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -31,8 +31,8 @@ The supported formats are:
 | [JSONCompactString](#jsoncompactstring)                                     | ✗ | ✔ |
 | [JSONEachRow](#jsoneachrow)                                                 | ✔ | ✔ |
 | [JSONEachRowWithProgress](#jsoneachrowwithprogress)                         | ✗ | ✔ |
-| [JSONStringEachRow](#jsonstringeachrow)                                     | ✔ | ✔ |
-| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress)             | ✗ | ✔ |
+| [JSONStringsEachRow](#jsonstringseachrow)                                   | ✔ | ✔ |
+| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress)           | ✗ | ✔ |
 | [JSONCompactEachRow](#jsoncompacteachrow)                                   | ✔ | ✔ |
 | [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
 | [JSONCompactStringEachRow](#jsoncompactstringeachrow)                       | ✔ | ✔ |
@@ -612,7 +612,7 @@ Example:
 ```
 
 ## JSONEachRow {#jsoneachrow}
-## JSONStringEachRow {#jsonstringeachrow}
+## JSONStringsEachRow {#jsonstringseachrow}
 ## JSONCompactEachRow {#jsoncompacteachrow}
 ## JSONCompactStringEachRow {#jsoncompactstringeachrow}
 
@@ -627,9 +627,9 @@ When using these formats, ClickHouse outputs rows as separated, newline-delimite
 When inserting the data, you should provide a separate JSON value for each row.
 
 ## JSONEachRowWithProgress {#jsoneachrowwithprogress}
-## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
+## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}
 
-Differs from `JSONEachRow`/`JSONStringEachRow` in that ClickHouse will also yield progress information as JSON values.
+Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yield progress information as JSON values.
 
 ```json
 {"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md
index ea37a22c165..f4206f5d70c 100644
--- a/docs/en/operations/backup.md
+++ b/docs/en/operations/backup.md
@@ -5,7 +5,7 @@ toc_title: Data Backup
 
 # Data Backup {#data-backup}
 
-While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). However, these safeguards don’t cover all possible cases and can be circumvented.
+While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards don’t cover all possible cases and can be circumvented.
 
 In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index c7ee48c11bf..43519bfc8dc 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -312,7 +312,7 @@ Enables or disables parsing enum values as enum ids for TSV input format.
 Possible values:
 
 - 0 — Enum values are parsed as values.
-- 1 — Enum values are parsed as enum IDs
+- 1 — Enum values are parsed as enum IDs.
 
 Default value: 0.
 
@@ -2592,6 +2592,18 @@ Possible values:
 
 Default value: `16`.
 
+## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability}
+
+Sets the probability that ClickHouse can start a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied).
+
+Possible values:
+
+- 0 — The trace for all executed queries is disabled (if no parent trace context is supplied).
+- Positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, ClickHouse can start a trace on average for half of the queries.
+- 1 — The trace for all executed queries is enabled.
+
+Default value: `0`.
+
 ## optimize_on_insert {#optimize-on-insert}
 
 Enables or disables data transformation before the insertion, as if merge was done on this block (according to table engine).
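To make the setting above concrete, here is a minimal sketch of `optimize_on_insert` (the table name `test_opt` and the values are invented for illustration, not taken from the patch): with the setting enabled, rows that a merge would collapse are collapsed already inside the inserted block.

```sql
CREATE TABLE test_opt (key UInt32) ENGINE = ReplacingMergeTree ORDER BY key;

SET optimize_on_insert = 1;
-- the two rows with key = 1 arrive in one block, so ReplacingMergeTree
-- collapses them before the block is written
INSERT INTO test_opt VALUES (1), (1), (2);

SELECT count() FROM test_opt;  -- 2 with the setting on; 3 with optimize_on_insert = 0
```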
diff --git a/docs/en/operations/system-tables/opentelemetry_span_log.md b/docs/en/operations/system-tables/opentelemetry_span_log.md
new file mode 100644
index 00000000000..e45a989742c
--- /dev/null
+++ b/docs/en/operations/system-tables/opentelemetry_span_log.md
@@ -0,0 +1,53 @@
+# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log}
+
+Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries.
+
+Columns:
+
+- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for executed query.
+
+- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.
+
+- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`.
+
+- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation.
+
+- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds).
+
+- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds).
+
+- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — The finish date of the `trace span`.
+
+- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [Attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations in the [OpenTelemetry](https://opentelemetry.io/) standard.
+
+- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Attribute values depending on the `trace span`. They are filled in according to the recommendations in the `OpenTelemetry` standard.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical;
+```
+
+Result:
+
+``` text
+Row 1:
+──────
+trace_id:         cdab0847-0d62-61d5-4d38-dd65b19a1914
+span_id:          701487461015578150
+parent_span_id:   2991972114672045096
+operation_name:   DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
+start_time_us:    1612374594529090
+finish_time_us:   1612374594529108
+finish_date:      2021-02-03
+attribute.names:  []
+attribute.values: []
+```
+
+**See Also**
+
+- [OpenTelemetry](../../operations/opentelemetry.md)
+
+[Original article](https://clickhouse.tech/docs/en/operations/system_tables/opentelemetry_span_log)
diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md
index edacf1ff973..9fa9c44e130 100644
--- a/docs/en/operations/update.md
+++ b/docs/en/operations/update.md
@@ -1,9 +1,9 @@
 ---
 toc_priority: 47
-toc_title: ClickHouse Update
+toc_title: ClickHouse Upgrade
 ---
 
-# ClickHouse Update {#clickhouse-update}
+# ClickHouse Upgrade {#clickhouse-upgrade}
 
 If ClickHouse was installed from `deb` packages, execute the following commands on the server:
 
@@ -16,3 +16,19 @@ $ sudo service clickhouse-server restart
 If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method.
 
 ClickHouse does not support a distributed update. The operation should be performed consecutively on each separate server. Do not update all the servers on a cluster simultaneously, or the cluster will be unavailable for some time.
+
+To upgrade an older version of ClickHouse to a specific version, as an example:
+
+`xx.yy.a.b` is the current stable version. The latest stable version can be found [here](https://github.com/ClickHouse/ClickHouse/releases).
+
+```bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b
+$ sudo service clickhouse-server restart
+```
diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md
index 431968bc629..015c90e90c7 100644
--- a/docs/en/sql-reference/aggregate-functions/combinators.md
+++ b/docs/en/sql-reference/aggregate-functions/combinators.md
@@ -72,7 +72,7 @@ If an aggregate function doesn’t have input values, with this combinator it re
 <aggFunction>OrDefault(x)
 ```
 
-**Parameters**
+**Arguments**
 
 - `x` — Aggregate function parameters.
@@ -132,7 +132,7 @@ This combinator converts a result of an aggregate function to the [Nullable](../
 <aggFunction>OrNull(x)
 ```
 
-**Parameters**
+**Arguments**
 
 - `x` — Aggregate function parameters.
@@ -189,7 +189,7 @@ Lets you divide data into groups, and then separately aggregates the data in tho
 <aggFunction>Resample(start, end, step)(<aggFunction_params>, resampling_key)
 ```
 
-**Parameters**
+**Arguments**
 
 - `start` — Starting value of the whole required interval for `resampling_key` values.
 - `stop` — Ending value of the whole required interval for `resampling_key` values. The whole interval doesn’t include the `stop` value `[start, stop)`.
diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md
index 4b3bf12aa8c..035bc91b9ed 100644
--- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md
+++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md
@@ -17,10 +17,13 @@ histogram(number_of_bins)(values)
 
 The function uses [A Streaming Parallel Decision Tree Algorithm](http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf). The borders of histogram bins are adjusted as new data enters a function. In the common case, the widths of bins are not equal.
 
+**Arguments**
+
+`values` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in input values.
+
 **Parameters**
 
 `number_of_bins` — Upper limit for the number of bins in the histogram. The function automatically calculates the number of bins. It tries to reach the specified number of bins, but if it fails, it uses fewer bins.
-`values` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in input values.
 
 **Returned values**
@@ -89,14 +92,16 @@ sequenceMatch(pattern)(timestamp, cond1, cond2, ...)
 
 !!! warning "Warning"
     Events that occur at the same second may lie in the sequence in an undefined order affecting the result.
 
-**Parameters**
-
-- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax).
+**Arguments**
 
 - `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../sql-reference/data-types/int-uint.md) data types.
 
 - `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them.
 
+**Parameters**
+
+- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax).
+
 **Returned values**
 
 - 1, if the pattern is matched.
@@ -176,14 +181,16 @@ Counts the number of event chains that matched the pattern. The function searche
 sequenceCount(pattern)(timestamp, cond1, cond2, ...)
 ```
 
-**Parameters**
-
-- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax).
+**Arguments**
 
 - `timestamp` — Column considered to contain time data. Typical data types are `Date` and `DateTime`. You can also use any of the supported [UInt](../../sql-reference/data-types/int-uint.md) data types.
 
 - `cond1`, `cond2` — Conditions that describe the chain of events. Data type: `UInt8`. You can pass up to 32 condition arguments. The function takes only the events described in these conditions into account. If the sequence contains data that isn’t described in a condition, the function skips them.
 
+**Parameters**
+
+- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax).
+
 **Returned values**
 
 - Number of non-overlapping event chains that are matched.
@@ -239,13 +246,16 @@ The function works according to the algorithm:
 windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN)
 ```
 
+**Arguments**
+
+- `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, its value can’t exceed the Int64 maximum, which is 2^63 - 1).
+- `cond` — Conditions or data describing the chain of events. [UInt8](../../sql-reference/data-types/int-uint.md).
+
 **Parameters**
 
 - `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`.
-- `mode` - It is an optional argument.
+- `mode` - It is an optional parameter.
     - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values.
-- `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it’s value can’t exceed the Int64 maximum, which is 2^63 - 1).
-- `cond` — Conditions or data describing the chain of events. [UInt8](../../sql-reference/data-types/int-uint.md).
 
 **Returned value**
@@ -324,7 +334,7 @@ The conditions, except the first, apply in pairs: the result of the second will
 retention(cond1, cond2, ..., cond32);
 ```
 
-**Parameters**
+**Arguments**
 
 - `cond` — an expression that returns a `UInt8` result (1 or 0).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md
index 9899c731ce9..7639117042f 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md
@@ -20,7 +20,7 @@ or
 argMax(tuple(arg, val))
 ```
 
-**Parameters**
+**Arguments**
 
 - `arg` — Argument.
 - `val` — Value.
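A quick sketch of `argMax` usage (the `salary` table and its rows are illustrative, not part of the patch): the function returns the value of `arg` from the row where `val` is maximal.

```sql
CREATE TABLE salary (user String, salary UInt32) ENGINE = Memory;
INSERT INTO salary VALUES ('director', 5000), ('manager', 3000), ('worker', 1000);

-- 'director' has the maximum salary, so argMax returns 'director'
SELECT argMax(user, salary) FROM salary;
```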
diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmin.md b/docs/en/sql-reference/aggregate-functions/reference/argmin.md
index 2fe9a313260..7ddc38cd28a 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/argmin.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/argmin.md
@@ -20,7 +20,7 @@ or
 argMin(tuple(arg, val))
 ```
 
-**Parameters**
+**Arguments**
 
 - `arg` — Argument.
 - `val` — Value.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md
index e2e6aace734..12dc4ac1e9d 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/avg.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md
@@ -12,7 +12,7 @@ Calculates the arithmetic mean.
 avg(x)
 ```
 
-**Parameter**
+**Arguments**
 
 - `x` — Values.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
index 7b9c0de2755..2df09e560b4 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/avgweighted.md
@@ -12,7 +12,7 @@ Calculates the [weighted arithmetic mean](https://en.wikipedia.org/wiki/Weighted
 avgWeighted(x, weight)
 ```
 
-**Parameters**
+**Arguments**
 
 - `x` — Values.
 - `weight` — Weights of the values.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/count.md b/docs/en/sql-reference/aggregate-functions/reference/count.md
index e5d31429e12..0a5aef2fe97 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/count.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/count.md
@@ -10,7 +10,7 @@ ClickHouse supports the following syntaxes for `count`:
 - `count(expr)` or `COUNT(DISTINCT expr)`.
 - `count()` or `COUNT(*)`. The `count()` syntax is ClickHouse-specific.
 
-**Parameters**
+**Arguments**
 
 The function can take:
diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md
index f4b8665a0a4..68456bf7844 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/grouparrayinsertat.md
@@ -17,7 +17,7 @@ If in one query several values are inserted into the same position, the function
 - If a query is executed in a single thread, the first one of the inserted values is used.
 - If a query is executed in multiple threads, the resulting value is an undetermined one of the inserted values.
 
-**Parameters**
+**Arguments**
 
 - `x` — Value to be inserted. [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../../sql-reference/data-types/index.md).
 - `pos` — Position at which the specified element `x` is to be inserted. Index numbering in the array starts from zero. [UInt32](../../../sql-reference/data-types/int-uint.md#uint-ranges).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md
index 1cd40c2002f..c732efecf58 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingavg.md
@@ -13,7 +13,7 @@ groupArrayMovingAvg(window_size)(numbers_for_summing)
 
 The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column.
 
-**Parameters**
+**Arguments**
 
 - `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value.
 - `window_size` — Size of the calculation window.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md
index ef979cd5f6a..c3dfeda850e 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraymovingsum.md
@@ -13,7 +13,7 @@ groupArrayMovingSum(window_size)(numbers_for_summing)
 
 The function can take the window size as a parameter. If left unspecified, the function takes the window size equal to the number of rows in the column.
 
-**Parameters**
+**Arguments**
 
 - `numbers_for_summing` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) resulting in a numeric data type value.
 - `window_size` — Size of the calculation window.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md b/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md
index 36fa6a9d661..df0b8120eef 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/grouparraysample.md
@@ -12,7 +12,7 @@ Creates an array of sample argument values. The size of the resulting array is l
 groupArraySample(max_size[, seed])(x)
 ```
 
-**Parameters**
+**Arguments**
 
 - `max_size` — Maximum size of the resulting array. [UInt64](../../data-types/int-uint.md).
 - `seed` — Seed for the random number generator. Optional. [UInt64](../../data-types/int-uint.md). Default value: `123456`.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md
index 9be73fd54ec..1275ad7536c 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitand.md
@@ -10,7 +10,7 @@ Applies bitwise `AND` for series of numbers.
 groupBitAnd(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `UInt*` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md
index 9367652db38..9317ef98783 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmap.md
@@ -10,7 +10,7 @@ Bitmap or Aggregate calculations from a unsigned integer column, return cardinal
 groupBitmap(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `UInt*` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md
index 7c0c89040bb..f59bb541a42 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapand.md
@@ -10,7 +10,7 @@ Calculations the AND of a bitmap column, return cardinality of type UInt64, if a
 groupBitmapAnd(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
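For orientation, a sketch of how `groupBitmapAnd` is applied to such a column (the table name and the inserted sets are invented for the example): the result is the cardinality of the bitwise AND of all bitmaps in the group.

```sql
CREATE TABLE bitmap_test (tag_id String, z AggregateFunction(groupBitmap, UInt32))
ENGINE = MergeTree ORDER BY tag_id;

INSERT INTO bitmap_test VALUES ('tag1', bitmapBuild(CAST([1,2,3,4,5,6,7,8,9,10] AS Array(UInt32))));
INSERT INTO bitmap_test VALUES ('tag2', bitmapBuild(CAST([6,7,8,9,10,11,12,13,14,15] AS Array(UInt32))));

-- the intersection of the two bitmaps is {6,7,8,9,10}, so the result is 5
SELECT groupBitmapAnd(z) FROM bitmap_test WHERE tag_id LIKE 'tag%';
```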
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md
index 894c6c90aab..a4d99fd29e3 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapor.md
@@ -10,7 +10,7 @@ Calculations the OR of a bitmap column, return cardinality of type UInt64, if ad
 groupBitmapOr(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md
index 5d0ec0fb097..834f088d02f 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitmapxor.md
@@ -10,7 +10,7 @@ Calculations the XOR of a bitmap column, return cardinality of type UInt64, if a
 groupBitmapXor(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `AggregateFunction(groupBitmap, UInt*)` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md
index 7383e620060..e427a9ad970 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitor.md
@@ -10,7 +10,7 @@ Applies bitwise `OR` for series of numbers.
 groupBitOr(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `UInt*` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md
index 01026012b91..4b8323f92db 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/groupbitxor.md
@@ -10,7 +10,7 @@ Applies bitwise `XOR` for series of numbers.
 groupBitXor(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` – An expression that results in `UInt*` type.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md b/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md
index ea44d5f1ddd..313d6bf81f5 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/initializeAggregation.md
@@ -13,7 +13,7 @@ Use it for tests or to process columns of types `AggregateFunction` and `Aggrega
 initializeAggregation (aggregate_function, column_1, column_2);
 ```
 
-**Parameters**
+**Arguments**
 
 - `aggregate_function` — Name of the aggregation function. The state of this function — the creating one. [String](../../../sql-reference/data-types/string.md#string).
 - `column_n` — The column to translate it into the function as its argument. [String](../../../sql-reference/data-types/string.md#string).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md
index 65e7e31b9b4..db402c99663 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/kurtpop.md
@@ -10,7 +10,7 @@ Computes the [kurtosis](https://en.wikipedia.org/wiki/Kurtosis) of a sequence.
 kurtPop(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md
index 224bbbdb9e7..4bb9f76763b 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/kurtsamp.md
@@ -12,7 +12,7 @@ It represents an unbiased estimate of the kurtosis of a random variable if passe
 kurtSamp(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md
index 012df7052aa..12982849513 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md
@@ -16,6 +16,11 @@ mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_ind
 Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population.
 The null hypothesis is that two populations are stochastically equal. Also one-sided hypotheses can be tested. This test does not assume that data have normal distribution.
 
+**Arguments**
+
+- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
+- `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md).
+
 **Parameters**
 
 - `alternative` — alternative hypothesis. (Optional, default: `'two-sided'`.) [String](../../../sql-reference/data-types/string.md).
@@ -23,9 +28,6 @@ The null hypothesis is that two populations are stochastically equal. Also one-s
     - `'greater'`;
     - `'less'`.
 - `continuity_correction` - if not 0 then continuity correction in the normal approximation for the p-value is applied. (Optional, default: 1.) [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md).
-- `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md).
-
 **Returned values**
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantile.md b/docs/en/sql-reference/aggregate-functions/reference/quantile.md
index 77f858a1735..d625ef4cfd9 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantile.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantile.md
@@ -18,7 +18,7 @@ quantile(level)(expr)
 
 Alias: `median`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md b/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md
index 6046447dd10..a20ac26f599 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiledeterministic.md
@@ -18,7 +18,7 @@ quantileDeterministic(level)(expr, determinator)
 
 Alias: `medianDeterministic`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md
index a39f724f368..06ef7ccfbd3 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexact.md
@@ -18,7 +18,7 @@ quantileExact(level)(expr)
 
 Alias: `medianExact`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
@@ -77,7 +77,7 @@ quantileExact(level)(expr)
 
 Alias: `medianExactLow`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
@@ -128,7 +128,7 @@ quantileExactHigh(level)(expr)
 
 Alias: `medianExactHigh`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
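Since the exact variants differ only in how they pick between the two middle elements of an even-sized set, a small sketch (queries invented for illustration, not from the patch):

```sql
-- numbers(10) produces 0..9; with the default level = 0.5 the set has
-- two middle elements, 4 and 5
SELECT quantileExact(number) FROM numbers(10);      -- 5
SELECT quantileExactLow(number) FROM numbers(10);   -- 4 (lower middle element)
SELECT quantileExactHigh(number) FROM numbers(10);  -- 5 (higher middle element)
```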
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
index 3251f8298a6..210f44e7587 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantileexactweighted.md
@@ -18,7 +18,7 @@ quantileExactWeighted(level)(expr, weight)
 
 Alias: `medianExactWeighted`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md
index bda98ea338d..dcc665a68af 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md
@@ -20,7 +20,7 @@ quantileTDigest(level)(expr)
 
 Alias: `medianTDigest`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md
index 309cbe95e95..56ef598f7e7 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigestweighted.md
@@ -20,7 +20,7 @@ quantileTDigest(level)(expr)
 
 Alias: `medianTDigest`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
 - `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md
index 867e8b87e74..58ce6495a96 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletiming.md
@@ -18,7 +18,7 @@ quantileTiming(level)(expr)
 
 Alias: `medianTiming`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md
index 817cd831d85..fb3b9dbf4d2 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletimingweighted.md
@@ -18,7 +18,7 @@ quantileTimingWeighted(level)(expr, weight)
 
 Alias: `medianTimingWeighted`.
 
-**Parameters**
+**Arguments**
 
 - `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md
index dc23029f239..55ee1b8289b 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/rankCorr.md
@@ -8,7 +8,7 @@ Computes a rank correlation coefficient.
 rankCorr(x, y)
 ```
 
-**Parameters**
+**Arguments**
 
 - `x` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
 - `y` — Arbitrary value. [Float32](../../../sql-reference/data-types/float.md#float32-float64) or [Float64](../../../sql-reference/data-types/float.md#float32-float64).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md
index d15a5ffdd47..b9dfc390f9d 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/skewpop.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/skewpop.md
@@ -10,7 +10,7 @@ Computes the [skewness](https://en.wikipedia.org/wiki/Skewness) of a sequence.
 skewPop(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md
index cb323f4b142..f7a6df8f507 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/skewsamp.md
@@ -12,7 +12,7 @@ It represents an unbiased estimate of the skewness of a random variable if passe
 skewSamp(expr)
 ```
 
-**Parameters**
+**Arguments**
 
 `expr` — [Expression](../../../sql-reference/syntax.md#syntax-expressions) returning a number.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md
index f868e976039..ba10c1d62d9 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md
@@ -16,7 +16,7 @@ studentTTest(sample_data, sample_index)
 
 Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population.
 The null hypothesis is that means of populations are equal. Normal distribution with equal variances is assumed.
Normal distribution with equal variances is assumed. -**Parameters** +**Arguments** - `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). - `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md). diff --git a/docs/en/sql-reference/aggregate-functions/reference/topk.md b/docs/en/sql-reference/aggregate-functions/reference/topk.md index 004a67d33af..b3e79803ba1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topk.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topk.md @@ -16,7 +16,7 @@ This function doesn’t provide a guaranteed result. In certain situations, erro We recommend using the `N < 10` value; performance is reduced with large `N` values. Maximum value of `N = 65536`. -**Parameters** +**Arguments** - ‘N’ is the number of elements to return. diff --git a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md index b597317f44e..02b9f77ea6f 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md +++ b/docs/en/sql-reference/aggregate-functions/reference/topkweighted.md @@ -12,7 +12,7 @@ Similar to `topK` but takes one additional argument of integer type - `weight`. topKWeighted(N)(x, weight) ``` -**Parameters** +**Arguments** - `N` — The number of elements to return. diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniq.md b/docs/en/sql-reference/aggregate-functions/reference/uniq.md index 81d1ec6761e..7ba2cdc6cb8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniq.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniq.md @@ -10,7 +10,7 @@ Calculates the approximate number of different values of the argument. uniq(x[, ...]) ``` -**Parameters** +**Arguments** The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md index c52486bc38f..4434686ae61 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqcombined.md @@ -12,7 +12,7 @@ uniqCombined(HLL_precision)(x[, ...]) The `uniqCombined` function is a good choice for calculating the number of different values. -**Parameters** +**Arguments** The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md b/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md index 9a6224533c8..eee675016ee 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqexact.md @@ -14,7 +14,7 @@ Use the `uniqExact` function if you absolutely need an exact result. Otherwise u The `uniqExact` function uses more memory than `uniq`, because the size of the state has unbounded growth as the number of different values increases. -**Parameters** +**Arguments** The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. 
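The `uniq*` hunks above rename the same heading over one shared variable-argument signature. A minimal sketch of that signature in use, with `numbers()` standing in for real data:

```sql
-- uniq (approximate), uniqCombined and uniqExact all accept the same argument list.
SELECT
    uniq(x)         AS approx,
    uniqCombined(x) AS combined,
    uniqExact(x)    AS exact
FROM (SELECT number % 10 AS x FROM numbers(1000));
-- All three return 10 here: the approximate estimators are effectively exact
-- at such a small cardinality.
```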
diff --git a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md index fcddc22cc46..5b23ea81eae 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md +++ b/docs/en/sql-reference/aggregate-functions/reference/uniqhll12.md @@ -10,7 +10,7 @@ Calculates the approximate number of different argument values, using the [Hyper uniqHLL12(x[, ...]) ``` -**Parameters** +**Arguments** The function takes a variable number of parameters. Parameters can be `Tuple`, `Array`, `Date`, `DateTime`, `String`, or numeric types. diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index 3fe1c9d58b9..18cff885867 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -16,7 +16,7 @@ welchTTest(sample_data, sample_index) Values of both samples are in the `sample_data` column. If `sample_index` equals to 0 then the value in that row belongs to the sample from the first population. Otherwise it belongs to the sample from the second population. The null hypothesis is that means of populations are equal. Normal distribution is assumed. Populations may have unequal variance. -**Parameters** +**Arguments** - `sample_data` — sample data. [Integer](../../../sql-reference/data-types/int-uint.md), [Float](../../../sql-reference/data-types/float.md) or [Decimal](../../../sql-reference/data-types/decimal.md). - `sample_index` — sample index. [Integer](../../../sql-reference/data-types/int-uint.md). diff --git a/docs/en/sql-reference/data-types/map.md b/docs/en/sql-reference/data-types/map.md new file mode 100644 index 00000000000..58634e5b669 --- /dev/null +++ b/docs/en/sql-reference/data-types/map.md @@ -0,0 +1,83 @@ +--- +toc_priority: 65 +toc_title: Map(key, value) +--- + +# Map(key, value) {#data_type-map} + +`Map(key, value)` data type stores `key:value` pairs. + +**Parameters** +- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md). +- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md). + +!!! warning "Warning" + Currently, the `Map` data type is an experimental feature. To work with it, you must set `allow_experimental_map_type = 1`. + +To get the value from a column `a Map('key', 'value')`, use the `a['key']` syntax. This lookup currently works with linear complexity. + +**Examples** + +Consider the table: + +``` sql +CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory; +INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30}); +``` + +Select all `key2` values: + +```sql +SELECT a['key2'] FROM table_map; +``` +Result: + +```text +┌─arrayElement(a, 'key2')─┐ +│ 10 │ +│ 20 │ +│ 30 │ +└─────────────────────────┘ +``` + +If there is no such `key` in the `Map()` column, the query returns zeros for numerical values, empty strings, or empty arrays.
+ +```sql +INSERT INTO table_map VALUES ({'key3':100}), ({}); +SELECT a['key3'] FROM table_map; +``` + +Result: + +```text +┌─arrayElement(a, 'key3')─┐ +│ 100 │ +│ 0 │ +└─────────────────────────┘ +┌─arrayElement(a, 'key3')─┐ +│ 0 │ +│ 0 │ +│ 0 │ +└─────────────────────────┘ +``` + +## Convert Tuple to Map Type {#map-and-tuple} + +You can cast `Tuple()` as `Map()` using [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function: + +``` sql +SELECT CAST(([1, 2, 3], ['Ready', 'Steady', 'Go']), 'Map(UInt8, String)') AS map; +``` + +``` text +┌─map───────────────────────────┐ +│ {1:'Ready',2:'Steady',3:'Go'} │ +└───────────────────────────────┘ +``` + +**See Also** + +- [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) function +- [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function + +[Original article](https://clickhouse.tech/docs/en/data-types/map/) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index d5b357795d7..c9c418d57a4 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -61,7 +61,7 @@ Combines arrays passed as arguments. arrayConcat(arrays) ``` -**Parameters** +**Arguments** - `arrays` – Arbitrary number of arguments of [Array](../../sql-reference/data-types/array.md) type. **Example** @@ -111,7 +111,7 @@ Checks whether one array is a subset of another. hasAll(set, subset) ``` -**Parameters** +**Arguments** - `set` – Array of any type with a set of elements. - `subset` – Array of any type with elements that should be tested to be a subset of `set`. @@ -149,7 +149,7 @@ Checks whether two arrays have intersection by some elements. hasAny(array1, array2) ``` -**Parameters** +**Arguments** - `array1` – Array of any type with a set of elements. - `array2` – Array of any type with a set of elements. @@ -191,7 +191,7 @@ For Example: - `hasSubstr([1,2,3,4], [2,3])` returns 1. However, `hasSubstr([1,2,3,4], [3,2])` will return `0`. - `hasSubstr([1,2,3,4], [1,2,3])` returns 1. However, `hasSubstr([1,2,3,4], [1,2,4])` will return `0`. -**Parameters** +**Arguments** - `array1` – Array of any type with a set of elements. - `array2` – Array of any type with a set of elements. @@ -369,7 +369,7 @@ Removes the last item from the array. arrayPopBack(array) ``` -**Parameters** +**Arguments** - `array` – Array. @@ -393,7 +393,7 @@ Removes the first item from the array. arrayPopFront(array) ``` -**Parameters** +**Arguments** - `array` – Array. @@ -417,7 +417,7 @@ Adds one item to the end of the array. arrayPushBack(array, single_value) ``` -**Parameters** +**Arguments** - `array` – Array. - `single_value` – A single value. Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../../sql-reference/data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. @@ -442,7 +442,7 @@ Adds one element to the beginning of the array. arrayPushFront(array, single_value) ``` -**Parameters** +**Arguments** - `array` – Array. - `single_value` – A single value. 
Only numbers can be added to an array with numbers, and only strings can be added to an array of strings. When adding numbers, ClickHouse automatically sets the `single_value` type for the data type of the array. For more information about the types of data in ClickHouse, see “[Data types](../../sql-reference/data-types/index.md#data_types)”. Can be `NULL`. The function adds a `NULL` element to an array, and the type of array elements converts to `Nullable`. @@ -467,7 +467,7 @@ Changes the length of the array. arrayResize(array, size[, extender]) ``` -**Parameters:** +**Arguments:** - `array` — Array. - `size` — Required length of the array. @@ -509,7 +509,7 @@ Returns a slice of the array. arraySlice(array, offset[, length]) ``` -**Parameters** +**Arguments** - `array` – Array of data. - `offset` – Indent from the edge of the array. A positive value indicates an offset on the left, and a negative value is an indent on the right. Numbering of the array items begins with 1. @@ -751,7 +751,7 @@ Calculates the difference between adjacent array elements. Returns an array wher arrayDifference(array) ``` -**Parameters** +**Arguments** - `array` – [Array](https://clickhouse.tech/docs/en/data_types/array/). @@ -803,7 +803,7 @@ Takes an array, returns an array containing the distinct elements only. arrayDistinct(array) ``` -**Parameters** +**Arguments** - `array` – [Array](https://clickhouse.tech/docs/en/data_types/array/). @@ -871,7 +871,7 @@ Applies an aggregate function to array elements and returns its result. The name arrayReduce(agg_func, arr1, arr2, ..., arrN) ``` -**Parameters** +**Arguments** - `agg_func` — The name of an aggregate function which should be a constant [string](../../sql-reference/data-types/string.md). - `arr` — Any number of [array](../../sql-reference/data-types/array.md) type columns as the parameters of the aggregation function. @@ -936,7 +936,7 @@ Applies an aggregate function to array elements in given ranges and returns an a arrayReduceInRanges(agg_func, ranges, arr1, arr2, ..., arrN) ``` -**Parameters** +**Arguments** - `agg_func` — The name of an aggregate function which should be a constant [string](../../sql-reference/data-types/string.md). - `ranges` — The ranges to aggretate which should be an [array](../../sql-reference/data-types/array.md) of [tuples](../../sql-reference/data-types/tuple.md) which containing the index and the length of each range. @@ -1007,7 +1007,7 @@ flatten(array_of_arrays) Alias: `flatten`. -**Parameters** +**Arguments** - `array_of_arrays` — [Array](../../sql-reference/data-types/array.md) of arrays. For example, `[[1,2,3], [4,5]]`. @@ -1033,7 +1033,7 @@ Removes consecutive duplicate elements from an array. The order of result values arrayCompact(arr) ``` -**Parameters** +**Arguments** `arr` — The [array](../../sql-reference/data-types/array.md) to inspect. @@ -1069,7 +1069,7 @@ Combines multiple arrays into a single array. The resulting array contains the c arrayZip(arr1, arr2, ..., arrN) ``` -**Parameters** +**Arguments** - `arrN` — [Array](../../sql-reference/data-types/array.md). @@ -1107,7 +1107,7 @@ Calculate AUC (Area Under the Curve, which is a concept in machine learning, see arrayAUC(arr_scores, arr_labels) ``` -**Parameters** +**Arguments** - `arr_scores` — scores prediction model gives. - `arr_labels` — labels of samples, usually 1 for positive sample and 0 for negtive sample. 
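The `arrayAUC` signature documented in the hunk above is easy to check with constants; this is the canonical example from the same page:

```sql
SELECT arrayAUC([0.1, 0.4, 0.35, 0.8], [0, 0, 1, 1]) AS auc;
-- 0.75: of the four positive/negative score pairs, three rank the positive sample higher.
```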
@@ -1302,7 +1302,7 @@ Note that the `arrayMin` is a [higher-order function](../../sql-reference/functi arrayMin([func,] arr) ``` -**Parameters** +**Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — Array. [Array](../../sql-reference/data-types/array.md). @@ -1357,7 +1357,7 @@ Note that the `arrayMax` is a [higher-order function](../../sql-reference/functi arrayMax([func,] arr) ``` -**Parameters** +**Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — Array. [Array](../../sql-reference/data-types/array.md). @@ -1412,7 +1412,7 @@ Note that the `arraySum` is a [higher-order function](../../sql-reference/functi arraySum([func,] arr) ``` -**Parameters** +**Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — Array. [Array](../../sql-reference/data-types/array.md). @@ -1467,7 +1467,7 @@ Note that the `arrayAvg` is a [higher-order function](../../sql-reference/functi arrayAvg([func,] arr) ``` -**Parameters** +**Arguments** - `func` — Function. [Expression](../../sql-reference/data-types/special-data-types/expression.md). - `arr` — Array. [Array](../../sql-reference/data-types/array.md). diff --git a/docs/en/sql-reference/functions/bit-functions.md b/docs/en/sql-reference/functions/bit-functions.md index 57c2ae42ada..a3d0c82d8ab 100644 --- a/docs/en/sql-reference/functions/bit-functions.md +++ b/docs/en/sql-reference/functions/bit-functions.md @@ -35,7 +35,7 @@ Takes any integer and converts it into [binary form](https://en.wikipedia.org/wi SELECT bitTest(number, index) ``` -**Parameters** +**Arguments** - `number` – integer number. - `index` – position of bit. @@ -100,7 +100,7 @@ The conjuction for bitwise operations: SELECT bitTestAll(number, index1, index2, index3, index4, ...) ``` -**Parameters** +**Arguments** - `number` – integer number. - `index1`, `index2`, `index3`, `index4` – positions of bit. For example, for set of positions (`index1`, `index2`, `index3`, `index4`) is true if and only if all of its positions are true (`index1` ⋀ `index2`, ⋀ `index3` ⋀ `index4`). @@ -165,7 +165,7 @@ The disjunction for bitwise operations: SELECT bitTestAny(number, index1, index2, index3, index4, ...) ``` -**Parameters** +**Arguments** - `number` – integer number. - `index1`, `index2`, `index3`, `index4` – positions of bit. @@ -220,7 +220,7 @@ Calculates the number of bits set to one in the binary representation of a numbe bitCount(x) ``` -**Parameters** +**Arguments** - `x` — [Integer](../../sql-reference/data-types/int-uint.md) or [floating-point](../../sql-reference/data-types/float.md) number. The function uses the value representation in memory. It allows supporting floating-point numbers. diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index a66098beffb..bfff70576f2 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -21,7 +21,7 @@ Build a bitmap from unsigned integer array. bitmapBuild(array) ``` -**Parameters** +**Arguments** - `array` – unsigned integer array. @@ -45,7 +45,7 @@ Convert bitmap to integer array. bitmapToArray(bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -69,7 +69,7 @@ Return subset in specified range (not include the range_end). 
bitmapSubsetInRange(bitmap, range_start, range_end) ``` -**Parameters** +**Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). - `range_start` – range start point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). @@ -97,7 +97,7 @@ Creates a subset of bitmap with n elements taken between `range_start` and `card bitmapSubsetLimit(bitmap, range_start, cardinality_limit) ``` -**Parameters** +**Arguments** - `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). - `range_start` – The subset starting point. Type: [UInt32](../../sql-reference/data-types/int-uint.md). @@ -133,7 +133,7 @@ Checks whether the bitmap contains an element. bitmapContains(haystack, needle) ``` -**Parameters** +**Arguments** - `haystack` – [Bitmap object](#bitmap_functions-bitmapbuild), where the function searches. - `needle` – Value that the function searches. Type: [UInt32](../../sql-reference/data-types/int-uint.md). @@ -167,7 +167,7 @@ bitmapHasAny(bitmap1, bitmap2) If you are sure that `bitmap2` contains strictly one element, consider using the [bitmapContains](#bitmap_functions-bitmapcontains) function. It works more efficiently. -**Parameters** +**Arguments** - `bitmap*` – bitmap object. @@ -197,7 +197,7 @@ If the second argument is an empty bitmap then returns 1. bitmapHasAll(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -221,7 +221,7 @@ Retrun bitmap cardinality of type UInt64. bitmapCardinality(bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -243,7 +243,7 @@ Retrun the smallest value of type UInt64 in the set, UINT32_MAX if the set is em bitmapMin(bitmap) -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -263,7 +263,7 @@ Retrun the greatest value of type UInt64 in the set, 0 if the set is empty. bitmapMax(bitmap) -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -283,7 +283,7 @@ Transform an array of values in a bitmap to another array of values, the result bitmapTransform(bitmap, from_array, to_array) -**Parameters** +**Arguments** - `bitmap` – bitmap object. - `from_array` – UInt32 array. For idx in range \[0, from_array.size()), if bitmap contains from_array\[idx\], then replace it with to_array\[idx\]. Note that the result depends on array ordering if there are common elements between from_array and to_array. @@ -307,7 +307,7 @@ Two bitmap and calculation, the result is a new bitmap. bitmapAnd(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -331,7 +331,7 @@ Two bitmap or calculation, the result is a new bitmap. bitmapOr(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -355,7 +355,7 @@ Two bitmap xor calculation, the result is a new bitmap. bitmapXor(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -379,7 +379,7 @@ Two bitmap andnot calculation, the result is a new bitmap. bitmapAndnot(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -403,7 +403,7 @@ Two bitmap and calculation, return cardinality of type UInt64. bitmapAndCardinality(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -427,7 +427,7 @@ Two bitmap or calculation, return cardinality of type UInt64. bitmapOrCardinality(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. @@ -451,7 +451,7 @@ Two bitmap xor calculation, return cardinality of type UInt64. bitmapXorCardinality(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. 
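Since the bitmap hunks above all share one calling pattern, a single sketch (values chosen arbitrarily) covers the whole family:

```sql
SELECT
    bitmapToArray(bitmapAnd(bitmapBuild([1, 2, 3]), bitmapBuild([3, 4, 5]))) AS intersection, -- [3]
    bitmapAndCardinality(bitmapBuild([1, 2, 3]), bitmapBuild([3, 4, 5]))     AS n;            -- 1
```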
@@ -475,7 +475,7 @@ Two bitmap andnot calculation, return cardinality of type UInt64. bitmapAndnotCardinality(bitmap,bitmap) ``` -**Parameters** +**Arguments** - `bitmap` – bitmap object. diff --git a/docs/en/sql-reference/functions/conditional-functions.md b/docs/en/sql-reference/functions/conditional-functions.md index 446a4729ff2..2d57cbb3bd5 100644 --- a/docs/en/sql-reference/functions/conditional-functions.md +++ b/docs/en/sql-reference/functions/conditional-functions.md @@ -17,7 +17,7 @@ SELECT if(cond, then, else) If the condition `cond` evaluates to a non-zero value, returns the result of the expression `then`, and the result of the expression `else`, if present, is skipped. If the `cond` is zero or `NULL`, then the result of the `then` expression is skipped and the result of the `else` expression, if present, is returned. -**Parameters** +**Arguments** - `cond` – The condition for evaluation that can be zero or not. The type is UInt8, Nullable(UInt8) or NULL. - `then` - The expression to return if condition is met. @@ -117,7 +117,7 @@ Allows you to write the [CASE](../../sql-reference/operators/index.md#operator_c Syntax: `multiIf(cond_1, then_1, cond_2, then_2, ..., else)` -**Parameters:** +**Arguments:** - `cond_N` — The condition for the function to return `then_N`. - `then_N` — The result of the function when executed. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4a73bdb2546..f26e1bee6c9 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -186,7 +186,7 @@ Truncates sub-seconds. toStartOfSecond(value[, timezone]) ``` -**Parameters** +**Arguments** - `value` — Date and time. [DateTime64](../../sql-reference/data-types/datetime64.md). - `timezone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). If not specified, the function uses the timezone of the `value` parameter. [String](../../sql-reference/data-types/string.md). @@ -328,7 +328,7 @@ For mode values with a meaning of “contains January 1”, the week contains Ja toWeek(date, [, mode][, Timezone]) ``` -**Parameters** +**Arguments** - `date` – Date or DateTime. - `mode` – Optional parameter, Range of values is \[0,9\], default is 0. @@ -378,7 +378,7 @@ date_trunc(unit, value[, timezone]) Alias: `dateTrunc`. -**Parameters** +**Arguments** - `unit` — The type of interval to truncate the result. [String Literal](../syntax.md#syntax-string-literal). Possible values: @@ -447,7 +447,7 @@ date_add(unit, value, date) Aliases: `dateAdd`, `DATE_ADD`. -**Parameters** +**Arguments** - `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). @@ -484,7 +484,7 @@ date_diff('unit', startdate, enddate, [timezone]) Aliases: `dateDiff`, `DATE_DIFF`. -**Parameters** +**Arguments** - `unit` — The type of interval for result [String](../../sql-reference/data-types/string.md). @@ -530,7 +530,7 @@ date_sub(unit, value, date) Aliases: `dateSub`, `DATE_SUB`. -**Parameters** +**Arguments** - `unit` — The type of interval to subtract. [String](../../sql-reference/data-types/string.md). @@ -570,7 +570,7 @@ timestamp_add(date, INTERVAL value unit) Aliases: `timeStampAdd`, `TIMESTAMP_ADD`. -**Parameters** +**Arguments** - `date` — Date or Date with time - [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
- `value` - Value in specified unit - [Int](../../sql-reference/data-types/int-uint.md) @@ -606,7 +606,7 @@ timestamp_sub(unit, value, date) Aliases: `timeStampSub`, `TIMESTAMP_SUB`. -**Parameters** +**Arguments** - `unit` — The type of interval to add. [String](../../sql-reference/data-types/string.md). @@ -640,7 +640,7 @@ Returns the current date and time. now([timezone]) ``` -**Parameters** +**Arguments** - `timezone` — [Timezone name](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone) for the returned value (optional). [String](../../sql-reference/data-types/string.md). @@ -855,7 +855,7 @@ Converts a [Proleptic Gregorian calendar](https://en.wikipedia.org/wiki/Prolepti toModifiedJulianDay(date) ``` -**Parameters** +**Arguments** - `date` — Date in text form. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). @@ -891,7 +891,7 @@ Similar to [toModifiedJulianDay()](#tomodifiedjulianday), but instead of raising toModifiedJulianDayOrNull(date) ``` -**Parameters** +**Arguments** - `date` — Date in text form. [String](../../sql-reference/data-types/string.md) or [FixedString](../../sql-reference/data-types/fixedstring.md). @@ -927,7 +927,7 @@ Converts a [Modified Julian Day](https://en.wikipedia.org/wiki/Julian_day#Varian fromModifiedJulianDay(day) ``` -**Parameters** +**Arguments** - `day` — Modified Julian Day number. [Any integral types](../../sql-reference/data-types/int-uint.md). @@ -963,7 +963,7 @@ Similar to [fromModifiedJulianDayOrNull()](#frommodifiedjuliandayornull), but in fromModifiedJulianDayOrNull(day) ``` -**Parameters** +**Arguments** - `day` — Modified Julian Day number. [Any integral types](../../sql-reference/data-types/int-uint.md). diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index bc3f5ca4345..31e84c08b39 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -15,7 +15,7 @@ Returns the string with the length as the number of passed arguments and each by char(number_1, [number_2, ..., number_n]); ``` -**Parameters** +**Arguments** - `number_1, number_2, ..., number_n` — Numerical arguments interpreted as integers. Types: [Int](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md). @@ -107,7 +107,7 @@ For `String` and `FixedString`, all bytes are simply encoded as two hexadecimal Values of floating point and Decimal types are encoded as their representation in memory. As we support little endian architecture, they are encoded in little endian. Zero leading/trailing bytes are not omitted. -**Parameters** +**Arguments** - `arg` — A value to convert to hexadecimal. Types: [String](../../sql-reference/data-types/string.md), [UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md), [Decimal](../../sql-reference/data-types/decimal.md), [Date](../../sql-reference/data-types/date.md) or [DateTime](../../sql-reference/data-types/datetime.md). 
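As a quick sanity check of the `hex` behavior described above (numbers are encoded per byte, strings per character):

```sql
SELECT hex(1) AS one_byte, hex('abc') AS three_bytes;
-- one_byte = '01' (the literal 1 fits in a single byte),
-- three_bytes = '616263' (two hexadecimal digits per ASCII character)
```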
diff --git a/docs/en/sql-reference/functions/encryption-functions.md b/docs/en/sql-reference/functions/encryption-functions.md index 9e360abfe26..0dd7469b25e 100644 --- a/docs/en/sql-reference/functions/encryption-functions.md +++ b/docs/en/sql-reference/functions/encryption-functions.md @@ -31,7 +31,7 @@ This function encrypts data using these modes: encrypt('mode', 'plaintext', 'key' [, iv, aad]) ``` -**Parameters** +**Arguments** - `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). - `plaintext` — Text thats need to be encrypted. [String](../../sql-reference/data-types/string.md#string). @@ -127,7 +127,7 @@ Supported encryption modes: aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) ``` -**Parameters** +**Arguments** - `mode` — Encryption mode. [String](../../sql-reference/data-types/string.md#string). - `plaintext` — Text that needs to be encrypted. [String](../../sql-reference/data-types/string.md#string). @@ -238,7 +238,7 @@ This function decrypts ciphertext into a plaintext using these modes: decrypt('mode', 'ciphertext', 'key' [, iv, aad]) ``` -**Parameters** +**Arguments** - `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). @@ -317,7 +317,7 @@ Supported decryption modes: aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) ``` -**Parameters** +**Arguments** - `mode` — Decryption mode. [String](../../sql-reference/data-types/string.md#string). - `ciphertext` — Encrypted text that needs to be decrypted. [String](../../sql-reference/data-types/string.md#string). diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 7df6ef54f2a..834fcdf8282 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -19,7 +19,7 @@ dictGet('dict_name', 'attr_name', id_expr) dictGetOrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` -**Parameters** +**Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). - `attr_name` — Name of the column of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). @@ -108,7 +108,7 @@ Checks whether a key is present in a dictionary. dictHas('dict_name', id_expr) ``` -**Parameters** +**Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). - `id_expr` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md) or [Tuple](../../sql-reference/data-types/tuple.md)-type value depending on the dictionary configuration. @@ -130,7 +130,7 @@ Creates an array, containing all the parents of a key in the [hierarchical dicti dictGetHierarchy('dict_name', key) ``` -**Parameters** +**Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). - `key` — Key value. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. @@ -149,7 +149,7 @@ Checks the ancestor of a key through the whole hierarchical chain in the diction dictIsIn('dict_name', child_id_expr, ancestor_id_expr) ``` -**Parameters** +**Arguments** - `dict_name` — Name of the dictionary. 
[String literal](../../sql-reference/syntax.md#syntax-string-literal). - `child_id_expr` — Key to be checked. [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a [UInt64](../../sql-reference/data-types/int-uint.md)-type value. @@ -185,7 +185,7 @@ dictGet[Type]('dict_name', 'attr_name', id_expr) dictGet[Type]OrDefault('dict_name', 'attr_name', id_expr, default_value_expr) ``` -**Parameters** +**Arguments** - `dict_name` — Name of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). - `attr_name` — Name of the column of the dictionary. [String literal](../../sql-reference/syntax.md#syntax-string-literal). diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index c32af7194fb..df75e96c8fb 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -13,7 +13,7 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal isNull(x) ``` -**Parameters** +**Arguments** - `x` — A value with a non-compound data type. @@ -53,7 +53,7 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal isNotNull(x) ``` -**Parameters:** +**Arguments:** - `x` — A value with a non-compound data type. @@ -93,7 +93,7 @@ Checks from left to right whether `NULL` arguments were passed and returns the f coalesce(x,...) ``` -**Parameters:** +**Arguments:** - Any number of parameters of a non-compound type. All parameters must be compatible by data type. @@ -136,7 +136,7 @@ Returns an alternative value if the main argument is `NULL`. ifNull(x,alt) ``` -**Parameters:** +**Arguments:** - `x` — The value to check for `NULL`. - `alt` — The value that the function returns if `x` is `NULL`. @@ -176,7 +176,7 @@ Returns `NULL` if the arguments are equal. nullIf(x, y) ``` -**Parameters:** +**Arguments:** `x`, `y` — Values for comparison. They must be compatible types, or ClickHouse will generate an exception. @@ -215,7 +215,7 @@ Results in a value of type [Nullable](../../sql-reference/data-types/nullable.md assumeNotNull(x) ``` -**Parameters:** +**Arguments:** - `x` — The original value. @@ -277,7 +277,7 @@ Converts the argument type to `Nullable`. toNullable(x) ``` -**Parameters:** +**Arguments:** - `x` — The value of any non-compound type. diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 6f288a7687d..c27eab0b421 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -72,7 +72,7 @@ Returns an array of [geohash](#geohash)-encoded strings of given precision that geohashesInBox(longitude_min, latitude_min, longitude_max, latitude_max, precision) ``` -**Parameters** +**Arguments** - `longitude_min` — Minimum longitude. Range: `[-180°, 180°]`. Type: [Float](../../../sql-reference/data-types/float.md). - `latitude_min` — Minimum latitude. Range: `[-90°, 90°]`. Type: [Float](../../../sql-reference/data-types/float.md). diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 4ed651e4e9e..9dda947b3a7 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -162,7 +162,7 @@ Returns [H3](#h3index) point index `(lon, lat)` with specified resolution. geoToH3(lon, lat, resolution) ``` -**Parameters** +**Arguments** - `lon` — Longitude. 
Type: [Float64](../../../sql-reference/data-types/float.md). - `lat` — Latitude. Type: [Float64](../../../sql-reference/data-types/float.md). @@ -201,7 +201,7 @@ Result: h3kRing(h3index, k) ``` -**Parameters** +**Arguments** - `h3index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `k` — Raduis. Type: [integer](../../../sql-reference/data-types/int-uint.md) @@ -315,7 +315,7 @@ Returns whether or not the provided [H3](#h3index) indexes are neighbors. h3IndexesAreNeighbors(index1, index2) ``` -**Parameters** +**Arguments** - `index1` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `index2` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). @@ -353,7 +353,7 @@ Returns an array of child indexes for the given [H3](#h3index) index. h3ToChildren(index, resolution) ``` -**Parameters** +**Arguments** - `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). @@ -390,7 +390,7 @@ Returns the parent (coarser) index containing the given [H3](#h3index) index. h3ToParent(index, resolution) ``` -**Parameters** +**Arguments** - `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md). - `resolution` — Index resolution. Range: `[0, 15]`. Type: [UInt8](../../../sql-reference/data-types/int-uint.md). diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 9394426b20b..465ad01527f 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -18,9 +18,9 @@ halfMD5(par1, ...) The function is relatively slow (5 million short strings per second per processor core). Consider using the [sipHash64](#hash_functions-siphash64) function instead. -**Parameters** +**Arguments** -The function takes a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -61,9 +61,9 @@ Function [interprets](../../sql-reference/functions/type-conversion-functions.md 3. Then the function takes the hash value, calculated at the previous step, and the third element of the initial hash array, and calculates a hash for the array of them. 4. The previous step is repeated for all the remaining elements of the initial hash array. -**Parameters** +**Arguments** -The function takes a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -97,9 +97,9 @@ cityHash64(par1,...) This is a fast non-cryptographic hash function. It uses the CityHash algorithm for string parameters and implementation-specific fast non-cryptographic hash function for parameters with other data types. The function uses the CityHash combinator to get the final results. -**Parameters** +**Arguments** -The function takes a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). 
+The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -166,9 +166,9 @@ farmHash64(par1, ...) These functions use the `Fingerprint64` and `Hash64` methods respectively from all [available methods](https://github.com/google/farmhash/blob/master/src/farmhash.h). -**Parameters** +**Arguments** -The function takes a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -226,7 +226,7 @@ Calculates [JavaHash](http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add97 javaHashUTF16LE(stringUtf16le) ``` -**Parameters** +**Arguments** - `stringUtf16le` — a string in UTF-16LE encoding. @@ -292,9 +292,9 @@ Produces a 64-bit [MetroHash](http://www.jandrewrogers.com/2015/05/27/metrohash/ metroHash64(par1, ...) ``` -**Parameters** +**Arguments** -The function takes a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +The function takes a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -327,9 +327,9 @@ murmurHash2_32(par1, ...) murmurHash2_64(par1, ...) ``` -**Parameters** +**Arguments** -Both functions take a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -358,7 +358,7 @@ Calculates a 64-bit [MurmurHash2](https://github.com/aappleby/smhasher) hash val gccMurmurHash(par1, ...); ``` -**Parameters** +**Arguments** - `par1, ...` — A variable number of parameters that can be any of the [supported data types](../../sql-reference/data-types/index.md#data_types). @@ -395,9 +395,9 @@ murmurHash3_32(par1, ...) murmurHash3_64(par1, ...) ``` -**Parameters** +**Arguments** -Both functions take a variable number of input parameters. Parameters can be any of the [supported data types](../../sql-reference/data-types/index.md). +Both functions take a variable number of input parameters. Arguments can be any of the [supported data types](../../sql-reference/data-types/index.md). **Returned Value** @@ -424,7 +424,7 @@ Produces a 128-bit [MurmurHash3](https://github.com/aappleby/smhasher) hash valu murmurHash3_128( expr ) ``` -**Parameters** +**Arguments** - `expr` — [Expressions](../../sql-reference/syntax.md#syntax-expressions) returning a [String](../../sql-reference/data-types/string.md)-type value. diff --git a/docs/en/sql-reference/functions/introspection.md b/docs/en/sql-reference/functions/introspection.md index bfa1998d68a..964265a461b 100644 --- a/docs/en/sql-reference/functions/introspection.md +++ b/docs/en/sql-reference/functions/introspection.md @@ -32,7 +32,7 @@ If you use official ClickHouse packages, you need to install the `clickhouse-com addressToLine(address_of_binary_instruction) ``` -**Parameters** +**Arguments** - `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. 
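Stepping back to the hash-function hunks: they all document the same variable-arity convention, sketched here with arbitrary argument values (outputs omitted, since they are opaque 64-bit hashes):

```sql
-- Arguments of mixed supported types may be combined in a single call.
SELECT
    cityHash64('ClickHouse')                          AS one_arg,
    cityHash64('ClickHouse', 42)                      AS two_args,
    sipHash64('ClickHouse', 42, toDate('2021-01-01')) AS three_args;
```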
@@ -123,7 +123,7 @@ Converts virtual memory address inside ClickHouse server process to the symbol f addressToSymbol(address_of_binary_instruction) ``` -**Parameters** +**Arguments** - `address_of_binary_instruction` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Address of instruction in a running process. @@ -220,7 +220,7 @@ Converts a symbol that you can get using the [addressToSymbol](#addresstosymbol) demangle(symbol) ``` -**Parameters** +**Arguments** - `symbol` ([String](../../sql-reference/data-types/string.md)) — Symbol from an object file. @@ -345,7 +345,7 @@ Emits trace log message to server log for each [Block](https://clickhouse.tech/d logTrace('message') ``` -**Parameters** +**Arguments** - `message` — Message that is emitted to server log. [String](../../sql-reference/data-types/string.md#string). diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 1361eb65a56..eaea5e250fb 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -265,32 +265,81 @@ SELECT toIPv6('127.0.0.1') └─────────────────────┘ ``` -## isIPv4String +## isIPv4String {#isipv4string} -Determines if the input string is an IPv4 address or not. Returns `1` if true `0` otherwise. +Determines whether the input string is an IPv4 address. If `string` is an IPv6 address, returns `0`. -``` sql -SELECT isIPv4String('127.0.0.1') +**Syntax** + +```sql +isIPv4String(string) ``` +**Arguments** + +- `string` — IP address. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +- `1` if `string` is an IPv4 address, `0` otherwise. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Examples** + +Query: + +```sql +SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr +``` + +Result: + ``` text -┌─isIPv4String('127.0.0.1')─┐ -│ 1 │ -└───────────────────────────┘ +┌─addr─────────────┬─isIPv4String(addr)─┐ +│ 0.0.0.0 │ 1 │ +│ 127.0.0.1 │ 1 │ +│ ::ffff:127.0.0.1 │ 0 │ +└──────────────────┴────────────────────┘ ``` -## isIPv6String +## isIPv6String {#isipv6string} -Determines if the input string is an IPv6 address or not. Returns `1` if true `0` otherwise. +Determines whether the input string is an IPv6 address. If `string` is an IPv4 address, returns `0`. + +**Syntax** + +```sql +isIPv6String(string) +``` + +**Arguments** + +- `string` — IP address. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +- `1` if `string` is an IPv6 address, `0` otherwise. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md).
+ +**Examples** + +Query: ``` sql -SELECT isIPv6String('2001:438:ffff::407d:1bc1') +SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr ``` +Result: + ``` text -┌─isIPv6String('2001:438:ffff::407d:1bc1')─┐ -│ 1 │ -└──────────────────────────────────────────┘ +┌─addr─────────────┬─isIPv6String(addr)─┐ +│ :: │ 1 │ +│ 1111::ffff │ 1 │ +│ ::ffff:127.0.0.1 │ 1 │ +│ 127.0.0.1 │ 0 │ +└──────────────────┴────────────────────┘ ``` [Original article](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 05e755eaddc..edee048eb77 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -236,7 +236,7 @@ Extracts raw data from a JSON object. JSONExtractKeysAndValuesRaw(json[, p, a, t, h]) ``` -**Parameters** +**Arguments** - `json` — [String](../../sql-reference/data-types/string.md) with valid JSON. - `p, a, t, h` — Comma-separated indices or keys that specify the path to the inner field in a nested JSON object. Each argument can be either a [string](../../sql-reference/data-types/string.md) to get the field by the key or an [integer](../../sql-reference/data-types/int-uint.md) to get the N-th field (indexed from 1, negative integers count from the end). If not set, the whole JSON is parsed as the top-level object. Optional parameter. diff --git a/docs/en/sql-reference/functions/machine-learning-functions.md b/docs/en/sql-reference/functions/machine-learning-functions.md index 8627fc26bad..f103a4ea421 100644 --- a/docs/en/sql-reference/functions/machine-learning-functions.md +++ b/docs/en/sql-reference/functions/machine-learning-functions.md @@ -27,7 +27,7 @@ Compares test groups (variants) and calculates for each group the probability to bayesAB(distribution_name, higher_is_better, variant_names, x, y) ``` -**Parameters** +**Arguments** - `distribution_name` — Name of the probability distribution. [String](../../sql-reference/data-types/string.md). Possible values: diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 8dc287593c7..f56a721c0c0 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -121,7 +121,7 @@ Accepts a numeric argument and returns a UInt64 number close to 10 to the power cosh(x) ``` -**Parameters** +**Arguments** - `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -157,7 +157,7 @@ Result: acosh(x) ``` -**Parameters** +**Arguments** - `x` — Hyperbolic cosine of angle. Values from the interval: `1 <= x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -197,7 +197,7 @@ Result: sinh(x) ``` -**Parameters** +**Arguments** - `x` — The angle, in radians. Values from the interval: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -233,7 +233,7 @@ Result: asinh(x) ``` -**Parameters** +**Arguments** - `x` — Hyperbolic sine of angle. Values from the interval: `-∞ < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -273,7 +273,7 @@ Result: atanh(x) ``` -**Parameters** +**Arguments** - `x` — Hyperbolic tangent of angle. Values from the interval: `–1 < x < 1`. 
[Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -309,7 +309,7 @@ The [function](https://en.wikipedia.org/wiki/Atan2) calculates the angle in the atan2(y, x) ``` -**Parameters** +**Arguments** - `y` — y-coordinate of the point through which the ray passes. [Float64](../../sql-reference/data-types/float.md#float32-float64). - `x` — x-coordinate of the point through which the ray passes. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -346,7 +346,7 @@ Calculates the length of the hypotenuse of a right-angle triangle. The [function hypot(x, y) ``` -**Parameters** +**Arguments** - `x` — The first cathetus of a right-angle triangle. [Float64](../../sql-reference/data-types/float.md#float32-float64). - `y` — The second cathetus of a right-angle triangle. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -383,7 +383,7 @@ Calculates `log(1+x)`. The [function](https://en.wikipedia.org/wiki/Natural_loga log1p(x) ``` -**Parameters** +**Arguments** - `x` — Values from the interval: `-1 < x < +∞`. [Float64](../../sql-reference/data-types/float.md#float32-float64). @@ -423,7 +423,7 @@ The `sign` function can extract the sign of a real number. sign(x) ``` -**Parameters** +**Arguments** - `x` — Values from `-∞` to `+∞`. Support all numeric types in ClickHouse. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 7308ed60b5c..04e921b5c55 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -19,7 +19,7 @@ Gets a named value from the [macros](../../operations/server-configuration-param getMacro(name); ``` -**Parameters** +**Arguments** - `name` — Name to retrieve from the `macros` section. [String](../../sql-reference/data-types/string.md#string). @@ -108,7 +108,7 @@ Extracts the trailing part of a string after the last slash or backslash. This f basename( expr ) ``` -**Parameters** +**Arguments** - `expr` — Expression resulting in a [String](../../sql-reference/data-types/string.md) type value. All the backslashes must be escaped in the resulting value. @@ -192,7 +192,7 @@ Returns estimation of uncompressed byte size of its arguments in memory. byteSize(argument [, ...]) ``` -**Parameters** +**Arguments** - `argument` — Value. @@ -349,7 +349,7 @@ The function is intended for development, debugging and demonstration. isConstant(x) ``` -**Parameters** +**Arguments** - `x` — Expression to check. @@ -420,7 +420,7 @@ Checks whether floating point value is finite. ifNotFinite(x,y) -**Parameters** +**Arguments** - `x` — Value to be checked for infinity. Type: [Float\*](../../sql-reference/data-types/float.md). - `y` — Fallback value. Type: [Float\*](../../sql-reference/data-types/float.md). @@ -460,7 +460,7 @@ Allows building a unicode-art diagram. `bar(x, min, max, width)` draws a band with a width proportional to `(x - min)` and equal to `width` characters when `x = max`. -Parameters: +**Arguments** - `x` — Size to display. - `min, max` — Integer constants. The value must fit in `Int64`. @@ -645,7 +645,7 @@ Accepts the time delta in seconds. Returns a time delta with (year, month, day, formatReadableTimeDelta(column[, maximum_unit]) ``` -**Parameters** +**Arguments** - `column` — A column with numeric time delta. - `maximum_unit` — Optional. Maximum unit to show. Acceptable values seconds, minutes, hours, days, months, years. 
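A few of the math hunks above (`hypot`, `log1p`, `sign`) are easiest to verify with constants:

```sql
SELECT hypot(3, 4) AS h, log1p(0) AS l, sign(-10) AS s;
-- h = 5 (hypotenuse of the 3-4-5 triangle), l = 0, s = -1
```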
@@ -730,7 +730,7 @@ The result of the function depends on the affected data blocks and the order of The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery. -**Parameters** +**Arguments** - `column` — A column name or scalar expression. - `offset` — The number of rows forwards or backwards from the current row of `column`. [Int64](../../sql-reference/data-types/int-uint.md). @@ -909,6 +909,66 @@ Same as for [runningDifference](../../sql-reference/functions/other-functions.md#other_functions-runningdifference), the difference is the value of the first row, returned the value of the first row, and each subsequent row returns the difference from the previous row. +## runningConcurrency {#runningconcurrency} + +Given a series of event beginning and ending times, this function calculates the number of concurrently running events at each data point, that is, at each beginning time. + +!!! warning "Warning" + Events spanning multiple data blocks will not be processed correctly. The function resets its state for each new data block. + +The result of the function depends on the order of data in the block. It assumes the beginning times are sorted in ascending order. + +**Syntax** + +``` sql +runningConcurrency(begin, end) +``` + +**Arguments** + +- `begin` — A column for the beginning time of events (inclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). +- `end` — A column for the ending time of events (exclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md). + +Note that the two columns `begin` and `end` must have the same type. + +**Returned values** + +- The number of concurrent events at each data point. + +Type: [UInt32](../../sql-reference/data-types/int-uint.md). + +**Example** + +Input table: + +``` text +┌───────────────begin─┬─────────────────end─┐ +│ 2020-12-01 00:00:00 │ 2020-12-01 00:59:59 │ +│ 2020-12-01 00:30:00 │ 2020-12-01 00:59:59 │ +│ 2020-12-01 00:40:00 │ 2020-12-01 01:30:30 │ +│ 2020-12-01 01:10:00 │ 2020-12-01 01:30:30 │ +│ 2020-12-01 01:50:00 │ 2020-12-01 01:59:59 │ +└─────────────────────┴─────────────────────┘ +``` + +Query: + +``` sql +SELECT runningConcurrency(begin, end) FROM example +``` + +Result: + +``` text +┌─runningConcurrency(begin, end)─┐ +│ 1 │ +│ 2 │ +│ 3 │ +│ 2 │ +│ 1 │ +└────────────────────────────────┘ +``` + ## MACNumToString(num) {#macnumtostringnum} Accepts a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form). @@ -929,7 +989,7 @@ Returns the number of fields in [Enum](../../sql-reference/data-types/enum.md). getSizeOfEnumType(value) ``` -**Parameters:** +**Arguments:** - `value` — Value of type `Enum`. @@ -958,7 +1018,7 @@ Returns size on disk (without taking into account compression). blockSerializedSize(value[, value[, ...]]) ``` -**Parameters** +**Arguments** - `value` — Any value. @@ -990,7 +1050,7 @@ Returns the name of the class that represents the data type of the column in RAM toColumnTypeName(value) ``` -**Parameters:** +**Arguments:** - `value` — Any type of value.
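The new `runningConcurrency` section above shows its input table but not how to build it. A minimal sketch that reproduces the example (the table name `example` matches the query in the hunk, and the `Memory` engine preserves insertion order, which satisfies the ascending-`begin` assumption):

```sql
CREATE TABLE example (begin DateTime, end DateTime) ENGINE = Memory;

INSERT INTO example VALUES
    ('2020-12-01 00:00:00', '2020-12-01 00:59:59'),
    ('2020-12-01 00:30:00', '2020-12-01 00:59:59'),
    ('2020-12-01 00:40:00', '2020-12-01 01:30:30'),
    ('2020-12-01 01:10:00', '2020-12-01 01:30:30'),
    ('2020-12-01 01:50:00', '2020-12-01 01:59:59');

SELECT runningConcurrency(begin, end) FROM example; -- 1, 2, 3, 2, 1
```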
@@ -1030,7 +1090,7 @@ Outputs a detailed description of data structures in RAM dumpColumnStructure(value) ``` -**Parameters:** +**Arguments:** - `value` — Any type of value. @@ -1060,7 +1120,7 @@ Does not include default values for custom columns set by the user. defaultValueOfArgumentType(expression) ``` -**Parameters:** +**Arguments:** - `expression` — Arbitrary type of value or an expression that results in a value of an arbitrary type. @@ -1102,7 +1162,7 @@ Does not include default values for custom columns set by the user. defaultValueOfTypeName(type) ``` -**Parameters:** +**Arguments:** - `type` — A string representing a type name. @@ -1144,7 +1204,7 @@ Used for internal implementation of [arrayJoin](../../sql-reference/functions/ar SELECT replicate(x, arr); ``` -**Parameters:** +**Arguments:** - `arr` — Original array. ClickHouse creates a new array of the same length as the original and fills it with the value `x`. - `x` — The value that the resulting array will be filled with. @@ -1277,7 +1337,7 @@ Takes state of aggregate function. Returns result of aggregation (or finalized s finalizeAggregation(state) ``` -**Parameters** +**Arguments** - `state` — State of aggregation. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). @@ -1381,7 +1441,7 @@ Accumulates states of an aggregate function for each row of a data block. runningAccumulate(agg_state[, grouping]); ``` -**Parameters** +**Arguments** - `agg_state` — State of the aggregate function. [AggregateFunction](../../sql-reference/data-types/aggregatefunction.md#data-type-aggregatefunction). - `grouping` — Grouping key. Optional. The state of the function is reset if the `grouping` value is changed. It can be any of the [supported data types](../../sql-reference/data-types/index.md) for which the equality operator is defined. @@ -1487,7 +1547,7 @@ Only supports tables created with the `ENGINE = Join(ANY, LEFT, )` st joinGet(join_storage_table_name, `value_column`, join_keys) ``` -**Parameters** +**Arguments** - `join_storage_table_name` — an [identifier](../../sql-reference/syntax.md#syntax-identifiers) indicates where search is performed. The identifier is searched in the default database (see parameter `default_database` in the config file). To override the default database, use the `USE db_name` or specify the database and the table through the separator `db_name.db_table`, see the example. - `value_column` — name of the column of the table that contains required data. @@ -1591,7 +1651,7 @@ Generates a string with a random set of [ASCII](https://en.wikipedia.org/wiki/AS randomPrintableASCII(length) ``` -**Parameters** +**Arguments** - `length` — Resulting string length. Positive integer. @@ -1627,7 +1687,7 @@ Generates a binary string of the specified length filled with random bytes (incl randomString(length) ``` -**Parameters** +**Arguments** - `length` — String length. Positive integer. @@ -1675,7 +1735,7 @@ Generates a binary string of the specified length filled with random bytes (incl randomFixedString(length); ``` -**Parameters** +**Arguments** - `length` — String length in bytes. [UInt64](../../sql-reference/data-types/int-uint.md). @@ -1713,7 +1773,7 @@ Generates a random string of a specified length. Result string contains valid UT randomStringUTF8(length); ``` -**Parameters** +**Arguments** - `length` — Required length of the resulting string in code points. [UInt64](../../sql-reference/data-types/int-uint.md). 
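The `defaultValueOfTypeName` hunk above is simple to demonstrate interactively:

```sql
SELECT
    defaultValueOfTypeName('Int8')           AS plain,    -- 0
    defaultValueOfTypeName('Nullable(Int8)') AS nullable; -- NULL
```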
@@ -1785,7 +1845,7 @@ Checks whether the [Decimal](../../sql-reference/data-types/decimal.md) value is isDecimalOverflow(d, [p]) ``` -**Parameters** +**Arguments** - `d` — value. [Decimal](../../sql-reference/data-types/decimal.md). - `p` — precision. Optional. If omitted, the initial precision of the first argument is used. Using of this paratemer could be helpful for data extraction to another DBMS or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges). @@ -1822,7 +1882,7 @@ Returns number of decimal digits you need to represent the value. countDigits(x) ``` -**Parameters** +**Arguments** - `x` — [Int](../../sql-reference/data-types/int-uint.md) or [Decimal](../../sql-reference/data-types/decimal.md) value. @@ -1881,7 +1941,7 @@ Returns [native interface](../../interfaces/tcp.md) TCP port number listened by tcpPort() ``` -**Parameters** +**Arguments** - None. diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index 68998928398..2b9846344e4 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -32,7 +32,7 @@ Produces a constant column with a random value. randConstant([x]) ``` -**Parameters** +**Arguments** - `x` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in any of the [supported data types](../../sql-reference/data-types/index.md#data_types). The resulting value is discarded, but the expression itself if used for bypassing [common subexpression elimination](../../sql-reference/functions/index.md#common-subexpression-elimination) if the function is called multiple times in one query. Optional parameter. @@ -81,7 +81,7 @@ fuzzBits([s], [prob]) Inverts bits of `s`, each with probability `prob`. -**Parameters** +**Arguments** - `s` - `String` or `FixedString` - `prob` - constant `Float32/64` diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 922cf7374d7..83db1975366 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -35,7 +35,7 @@ The function returns the nearest number of the specified order. In case when giv round(expression [, decimal_places]) ``` -**Parameters:** +**Arguments:** - `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../../sql-reference/data-types/index.md#data_types). - `decimal-places` — An integer value. @@ -114,7 +114,7 @@ For example, sum numbers 1.5, 2.5, 3.5, 4.5 with different rounding: roundBankers(expression [, decimal_places]) ``` -**Parameters** +**Arguments** - `expression` — A number to be rounded. Can be any [expression](../../sql-reference/syntax.md#syntax-expressions) returning the numeric [data type](../../sql-reference/data-types/index.md#data_types). - `decimal-places` — Decimal places. An integer number. diff --git a/docs/en/sql-reference/functions/splitting-merging-functions.md b/docs/en/sql-reference/functions/splitting-merging-functions.md index 25f41211b47..c70ee20f076 100644 --- a/docs/en/sql-reference/functions/splitting-merging-functions.md +++ b/docs/en/sql-reference/functions/splitting-merging-functions.md @@ -16,7 +16,7 @@ Returns an array of selected substrings. 
Empty substrings may be selected if the splitByChar(, ) ``` -**Parameters** +**Arguments** - `separator` — The separator which should contain exactly one character. [String](../../sql-reference/data-types/string.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). @@ -53,7 +53,7 @@ Splits a string into substrings separated by a string. It uses a constant string splitByString(, ) ``` -**Parameters** +**Arguments** - `separator` — The separator. [String](../../sql-reference/data-types/string.md). - `s` — The string to split. [String](../../sql-reference/data-types/string.md). @@ -121,7 +121,7 @@ Extracts all groups from non-overlapping substrings matched by a regular express extractAllGroups(text, regexp) ``` -**Parameters** +**Arguments** - `text` — [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). - `regexp` — Regular expression. Constant. [String](../data-types/string.md) or [FixedString](../data-types/fixedstring.md). diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 2b93dd924a3..dc5304b39aa 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -76,7 +76,7 @@ Replaces invalid UTF-8 characters by the `�` (U+FFFD) character. All running i toValidUTF8( input_string ) ``` -Parameters: +**Arguments** - input_string — Any set of bytes represented as the [String](../../sql-reference/data-types/string.md) data type object. @@ -104,7 +104,7 @@ Repeats a string as many times as specified and concatenates the replicated valu repeat(s, n) ``` -**Parameters** +**Arguments** - `s` — The string to repeat. [String](../../sql-reference/data-types/string.md). - `n` — The number of times to repeat the string. [UInt](../../sql-reference/data-types/int-uint.md). @@ -173,7 +173,7 @@ Concatenates the strings listed in the arguments, without a separator. concat(s1, s2, ...) ``` -**Parameters** +**Arguments** Values of type String or FixedString. @@ -211,7 +211,7 @@ The function is named “injective” if it always returns different result for concatAssumeInjective(s1, s2, ...) ``` -**Parameters** +**Arguments** Values of type String or FixedString. @@ -328,7 +328,7 @@ By default removes all consecutive occurrences of common whitespace (ASCII chara trim([[LEADING|TRAILING|BOTH] trim_character FROM] input_string) ``` -**Parameters** +**Arguments** - `trim_character` — specified characters for trim. [String](../../sql-reference/data-types/string.md). - `input_string` — string for trim. [String](../../sql-reference/data-types/string.md). @@ -367,7 +367,7 @@ trimLeft(input_string) Alias: `ltrim(input_string)`. -**Parameters** +**Arguments** - `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). @@ -405,7 +405,7 @@ trimRight(input_string) Alias: `rtrim(input_string)`. -**Parameters** +**Arguments** - `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). @@ -443,7 +443,7 @@ trimBoth(input_string) Alias: `trim(input_string)`. -**Parameters** +**Arguments** - `input_string` — string to trim. [String](../../sql-reference/data-types/string.md). @@ -496,7 +496,7 @@ Replaces literals, sequences of literals and complex aliases with placeholders. normalizeQuery(x) ``` -**Parameters** +**Arguments** - `x` — Sequence of characters. [String](../../sql-reference/data-types/string.md). 
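A short sketch of `normalizeQuery`, following the documented behavior of replacing literals with placeholders:

``` sql
SELECT normalizeQuery('[1, 2, 3, x]') AS query;
-- expected result: '[?.., x]'
-- the numeric literals collapse into a placeholder, while the identifier x is kept
```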
@@ -532,7 +532,7 @@ Returns identical 64bit hash values without the values of literals for similar q normalizedQueryHash(x) ``` -**Parameters** +**Arguments** - `x` — Sequence of characters. [String](../../sql-reference/data-types/string.md). @@ -570,7 +570,7 @@ The following five XML predefined entities will be replaced: `<`, `&`, `>`, `"`, encodeXMLComponent(x) ``` -**Parameters** +**Arguments** - `x` — The sequence of characters. [String](../../sql-reference/data-types/string.md). diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 92591c89a37..83b0edea438 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -24,7 +24,7 @@ position(haystack, needle[, start_pos]) Alias: `locate(haystack, needle[, start_pos])`. -**Parameters** +**Arguments** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -95,7 +95,7 @@ Works under the assumption that the string contains a set of bytes representing positionCaseInsensitive(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -138,7 +138,7 @@ For a case-insensitive search, use the function [positionCaseInsensitiveUTF8](#p positionUTF8(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -211,7 +211,7 @@ Works under the assumption that the string contains a set of bytes representing positionCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -256,7 +256,7 @@ The search is performed on sequences of bytes without respect to string encoding multiSearchAllPositions(haystack, [needle1, needle2, ..., needlen]) ``` -**Parameters** +**Arguments** - `haystack` — string, in which substring will to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — substring to be searched. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -371,7 +371,7 @@ Matches all groups of the `haystack` string using the `pattern` regular expressi extractAllGroupsHorizontal(haystack, pattern) ``` -**Parameters** +**Arguments** - `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md). - `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md). 
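A sketch of `extractAllGroupsHorizontal` based on the documented example:

``` sql
SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)');
-- expected: [['abc','def','ghi'],['111','222','333']]
-- the first array collects all matches of group 1, the second all matches of group 2
```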
@@ -412,7 +412,7 @@ Matches all groups of the `haystack` string using the `pattern` regular expressi extractAllGroupsVertical(haystack, pattern) ``` -**Parameters** +**Arguments** - `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md). - `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md). @@ -471,7 +471,7 @@ Case insensitive variant of [like](https://clickhouse.tech/docs/en/sql-reference ilike(haystack, pattern) ``` -**Parameters** +**Arguments** - `haystack` — Input string. [String](../../sql-reference/syntax.md#syntax-string-literal). - `pattern` — If `pattern` doesn't contain percent signs or underscores, then the `pattern` only represents the string itself. An underscore (`_`) in `pattern` stands for (matches) any single character. A percent sign (`%`) matches any sequence of zero or more characters. @@ -548,7 +548,7 @@ For a case-insensitive search, use [countSubstringsCaseInsensitive](../../sql-re countSubstrings(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -614,7 +614,7 @@ Returns the number of substring occurrences case-insensitive. countSubstringsCaseInsensitive(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -680,7 +680,7 @@ Returns the number of substring occurrences in `UTF-8` case-insensitive. SELECT countSubstringsCaseInsensitiveUTF8(haystack, needle[, start_pos]) ``` -**Parameters** +**Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `needle` — The substring to search for. [String](../../sql-reference/syntax.md#syntax-string-literal). @@ -732,7 +732,7 @@ Returns the number of regular expression matches for a `pattern` in a `haystack` countMatches(haystack, pattern) ``` -**Parameters** +**Arguments** - `haystack` — The string to search in. [String](../../sql-reference/syntax.md#syntax-string-literal). - `pattern` — The regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). [String](../../sql-reference/data-types/string.md). diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md index dcbcd3e374b..1006b68b8ee 100644 --- a/docs/en/sql-reference/functions/tuple-functions.md +++ b/docs/en/sql-reference/functions/tuple-functions.md @@ -45,7 +45,7 @@ untuple(x) You can use the `EXCEPT` expression to skip columns as a result of the query. -**Parameters** +**Arguments** - `x` - A `tuple` function, column, or tuple of elements. [Tuple](../../sql-reference/data-types/tuple.md). 
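A minimal sketch of `untuple` on a constant tuple (a hypothetical stand-in for a real tuple column):

``` sql
SELECT untuple(tuple(1, 'hello', [1, 2, 3]));
-- expands the tuple into three separate result columns instead of one Tuple column
```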
diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index a46c36395b8..2b3a9d9103f 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -5,6 +5,68 @@ toc_title: Working with maps # Functions for maps {#functions-for-working-with-tuple-maps} +## map {#function-map} + +Arranges `key:value` pairs into [Map(key, value)](../../sql-reference/data-types/map.md) data type. + +**Syntax** + +``` sql +map(key1, value1[, key2, value2, ...]) +``` + +**Arguments** + +- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md). +- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md). + +**Returned value** + +- Data structure as `key:value` pairs. + +Type: [Map(key, value)](../../sql-reference/data-types/map.md). + +**Examples** + +Query: + +``` sql +SELECT map('key1', number, 'key2', number * 2) FROM numbers(3); +``` + +Result: + +``` text +┌─map('key1', number, 'key2', multiply(number, 2))─┐ +│ {'key1':0,'key2':0} │ +│ {'key1':1,'key2':2} │ +│ {'key1':2,'key2':4} │ +└──────────────────────────────────────────────────┘ +``` + +Query: + +``` sql +CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a; +INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3); +SELECT a['key2'] FROM table_map; +``` + +Result: + +``` text +┌─arrayElement(a, 'key2')─┐ +│ 0 │ +│ 2 │ +│ 4 │ +└─────────────────────────┘ +``` + +**See Also** + +- [Map(key, value)](../../sql-reference/data-types/map.md) data type + + ## mapAdd {#function-mapadd} Collect all the keys and sum corresponding values. @@ -15,7 +77,7 @@ Collect all the keys and sum corresponding values. mapAdd(Tuple(Array, Array), Tuple(Array, Array) [, ...]) ``` -**Parameters** +**Arguments** Arguments are [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promote to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array. @@ -49,7 +111,7 @@ Collect all the keys and subtract corresponding values. mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...]) ``` -**Parameters** +**Arguments** Arguments are [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where items in the first array represent keys, and the second array contains values for the each key. All key arrays should have same type, and all value arrays should contain items which are promote to the one type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)). The common promoted type is used as a type for the result array. 
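To complement the two sections above, a sketch of `mapAdd` following the documented example (`mapSubtract` is analogous, subtracting values instead of summing them):

``` sql
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) AS res, toTypeName(res) AS type;
-- expected: ([1,2],[2,2]) with type Tuple(Array(UInt8), Array(UInt64))
-- values belonging to equal keys are summed; the value type is promoted (here to UInt64)
```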
@@ -87,7 +149,7 @@ Generates a map, where keys are a series of numbers, from minimum to maximum key The number of elements in `keys` and `values` must be the same for each row. -**Parameters** +**Arguments** - `keys` — Array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)). - `values` — Array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)). @@ -112,4 +174,4 @@ Result: └──────────────────────────────┴───────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) +[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-map-functions/) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..450945a5ab9 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -22,7 +22,7 @@ Converts an input value to the [Int](../../sql-reference/data-types/int-uint.md) - `toInt128(expr)` — Results in the `Int128` data type. - `toInt256(expr)` — Results in the `Int256` data type. -**Parameters** +**Arguments** - `expr` — [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. @@ -88,7 +88,7 @@ Converts an input value to the [UInt](../../sql-reference/data-types/int-uint.md - `toUInt64(expr)` — Results in the `UInt64` data type. - `toUInt256(expr)` — Results in the `UInt256` data type. -**Parameters** +**Arguments** - `expr` — [Expression](../../sql-reference/syntax.md#syntax-expressions) returning a number or a string with the decimal representation of a number. Binary, octal, and hexadecimal representations of numbers are not supported. Leading zeroes are stripped. @@ -154,7 +154,7 @@ Converts an input string to a [Nullable(Decimal(P,S))](../../sql-reference/data- These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `NULL` value instead of an exception in the event of an input value parsing error. -**Parameters** +**Arguments** - `expr` — [Expression](../../sql-reference/syntax.md#syntax-expressions), returns a value in the [String](../../sql-reference/data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. - `S` — Scale, the number of decimal places in the resulting value. @@ -199,7 +199,7 @@ Converts an input value to the [Decimal(P,S)](../../sql-reference/data-types/dec These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `0` value instead of an exception in the event of an input value parsing error. -**Parameters** +**Arguments** - `expr` — [Expression](../../sql-reference/syntax.md#syntax-expressions), returns a value in the [String](../../sql-reference/data-types/string.md) data type. ClickHouse expects the textual representation of the decimal number. For example, `'1.111'`. - `S` — Scale, the number of decimal places in the resulting value. @@ -467,7 +467,7 @@ toIntervalQuarter(number) toIntervalYear(number) ``` -**Parameters** +**Arguments** - `number` — Duration of interval. Positive integer number. 
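A sketch showing that `toIntervalWeek(1)` and the `INTERVAL 1 WEEK` literal behave identically, per the documented example:

``` sql
WITH
    toDate('2019-01-01') AS date,
    INTERVAL 1 WEEK AS interval_week,
    toIntervalWeek(1) AS interval_to_week
SELECT
    date + interval_week,
    date + interval_to_week;
-- both columns should contain 2019-01-08
```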
@@ -505,7 +505,7 @@ The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 112 parseDateTimeBestEffort(time_string [, time_zone]); ``` -**Parameters** +**Arguments** - `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). - `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). @@ -617,7 +617,7 @@ This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebestef parseDateTimeBestEffortUS(time_string [, time_zone]); ``` -**Parameters** +**Arguments** - `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). - `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). @@ -701,7 +701,7 @@ To convert data from the `LowCardinality` data type use the [CAST](#type_convers toLowCardinality(expr) ``` -**Parameters** +**Arguments** - `expr` — [Expression](../../sql-reference/syntax.md#syntax-expressions) resulting in one of the [supported data types](../../sql-reference/data-types/index.md#data_types). @@ -741,7 +741,7 @@ Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Inpu toUnixTimestamp64Milli(value) ``` -**Parameters** +**Arguments** - `value` — DateTime64 value with any precision. @@ -793,7 +793,7 @@ Converts an `Int64` to a `DateTime64` value with fixed sub-second precision and fromUnixTimestamp64Milli(value [, ti]) ``` -**Parameters** +**Arguments** - `value` — `Int64` value with any precision. - `timezone` — `String` (optional) timezone name of the result. @@ -825,7 +825,7 @@ Converts arbitrary expressions into a string via given format. formatRow(format, x, y, ...) ``` -**Parameters** +**Arguments** - `format` — Text format. For example, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). - `x`,`y`, ... — Expressions. @@ -866,7 +866,7 @@ Converts arbitrary expressions into a string via given format. The function trim formatRowNoNewline(format, x, y, ...) ``` -**Parameters** +**Arguments** - `format` — Text format. For example, [CSV](../../interfaces/formats.md#csv), [TSV](../../interfaces/formats.md#tabseparated). - `x`,`y`, ... — Expressions. diff --git a/docs/en/sql-reference/functions/url-functions.md b/docs/en/sql-reference/functions/url-functions.md index 006542f494a..9e79ef2d0cb 100644 --- a/docs/en/sql-reference/functions/url-functions.md +++ b/docs/en/sql-reference/functions/url-functions.md @@ -25,7 +25,7 @@ Extracts the hostname from a URL. domain(url) ``` -**Parameters** +**Arguments** - `url` — URL. Type: [String](../../sql-reference/data-types/string.md). @@ -76,7 +76,7 @@ Extracts the the top-level domain from a URL. topLevelDomain(url) ``` -**Parameters** +**Arguments** - `url` — URL. Type: [String](../../sql-reference/data-types/string.md). @@ -133,10 +133,9 @@ For example: ### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom} -Same as `cutToFirstSignificantSubdomain` but accept custom TLD list name, useful if: +Returns the part of the domain that includes top-level subdomains up to the first significant subdomain. Accepts custom [TLD list](https://en.wikipedia.org/wiki/List_of_Internet_top-level_domains) name. -- you need fresh TLD list, -- or you have custom. +Can be useful if you need fresh TLD list or you have custom. 
Configuration example:

```xml
<top_level_domains_lists>
    <public_suffix_list>public_suffix_list.dat</public_suffix_list>
</top_level_domains_lists>
```

@@ -149,21 +148,150 @@ Configuration example:

-Example:
+**Syntax**
 
-- `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/', 'public_suffix_list') = 'yandex.com.tr'`.
+``` sql
+cutToFirstSignificantSubdomainCustom(URL, TLD)
+```
+
+**Arguments**
+
+- `URL` — URL. [String](../../sql-reference/data-types/string.md).
+- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- Part of the domain that includes top-level subdomains up to the first significant subdomain.
+
+Type: [String](../../sql-reference/data-types/string.md).
+
+**Example**
+
+Query:
+
+```sql
+SELECT cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
+```
+
+Result:
+
+```text
+┌─cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
+│ foo.there-is-no-such-domain                                                                   │
+└───────────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**See Also**
+
+- [firstSignificantSubdomain](#firstsignificantsubdomain).
 
 ### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
 
-Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
+Returns the part of the domain that includes top-level subdomains up to the first significant subdomain, without stripping `www`. Accepts a custom TLD list name.
+
+Can be useful if you need a fresh TLD list or have a custom one.
+
+Configuration example:
+
+```xml
+<top_level_domains_lists>
+    <public_suffix_list>public_suffix_list.dat</public_suffix_list>
+</top_level_domains_lists>
+```
+
+**Syntax**
+
+```sql
+cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD)
+```
+
+**Arguments**
+
+- `URL` — URL. [String](../../sql-reference/data-types/string.md).
+- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- Part of the domain that includes top-level subdomains up to the first significant subdomain, without stripping `www`.
+
+Type: [String](../../sql-reference/data-types/string.md).
+
+**Example**
+
+Query:
+
+```sql
+SELECT cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list');
+```
+
+Result:
+
+```text
+┌─cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list')─┐
+│ www.foo                                                                      │
+└──────────────────────────────────────────────────────────────────────────────┘
+```
+
+**See Also**
+
+- [firstSignificantSubdomain](#firstsignificantsubdomain).
 
 ### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
 
-Same as `firstSignificantSubdomain` but accept custom TLD list name.
+Returns the first significant subdomain. Accepts a custom TLD list name.
 
-### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
+Can be useful if you need a fresh TLD list or have a custom one.
 
-Same as `cutToFirstSignificantSubdomainWithWWW` but accept custom TLD list name.
+Configuration example:
+
+```xml
+<top_level_domains_lists>
+    <public_suffix_list>public_suffix_list.dat</public_suffix_list>
+</top_level_domains_lists>
+```
+
+**Syntax**
+
+```sql
+firstSignificantSubdomainCustom(URL, TLD)
+```
+
+**Arguments**
+
+- `URL` — URL. [String](../../sql-reference/data-types/string.md).
+- `TLD` — Custom TLD list name. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- First significant subdomain.
+
+Type: [String](../../sql-reference/data-types/string.md).
+ +**Example** + +Query: + +```sql +SELECT firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); +``` + +Result: + +```text +┌─firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐ +│ foo │ +└──────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +**See Also** + +- [firstSignificantSubdomain](#firstsignificantsubdomain). ### port(URL\[, default_port = 0\]) {#port} @@ -242,7 +370,7 @@ Extracts network locality (`username:password@host:port`) from a URL. netloc(URL) ``` -**Parameters** +**Arguments** - `url` — URL. [String](../../sql-reference/data-types/string.md). diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index f70532252c7..56530b5e83b 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -115,7 +115,7 @@ Finds the highest continent in the hierarchy for the region. regionToTopContinent(id[, geobase]); ``` -**Parameters** +**Arguments** - `id` — Region ID from the Yandex geobase. [UInt32](../../sql-reference/data-types/int-uint.md). - `geobase` — Dictionary key. See [Multiple Geobases](#multiple-geobases). [String](../../sql-reference/data-types/string.md). Optional. diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 0ea4d4b3dc5..0fa2c492bee 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -20,6 +20,7 @@ The following actions are supported: - [ADD COLUMN](#alter_add-column) — Adds a new column to the table. - [DROP COLUMN](#alter_drop-column) — Deletes the column. +- [RENAME COLUMN](#alter_rename-column) — Renames the column. - [CLEAR COLUMN](#alter_clear-column) — Resets column values. - [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column. - [MODIFY COLUMN](#alter_modify-column) — Changes column’s type, default expression and TTL. @@ -78,6 +79,22 @@ Example: ALTER TABLE visits DROP COLUMN browser ``` +## RENAME COLUMN {#alter_rename-column} + +``` sql +RENAME COLUMN [IF EXISTS] name to new_name +``` + +Renames the column `name` to `new_name`. If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. Since renaming does not involve the underlying data, the query is completed almost instantly. + +**NOTE**: Columns specified in the key expression of the table (either with `ORDER BY` or `PRIMARY KEY`) cannot be renamed. Trying to change these columns will produce `SQL Error [524]`. + +Example: + +``` sql +ALTER TABLE visits RENAME COLUMN webBrowser TO browser +``` + ## CLEAR COLUMN {#alter_clear-column} ``` sql diff --git a/docs/en/sql-reference/table-functions/generate.md b/docs/en/sql-reference/table-functions/generate.md index 5bbd22dfe4e..be6ba2b8bc4 100644 --- a/docs/en/sql-reference/table-functions/generate.md +++ b/docs/en/sql-reference/table-functions/generate.md @@ -13,7 +13,7 @@ Supports all data types that can be stored in table except `LowCardinality` and generateRandom('name TypeName[, name TypeName]...', [, 'random_seed'[, 'max_string_length'[, 'max_array_length']]]); ``` -**Parameters** +**Arguments** - `name` — Name of corresponding column. - `TypeName` — Type of corresponding column. 
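A usage sketch for `generateRandom` based on the documented example (the data is random; only the shapes and types are stable):

``` sql
SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)', 1, 10, 2) LIMIT 3;
-- random_seed = 1, max_string_length = 10, max_array_length = 2;
-- returns three rows of random values matching the declared column types
```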
diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md
index eec4a1d0c46..14cd4369285 100644
--- a/docs/en/sql-reference/table-functions/mysql.md
+++ b/docs/en/sql-reference/table-functions/mysql.md
@@ -13,7 +13,7 @@ Allows `SELECT` and `INSERT` queries to be performed on data that is stored on a
 mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause'])
 ```
 
-**Parameters**
+**Arguments**
 
 - `host:port` — MySQL server address.
 
diff --git a/docs/en/sql-reference/table-functions/view.md b/docs/en/sql-reference/table-functions/view.md
index 9997971af65..08096c2b019 100644
--- a/docs/en/sql-reference/table-functions/view.md
+++ b/docs/en/sql-reference/table-functions/view.md
@@ -13,7 +13,7 @@ Turns a subquery into a table. The function implements views (see [CREATE VIEW](
 view(subquery)
 ```
 
-**Parameters**
+**Arguments**
 
 - `subquery` — `SELECT` query.
 
diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md
index 5a6f13226a5..cbf03a44d46 100644
--- a/docs/en/sql-reference/window-functions/index.md
+++ b/docs/en/sql-reference/window-functions/index.md
@@ -10,33 +10,51 @@ This is an experimental feature that is currently in development and is not ready
 for general use. It will change in unpredictable backwards-incompatible ways in
 future releases. Set `allow_experimental_window_functions = 1` to enable it.
 
-ClickHouse currently supports calculation of aggregate functions over a window.
-Pure window functions such as `rank`, `lag`, `lead` and so on are not yet supported.
+ClickHouse supports the standard grammar for defining windows and window functions. The following features are currently supported:
 
-The window can be specified either with an `OVER` clause or with a separate
-`WINDOW` clause.
-
-Only two variants of frame are supported, `ROWS` and `RANGE`. Offsets for the `RANGE` frame are not yet supported.
+| Feature | Support or workaround |
+|---------|-----------------------|
+| ad hoc window specification (`count(*) over (partition by id order by time desc)`) | supported |
+| expressions involving window functions, e.g. `(count(*) over ()) / 2` | not supported, wrap in a subquery ([feature request](https://github.com/ClickHouse/ClickHouse/issues/19857)) |
+| `WINDOW` clause (`select ... from table window w as (partition by id)`) | supported |
+| `ROWS` frame | supported |
+| `RANGE` frame | supported, the default |
+| `INTERVAL` syntax for `DateTime` `RANGE OFFSET` frame | not supported, specify the number of seconds instead |
+| `GROUPS` frame | not supported |
+| calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
+| `rank()`, `dense_rank()`, `row_number()` | supported |
+| `lag/lead(value, offset)` | not supported, replace with `any(value) over (... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead` (see the sketch below) |
 
 ## References
 
 ### GitHub Issues
+
 The roadmap for the initial support of window functions is [in this issue](https://github.com/ClickHouse/ClickHouse/issues/18097).
 
 All GitHub issues related to window functions have the [comp-window-functions](https://github.com/ClickHouse/ClickHouse/labels/comp-window-functions) tag.
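To complement the grammar table above, a minimal sketch of the suggested `lag(value, 1)` workaround; the table name `t` and its `time`/`value` columns are hypothetical:

``` sql
SELECT
    time,
    value,
    -- emulate lag(value, 1): any() over a frame pinned to exactly one preceding row
    any(value) OVER (ORDER BY time ASC ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS value_lag_1
FROM t
SETTINGS allow_experimental_window_functions = 1;
```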
### Tests + These tests contain the examples of the currently supported grammar: + https://github.com/ClickHouse/ClickHouse/blob/master/tests/performance/window_functions.xml + https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/01591_window_functions.sql ### Postgres Docs + https://www.postgresql.org/docs/current/sql-select.html#SQL-WINDOW + https://www.postgresql.org/docs/devel/sql-expressions.html#SYNTAX-WINDOW-FUNCTIONS + https://www.postgresql.org/docs/devel/functions-window.html + https://www.postgresql.org/docs/devel/tutorial-window.html ### MySQL Docs + https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html + https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html + https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html diff --git a/docs/es/operations/backup.md b/docs/es/operations/backup.md index a6297070663..be33851574a 100644 --- a/docs/es/operations/backup.md +++ b/docs/es/operations/backup.md @@ -5,7 +5,7 @@ toc_title: Copia de seguridad de datos # Copia de seguridad de datos {#data-backup} -Mientras que la [replicación](../engines/table-engines/mergetree-family/replication.md) proporciona protección contra fallos de hardware, no protege de errores humanos: el borrado accidental de datos, elminar la tabla equivocada o una tabla en el clúster equivocado, y bugs de software que dan como resultado un procesado incorrecto de los datos o la corrupción de los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse dispone de salvaguardas para prevenir algunos tipos de errores — por ejemplo, por defecto [no se puede simplemente eliminar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). Sin embargo, estas salvaguardas no cubren todos los casos posibles y pueden eludirse. +Mientras que la [replicación](../engines/table-engines/mergetree-family/replication.md) proporciona protección contra fallos de hardware, no protege de errores humanos: el borrado accidental de datos, elminar la tabla equivocada o una tabla en el clúster equivocado, y bugs de software que dan como resultado un procesado incorrecto de los datos o la corrupción de los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse dispone de salvaguardas para prevenir algunos tipos de errores — por ejemplo, por defecto [no se puede simplemente eliminar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](server-configuration-parameters/settings.md#max-table-size-to-drop). Sin embargo, estas salvaguardas no cubren todos los casos posibles y pueden eludirse. Para mitigar eficazmente los posibles errores humanos, debe preparar cuidadosamente una estrategia para realizar copias de seguridad y restaurar sus datos **previamente**. diff --git a/docs/fr/operations/backup.md b/docs/fr/operations/backup.md index 9a463372947..953a96a04eb 100644 --- a/docs/fr/operations/backup.md +++ b/docs/fr/operations/backup.md @@ -7,7 +7,7 @@ toc_title: "La Sauvegarde Des Donn\xE9es" # La Sauvegarde Des Données {#data-backup} -Alors [réplication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. 
In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [vous ne pouvez pas simplement supprimer des tables avec un moteur de type MergeTree contenant plus de 50 Go de données](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). Toutefois, ces garanties ne couvrent pas tous les cas possibles et peuvent être contournés. +Alors [réplication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [vous ne pouvez pas simplement supprimer des tables avec un moteur de type MergeTree contenant plus de 50 Go de données](server-configuration-parameters/settings.md#max-table-size-to-drop). Toutefois, ces garanties ne couvrent pas tous les cas possibles et peuvent être contournés. Afin d'atténuer efficacement les erreurs humaines possibles, vous devez préparer soigneusement une stratégie de sauvegarde et de restauration de vos données **préalablement**. diff --git a/docs/ja/operations/backup.md b/docs/ja/operations/backup.md index 994271371a4..b0cde00e23c 100644 --- a/docs/ja/operations/backup.md +++ b/docs/ja/operations/backup.md @@ -7,7 +7,7 @@ toc_title: "\u30C7\u30FC\u30BF\u30D0\u30C3\u30AF\u30A2" # データバックア {#data-backup} -ながら [複製](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [50Gbを超えるデータを含むMergeTreeのようなエンジンでは、テーブルを削除することはできません](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). しかし、これらの保障措置がカバーしないすべてのケースで回避. +ながら [複製](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [50Gbを超えるデータを含むMergeTreeのようなエンジンでは、テーブルを削除することはできません](server-configuration-parameters/settings.md#max-table-size-to-drop). しかし、これらの保障措置がカバーしないすべてのケースで回避. ヒューマンエラーを効果的に軽減するには、データのバックアップと復元のための戦略を慎重に準備する必要があります **事前に**. 
diff --git a/docs/ru/engines/table-engines/integrations/rabbitmq.md b/docs/ru/engines/table-engines/integrations/rabbitmq.md index dedb5842d68..2a44e085ede 100644 --- a/docs/ru/engines/table-engines/integrations/rabbitmq.md +++ b/docs/ru/engines/table-engines/integrations/rabbitmq.md @@ -52,10 +52,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` -Требуемая конфигурация: Конфигурация сервера RabbitMQ добавляется с помощью конфигурационного файла ClickHouse. +Требуемая конфигурация: + ``` xml root @@ -63,6 +64,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ``` +Дополнительная конфигурация: + +``` xml + + clickhouse + +``` + Example: ``` sql diff --git a/docs/ru/operations/backup.md b/docs/ru/operations/backup.md index 0dcb6fd307d..165b54d9b62 100644 --- a/docs/ru/operations/backup.md +++ b/docs/ru/operations/backup.md @@ -5,7 +5,7 @@ toc_title: "\u0420\u0435\u0437\u0435\u0440\u0432\u043d\u043e\u0435\u0020\u043a\u # Резервное копирование данных {#rezervnoe-kopirovanie-dannykh} -[Репликация](../engines/table-engines/mergetree-family/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы \*MergeTree, содержащие более 50 Гб данных, одной командой](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. +[Репликация](../engines/table-engines/mergetree-family/replication.md) обеспечивает защиту от аппаратных сбоев, но не защищает от человеческих ошибок: случайного удаления данных, удаления не той таблицы, которую надо было, или таблицы на не том кластере, а также программных ошибок, которые приводят к неправильной обработке данных или их повреждению. Во многих случаях подобные ошибки влияют на все реплики. ClickHouse имеет встроенные средства защиты для предотвращения некоторых типов ошибок — например, по умолчанию [не получится удалить таблицы \*MergeTree, содержащие более 50 Гб данных, одной командой](server-configuration-parameters/settings.md#max-table-size-to-drop). Однако эти средства защиты не охватывают все возможные случаи и могут быть обойдены. Для того чтобы эффективно уменьшить возможные человеческие ошибки, следует тщательно подготовить стратегию резервного копирования и восстановления данных **заранее**. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 1352fe850df..7322b6c9184 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -283,12 +283,10 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), ( ## input_format_tsv_empty_as_default {#settings-input-format-tsv-empty-as-default} -Если эта настройка включена, замените пустые поля ввода в TSV значениями по умолчанию. Для сложных выражений по умолчанию также должна быть включена настройка `input_format_defaults_for_omitted_fields`. +Если эта настройка включена, все пустые поля во входящем TSV заменяются значениями по умолчанию. 
Для сложных выражений по умолчанию также должна быть включена настройка `input_format_defaults_for_omitted_fields`. По умолчанию отключена. -Disabled by default. - ## input_format_tsv_enum_as_number {#settings-input_format_tsv_enum_as_number} Включает или отключает парсинг значений перечислений как идентификаторов перечислений для входного формата TSV. @@ -708,7 +706,7 @@ ClickHouse использует этот параметр при чтении д Установка логирования запроса. -Запросы, переданные в ClickHouse с этой установкой, логируются согласно правилам конфигурационного параметра сервера [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log). +Запросы, переданные в ClickHouse с этой настройкой, логируются согласно правилам конфигурационного параметра сервера [query_log](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-query-log). Пример: @@ -1521,7 +1519,7 @@ ClickHouse генерирует исключение - Тип: секунды - Значение по умолчанию: 60 секунд -Управляет скоростью обнуления ошибок в распределенных таблицах. Если реплика недоступна в течение некоторого времени, накапливает 5 ошибок, а distributed_replica_error_half_life установлена на 1 секунду, то реплика считается нормальной через 3 секунды после последней ошибки. +Управляет скоростью обнуления счетчика ошибок в распределенных таблицах. Предположим, реплика остается недоступна в течение какого-то времени, и за этот период накопилось 5 ошибок. Если настройка `distributed_replica_error_half_life` установлена в значение 1 секунда, то реплика снова будет считаться доступной через 3 секунды после последней ошибки. См. также: @@ -1673,7 +1671,7 @@ ClickHouse генерирует исключение - Тип: bool - Значение по умолчанию: True -Обеспечивает параллельный анализ форматов данных с сохранением порядка. Поддерживается только для форматов TSV, TKSV, CSV и JSONEachRow. +Включает режим, при котором входящие данные парсятся параллельно, но с сохранением исходного порядка следования. Поддерживается только для форматов TSV, TKSV, CSV и JSONEachRow. ## min_chunk_bytes_for_parallel_parsing {#min-chunk-bytes-for-parallel-parsing} @@ -1987,7 +1985,7 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; ## output_format_pretty_grid_charset {#output-format-pretty-grid-charset} -Позволяет изменить кодировку, которая используется для печати грид-границ. Доступны следующие кодировки: UTF-8, ASCII. +Позволяет изменить кодировку, которая используется для отрисовки таблицы при выводе результатов запросов. Доступны следующие кодировки: UTF-8, ASCII. **Пример** @@ -2473,6 +2471,18 @@ SELECT SUM(-1), MAX(0) FROM system.one WHERE 0; Значение по умолчанию: `16`. +## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability} + +Задает вероятность того, что ClickHouse начнет трассировку для выполненных запросов (если не указан [входящий контекст](https://www.w3.org/TR/trace-context/) трассировки). + +Возможные значения: + +- 0 — трассировка для выполненных запросов отключена (если не указан входящий контекст трассировки). +- Положительное число с плавающей точкой в диапазоне [0..1]. Например, при значении настройки, равной `0,5`, ClickHouse начнет трассировку в среднем для половины запросов. +- 1 — трассировка для всех выполненных запросов включена. + +Значение по умолчанию: `0`. 
+ ## optimize_on_insert {#optimize-on-insert} Включает или выключает преобразование данных перед добавлением в таблицу, как будто над добавляемым блоком предварительно было произведено слияние (в соответствии с движком таблицы). diff --git a/docs/ru/operations/system-tables/opentelemetry_span_log.md b/docs/ru/operations/system-tables/opentelemetry_span_log.md new file mode 100644 index 00000000000..96555064b0e --- /dev/null +++ b/docs/ru/operations/system-tables/opentelemetry_span_log.md @@ -0,0 +1,49 @@ +# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log} + +Содержит информацию о [trace spans](https://opentracing.io/docs/overview/spans/) для выполненных запросов. + +Столбцы: + +- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md) — идентификатор трассировки для выполненного запроса. + +- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — идентификатор `trace span`. + +- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — идентификатор родительского `trace span`. + +- `operation_name` ([String](../../sql-reference/data-types/string.md)) — имя операции. + +- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — время начала `trace span` (в микросекундах). + +- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — время окончания `trace span` (в микросекундах). + +- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — дата окончания `trace span`. + +- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — имена [атрибутов](https://opentelemetry.io/docs/go/instrumentation/#attributes) в зависимости от `trace span`. Заполняются согласно рекомендациям в стандарте [OpenTelemetry](https://opentelemetry.io/). + +- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — значения атрибутов в зависимости от `trace span`. Заполняются согласно рекомендациям в стандарте `OpenTelemetry`. + +**Пример** + +Запрос: + +``` sql +SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical; +``` + +Результат: + +``` text +Row 1: +────── +trace_id: cdab0847-0d62-61d5-4d38-dd65b19a1914 +span_id: 701487461015578150 +parent_span_id: 2991972114672045096 +operation_name: DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl() +start_time_us: 1612374594529090 +finish_time_us: 1612374594529108 +finish_date: 2021-02-03 +attribute.names: [] +attribute.values: [] +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/system_tables/opentelemetry_span_log) diff --git a/docs/ru/sql-reference/data-types/map.md b/docs/ru/sql-reference/data-types/map.md new file mode 100644 index 00000000000..6cb8ccf1143 --- /dev/null +++ b/docs/ru/sql-reference/data-types/map.md @@ -0,0 +1,69 @@ +--- +toc_priority: 65 +toc_title: Map(key, value) +--- + +# Map(key, value) {#data_type-map} + +Тип данных `Map(key, value)` хранит пары `ключ:значение`. + +**Параметры** +- `key` — ключ. [String](../../sql-reference/data-types/string.md) или [Integer](../../sql-reference/data-types/int-uint.md). +- `value` — значение. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) или [Array](../../sql-reference/data-types/array.md). + +!!! warning "Предупреждение" + Сейчас использование типа данных `Map` является экспериментальной возможностью. 
Чтобы использовать этот тип данных, включите настройку `allow_experimental_map_type = 1`. + +Чтобы получить значение из колонки `a Map('key', 'value')`, используйте синтаксис `a['key']`. В настоящее время такая подстановка работает по алгоритму с линейной сложностью. + +**Примеры** + +Рассмотрим таблицу: + +``` sql +CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory; +INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30}); +``` + +Выборка всех значений ключа `key2`: + +```sql +SELECT a['key2'] FROM table_map; +``` +Результат: + +```text +┌─arrayElement(a, 'key2')─┐ +│ 10 │ +│ 20 │ +│ 30 │ +└─────────────────────────┘ +``` + +Если для какого-то ключа `key` в колонке с типом `Map()` нет значения, запрос возвращает нули для числовых колонок, пустые строки или пустые массивы. + +```sql +INSERT INTO table_map VALUES ({'key3':100}), ({}); +SELECT a['key3'] FROM table_map; +``` + +Результат: + +```text +┌─arrayElement(a, 'key3')─┐ +│ 100 │ +│ 0 │ +└─────────────────────────┘ +┌─arrayElement(a, 'key3')─┐ +│ 0 │ +│ 0 │ +│ 0 │ +└─────────────────────────┘ +``` + +**См. также** + +- функция [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) +- функция [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) + +[Original article](https://clickhouse.tech/docs/ru/data-types/map/) diff --git a/docs/ru/sql-reference/functions/ip-address-functions.md b/docs/ru/sql-reference/functions/ip-address-functions.md index 724fb97c0d5..52f0a92bc9f 100644 --- a/docs/ru/sql-reference/functions/ip-address-functions.md +++ b/docs/ru/sql-reference/functions/ip-address-functions.md @@ -243,4 +243,81 @@ SELECT └───────────────────────────────────┴──────────────────────────────────┘ ``` +## isIPv4String {#isipv4string} + +Определяет, является ли строка адресом IPv4 или нет. Также вернет `0`, если `string` — адрес IPv6. + +**Синтаксис** + +```sql +isIPv4String(string) +``` + +**Параметры** + +- `string` — IP адрес. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- `1` если `string` является адресом IPv4 , иначе — `0`. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Примеры** + +Запрос: + +```sql +SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr +``` + +Результат: + +``` text +┌─addr─────────────┬─isIPv4String(addr)─┐ +│ 0.0.0.0 │ 1 │ +│ 127.0.0.1 │ 1 │ +│ ::ffff:127.0.0.1 │ 0 │ +└──────────────────┴────────────────────┘ +``` + +## isIPv6String {#isipv6string} + +Определяет, является ли строка адресом IPv6 или нет. Также вернет `0`, если `string` — адрес IPv4. + +**Синтаксис** + +```sql +isIPv6String(string) +``` + +**Параметры** + +- `string` — IP адрес. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- `1` если `string` является адресом IPv6 , иначе — `0`. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr +``` + +Результат: + +``` text +┌─addr─────────────┬─isIPv6String(addr)─┐ +│ :: │ 1 │ +│ 1111::ffff │ 1 │ +│ ::ffff:127.0.0.1 │ 1 │ +│ 127.0.0.1 │ 0 │ +└──────────────────┴────────────────────┘ +``` + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/ip_address_functions/) diff --git a/docs/ru/sql-reference/functions/tuple-map-functions.md b/docs/ru/sql-reference/functions/tuple-map-functions.md index a2b25e68fe5..a36613280a1 100644 --- a/docs/ru/sql-reference/functions/tuple-map-functions.md +++ b/docs/ru/sql-reference/functions/tuple-map-functions.md @@ -5,6 +5,66 @@ toc_title: Работа с контейнерами map # Функции для работы с контейнерами map {#functions-for-working-with-tuple-maps} +## map {#function-map} + +Преобразовывает пары `ключ:значение` в тип данных [Map(key, value)](../../sql-reference/data-types/map.md). + +**Синтаксис** + +``` sql +map(key1, value1[, key2, value2, ...]) +``` + +**Параметры** + +- `key` — ключ. [String](../../sql-reference/data-types/string.md) или [Integer](../../sql-reference/data-types/int-uint.md). +- `value` — значение. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) или [Array](../../sql-reference/data-types/array.md). + +**Возвращаемое значение** + +- Структура данных в виде пар `ключ:значение`. + +Тип: [Map(key, value)](../../sql-reference/data-types/map.md). + +**Примеры** + +Запрос: + +``` sql +SELECT map('key1', number, 'key2', number * 2) FROM numbers(3); +``` + +Результат: + +``` text +┌─map('key1', number, 'key2', multiply(number, 2))─┐ +│ {'key1':0,'key2':0} │ +│ {'key1':1,'key2':2} │ +│ {'key1':2,'key2':4} │ +└──────────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a; +INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3); +SELECT a['key2'] FROM table_map; +``` + +Результат: + +``` text +┌─arrayElement(a, 'key2')─┐ +│ 0 │ +│ 2 │ +│ 4 │ +└─────────────────────────┘ +``` + +**См. также** + +- тип данных [Map(key, value)](../../sql-reference/data-types/map.md) ## mapAdd {#function-mapadd} Собирает все ключи и суммирует соответствующие значения. diff --git a/docs/ru/sql-reference/functions/url-functions.md b/docs/ru/sql-reference/functions/url-functions.md index 1008e2a359c..7541e16bed4 100644 --- a/docs/ru/sql-reference/functions/url-functions.md +++ b/docs/ru/sql-reference/functions/url-functions.md @@ -115,6 +115,168 @@ SELECT topLevelDomain('svn+ssh://www.some.svn-hosting.com:80/repo/trunk') Например, `cutToFirstSignificantSubdomain('https://news.yandex.com.tr/') = 'yandex.com.tr'`. +### cutToFirstSignificantSubdomainCustom {#cuttofirstsignificantsubdomaincustom} + +Возвращает часть домена, включающую поддомены верхнего уровня до первого существенного поддомена. Принимает имя пользовательского [списка доменов верхнего уровня](https://ru.wikipedia.org/wiki/Список_доменов_верхнего_уровня). + +Полезно, если требуется актуальный список доменов верхнего уровня или если есть пользовательский. + +Пример конфигурации: + +```xml + + + + public_suffix_list.dat + + +``` + +**Синтаксис** + +``` sql +cutToFirstSignificantSubdomain(URL, TLD) +``` + +**Parameters** + +- `URL` — URL. [String](../../sql-reference/data-types/string.md). 
+- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Часть домена, включающая поддомены верхнего уровня до первого существенного поддомена.
+
+Тип: [String](../../sql-reference/data-types/string.md).
+
+**Пример**
+
+Запрос:
+
+```sql
+SELECT cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
+```
+
+Результат:
+
+```text
+┌─cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
+│ foo.there-is-no-such-domain                                                                   │
+└───────────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Смотрите также**
+
+- [firstSignificantSubdomain](#firstsignificantsubdomain).
+
+### cutToFirstSignificantSubdomainCustomWithWWW {#cuttofirstsignificantsubdomaincustomwithwww}
+
+Возвращает часть домена, включающую поддомены верхнего уровня до первого существенного поддомена, не опуская `www`. Принимает имя пользовательского списка доменов верхнего уровня.
+
+Функция полезна, если требуется актуальный список доменов верхнего уровня или собственный пользовательский список.
+
+Пример конфигурации:
+
+```xml
+<top_level_domains_lists>
+    <!-- https://publicsuffix.org/list/public_suffix_list.dat -->
+    <public_suffix_list>public_suffix_list.dat</public_suffix_list>
+</top_level_domains_lists>
+```
+
+**Синтаксис**
+
+```sql
+cutToFirstSignificantSubdomainCustomWithWWW(URL, TLD)
+```
+
+**Параметры**
+
+- `URL` — URL. [String](../../sql-reference/data-types/string.md).
+- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Часть домена, включающая поддомены верхнего уровня до первого существенного поддомена, без удаления `www`.
+
+Тип: [String](../../sql-reference/data-types/string.md).
+
+**Пример**
+
+Запрос:
+
+```sql
+SELECT cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list');
+```
+
+Результат:
+
+```text
+┌─cutToFirstSignificantSubdomainCustomWithWWW('www.foo', 'public_suffix_list')─┐
+│ www.foo                                                                      │
+└──────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Смотрите также**
+
+- [firstSignificantSubdomain](#firstsignificantsubdomain).
+
+### firstSignificantSubdomainCustom {#firstsignificantsubdomaincustom}
+
+Возвращает первый существенный поддомен. Принимает имя пользовательского списка доменов верхнего уровня.
+
+Функция полезна, если требуется актуальный список доменов верхнего уровня или собственный пользовательский список.
+
+Пример конфигурации:
+
+```xml
+<top_level_domains_lists>
+    <!-- https://publicsuffix.org/list/public_suffix_list.dat -->
+    <public_suffix_list>public_suffix_list.dat</public_suffix_list>
+</top_level_domains_lists>
+```
+
+**Синтаксис**
+
+```sql
+firstSignificantSubdomainCustom(URL, TLD)
+```
+
+**Параметры**
+
+- `URL` — URL. [String](../../sql-reference/data-types/string.md).
+- `TLD` — имя пользовательского списка доменов верхнего уровня. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Первый существенный поддомен.
+
+Тип: [String](../../sql-reference/data-types/string.md).
+
+**Пример**
+
+Запрос:
+
+```sql
+SELECT firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list');
+```
+
+Результат:
+
+```text
+┌─firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list')─┐
+│ foo                                                                                      │
+└──────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**Смотрите также**
+
+- [firstSignificantSubdomain](#firstsignificantsubdomain).
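+Для наглядности ниже приведен гипотетический запрос (предполагается, что список `public_suffix_list` настроен, как в примерах конфигурации выше), показывающий разницу между `cutToFirstSignificantSubdomainCustom` и `firstSignificantSubdomainCustom` на одном и том же URL; результаты взяты из примеров выше:
+
+```sql
+SELECT
+    cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list') AS cut_to,
+    firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list') AS first_sub;
+```
+
+```text
+┌─cut_to──────────────────────┬─first_sub─┐
+│ foo.there-is-no-such-domain │ foo       │
+└─────────────────────────────┴───────────┘
+```
+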
+ ### port(URL[, default_port = 0]) {#port} Возвращает порт или значение `default_port`, если в URL-адресе нет порта (или передан невалидный URL) diff --git a/docs/zh/operations/backup.md b/docs/zh/operations/backup.md index 72491bb53ff..1b1993e3ae6 100644 --- a/docs/zh/operations/backup.md +++ b/docs/zh/operations/backup.md @@ -7,7 +7,7 @@ toc_title: "\u6570\u636E\u5907\u4EFD" # 数据备份 {#data-backup} -尽管[副本](../engines/table-engines/mergetree-family/replication.md) 可以预防硬件错误带来的数据丢失, 但是它不能防止人为操作的错误: 意外删除数据, 删除错误的 table 或者删除错误 cluster 上的 table, 可以导致错误数据处理错误或者数据损坏的 bugs. 这类意外可能会影响所有的副本. ClickHouse 有内建的保障措施可以预防一些错误 — 例如, 默认情况下[您不能使用类似MergeTree的引擎删除包含超过50Gb数据的表](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). 但是,这些保障措施不能涵盖所有可能的情况,并且可以规避。 +尽管[副本](../engines/table-engines/mergetree-family/replication.md) 可以预防硬件错误带来的数据丢失, 但是它不能防止人为操作的错误: 意外删除数据, 删除错误的 table 或者删除错误 cluster 上的 table, 可以导致错误数据处理错误或者数据损坏的 bugs. 这类意外可能会影响所有的副本. ClickHouse 有内建的保障措施可以预防一些错误 — 例如, 默认情况下[您不能使用类似MergeTree的引擎删除包含超过50Gb数据的表](server-configuration-parameters/settings.md#max-table-size-to-drop). 但是,这些保障措施不能涵盖所有可能的情况,并且可以规避。 为了有效地减少可能的人为错误,您应该 **提前**准备备份和还原数据的策略. diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 2bb5181d348..a96cb2b8973 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -59,7 +59,6 @@ #include #include #include -#include #include "MetricsTransmitter.h" #include #include @@ -94,6 +93,9 @@ # include #endif +#if USE_NURAFT +# include +#endif namespace CurrentMetrics { @@ -842,23 +844,33 @@ int Server::main(const std::vector & /*args*/) listen_try = true; } - for (const auto & listen_host : listen_hosts) + if (config().has("test_keeper_server")) { - /// TCP TestKeeper - const char * port_name = "test_keeper_server.tcp_port"; - createServer(listen_host, port_name, listen_try, [&](UInt16 port) +#if USE_NURAFT + /// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config. + global_context->initializeNuKeeperStorageDispatcher(); + for (const auto & listen_host : listen_hosts) { - Poco::Net::ServerSocket socket; - auto address = socketBindListen(socket, listen_host, port); - socket.setReceiveTimeout(settings.receive_timeout); - socket.setSendTimeout(settings.send_timeout); - servers_to_start_before_tables->emplace_back( - port_name, - std::make_unique( - new TestKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + /// TCP NuKeeper + const char * port_name = "test_keeper_server.tcp_port"; + createServer(listen_host, port_name, listen_try, [&](UInt16 port) + { + Poco::Net::ServerSocket socket; + auto address = socketBindListen(socket, listen_host, port); + socket.setReceiveTimeout(settings.receive_timeout); + socket.setSendTimeout(settings.send_timeout); + servers_to_start_before_tables->emplace_back( + port_name, + std::make_unique( + new NuKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams)); + + LOG_INFO(log, "Listening for connections to NuKeeper (tcp): {}", address.toString()); + }); + } +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination."); +#endif - LOG_INFO(log, "Listening for connections to fake zookeeper (tcp): {}", address.toString()); - }); } for (auto & server : *servers_to_start_before_tables) @@ -898,6 +910,8 @@ int Server::main(const std::vector & /*args*/) LOG_INFO(log, "Closed connections to servers for tables. But {} remain. 
Probably some tables of other users cannot finish their connections after context shutdown.", current_connections); else LOG_INFO(log, "Closed connections to servers for tables."); + + global_context->shutdownNuKeeperStorageDispatcher(); } /** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available. diff --git a/programs/server/config.xml b/programs/server/config.xml index 849d3dc32ba..ca57987d901 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -421,9 +421,15 @@ - + + + + default diff --git a/src/Access/DiskAccessStorage.cpp b/src/Access/DiskAccessStorage.cpp index 426c27ea799..80594f66dfc 100644 --- a/src/Access/DiskAccessStorage.cpp +++ b/src/Access/DiskAccessStorage.cpp @@ -217,6 +217,7 @@ namespace /// Write the file. WriteBufferFromFile out{tmp_file_path.string()}; out.write(file_contents.data(), file_contents.size()); + out.close(); /// Rename. std::filesystem::rename(tmp_file_path, file_path); @@ -274,6 +275,7 @@ namespace writeStringBinary(name, out); writeUUIDText(id, out); } + out.close(); } diff --git a/src/Access/EnabledQuota.cpp b/src/Access/EnabledQuota.cpp index e9d586a692f..4a77426004d 100644 --- a/src/Access/EnabledQuota.cpp +++ b/src/Access/EnabledQuota.cpp @@ -26,10 +26,6 @@ struct EnabledQuota::Impl std::chrono::seconds duration, std::chrono::system_clock::time_point end_of_interval) { - std::function amount_to_string = [](UInt64 amount) { return std::to_string(amount); }; - if (resource_type == Quota::EXECUTION_TIME) - amount_to_string = [&](UInt64 amount) { return ext::to_string(std::chrono::nanoseconds(amount)); }; - const auto & type_info = Quota::ResourceTypeInfo::get(resource_type); throw Exception( "Quota for user " + backQuote(user_name) + " for " + ext::to_string(duration) + " has been exceeded: " @@ -39,35 +35,47 @@ struct EnabledQuota::Impl } + /// Returns the end of the current interval. If the passed `current_time` is greater than that end, + /// the function automatically recalculates the interval's end by adding the interval's duration + /// one or more times until the interval's end is greater than `current_time`. + /// If that recalculation occurs the function also resets amounts of resources used and sets the variable + /// `counters_were_reset`. static std::chrono::system_clock::time_point getEndOfInterval( - const Interval & interval, std::chrono::system_clock::time_point current_time, bool * counters_were_reset = nullptr) + const Interval & interval, std::chrono::system_clock::time_point current_time, bool & counters_were_reset) { auto & end_of_interval = interval.end_of_interval; auto end_loaded = end_of_interval.load(); auto end = std::chrono::system_clock::time_point{end_loaded}; if (current_time < end) { - if (counters_were_reset) - *counters_were_reset = false; + counters_were_reset = false; return end; } - const auto duration = interval.duration; + /// We reset counters only if the interval's end has been calculated before. + /// If it hasn't we just calculate the interval's end for the first time and don't reset counters yet. + bool need_reset_counters = (end_loaded.count() != 0); do { - end = end + (current_time - end + duration) / duration * duration; + /// Calculate the end of the next interval: + /// | X | + /// end current_time next_end = end + duration * n + /// where n is an integer number, n >= 1. 
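+        /// Illustrative example (assumed values): if end = 10:00, duration = 1 hour
+        /// and current_time = 13:30, then n = (3.5h + 1h) / 1h = 4 (integer division),
+        /// so the new end is 10:00 + 4h = 14:00.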
+ const auto duration = interval.duration; + UInt64 n = static_cast((current_time - end + duration) / duration); + end = end + duration * n; if (end_of_interval.compare_exchange_strong(end_loaded, end.time_since_epoch())) - { - boost::range::fill(interval.used, 0); break; - } end = std::chrono::system_clock::time_point{end_loaded}; } while (current_time >= end); - if (counters_were_reset) - *counters_were_reset = true; + if (need_reset_counters) + { + boost::range::fill(interval.used, 0); + counters_were_reset = true; + } return end; } @@ -89,7 +97,7 @@ struct EnabledQuota::Impl if (used > max) { bool counters_were_reset = false; - auto end_of_interval = getEndOfInterval(interval, current_time, &counters_were_reset); + auto end_of_interval = getEndOfInterval(interval, current_time, counters_were_reset); if (counters_were_reset) { used = (interval.used[resource_type] += amount); @@ -116,9 +124,9 @@ struct EnabledQuota::Impl continue; if (used > max) { - bool used_counters_reset = false; - std::chrono::system_clock::time_point end_of_interval = getEndOfInterval(interval, current_time, &used_counters_reset); - if (!used_counters_reset) + bool counters_were_reset = false; + std::chrono::system_clock::time_point end_of_interval = getEndOfInterval(interval, current_time, counters_were_reset); + if (!counters_were_reset) throwQuotaExceed(user_name, intervals.quota_name, resource_type, used, max, interval.duration, end_of_interval); } } @@ -177,7 +185,8 @@ std::optional EnabledQuota::Intervals::getUsage(std::chrono::system_ auto & out = usage.intervals.back(); out.duration = in.duration; out.randomize_interval = in.randomize_interval; - out.end_of_interval = Impl::getEndOfInterval(in, current_time); + bool counters_were_reset = false; + out.end_of_interval = Impl::getEndOfInterval(in, current_time, counters_were_reset); for (auto resource_type : ext::range(MAX_RESOURCE_TYPE)) { if (in.max[resource_type]) diff --git a/src/AggregateFunctions/AggregateFunctionAny.cpp b/src/AggregateFunctions/AggregateFunctionAny.cpp index 0aeb2548af9..8b18abae884 100644 --- a/src/AggregateFunctions/AggregateFunctionAny.cpp +++ b/src/AggregateFunctions/AggregateFunctionAny.cpp @@ -34,6 +34,14 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory & factory) factory.registerFunction("any", { createAggregateFunctionAny, properties }); factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties }); factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties }); + + // Synonyms for use as window functions. 
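+    // E.g. (illustrative) `SELECT first_value(x) OVER (ORDER BY t)` resolves to the
+    // same implementation as any(x), and last_value(x) to anyLast(x).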
+ factory.registerFunction("first_value", + { createAggregateFunctionAny, properties }, + AggregateFunctionFactory::CaseInsensitive); + factory.registerFunction("last_value", + { createAggregateFunctionAnyLast, properties }, + AggregateFunctionFactory::CaseInsensitive); } } diff --git a/src/AggregateFunctions/AggregateFunctionDeltaSum.h b/src/AggregateFunctions/AggregateFunctionDeltaSum.h index 11824c9d51f..d5760de84ae 100644 --- a/src/AggregateFunctions/AggregateFunctionDeltaSum.h +++ b/src/AggregateFunctions/AggregateFunctionDeltaSum.h @@ -43,7 +43,7 @@ public: DataTypePtr getReturnType() const override { return std::make_shared>(); } - void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override + void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override { auto value = assert_cast &>(*columns[0]).getData()[row_num]; @@ -62,7 +62,7 @@ public: } } - void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override + void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override { auto place_data = &this->data(place); auto rhs_data = &this->data(rhs); @@ -102,7 +102,7 @@ public: // Otherwise lhs either has data or is uninitialized, so we don't need to modify its values. } - void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override + void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override { writeIntBinary(this->data(place).sum, buf); writeIntBinary(this->data(place).first, buf); @@ -111,7 +111,7 @@ public: writePODBinary(this->data(place).seen_last, buf); } - void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override + void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena *) const override { readIntBinary(this->data(place).sum, buf); readIntBinary(this->data(place).first, buf); @@ -120,7 +120,7 @@ public: readPODBinary(this->data(place).seen_last, buf); } - void insertResultInto(AggregateDataPtr place, IColumn & to, Arena *) const override + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override { assert_cast &>(to).getData().push_back(this->data(place).sum); } diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h index eecf97e1e8c..2a713f3aed2 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h @@ -40,7 +40,7 @@ struct MovingData Array value; /// Prefix sums. 
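+    /// E.g. after add(1), add(2), add(3), `value` holds the prefix sums {1, 3, 6},
+    /// so the sum of any contiguous window is a difference of two prefix sums.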
T sum = 0; - void add(T val, Arena * arena) + void NO_SANITIZE_UNDEFINED add(T val, Arena * arena) { sum += val; value.push_back(sum, arena); @@ -120,7 +120,7 @@ public: this->data(place).add(static_cast(value), arena); } - void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override + void NO_SANITIZE_UNDEFINED merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override { auto & cur_elems = this->data(place); auto & rhs_elems = this->data(rhs); diff --git a/src/AggregateFunctions/IAggregateFunction.h b/src/AggregateFunctions/IAggregateFunction.h index f1bbfa40aac..d15ff4e8a78 100644 --- a/src/AggregateFunctions/IAggregateFunction.h +++ b/src/AggregateFunctions/IAggregateFunction.h @@ -26,6 +26,7 @@ class ReadBuffer; class WriteBuffer; class IColumn; class IDataType; +class IWindowFunction; using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; @@ -215,6 +216,20 @@ public: const DataTypes & getArgumentTypes() const { return argument_types; } const Array & getParameters() const { return parameters; } + // Any aggregate function can be calculated over a window, but there are some + // window functions such as rank() that require a different interface, e.g. + // because they don't respect the window frame, or need to be notified when + // a new peer group starts. They pretend to be normal aggregate functions, + // but will fail if you actually try to use them in Aggregator. The + // WindowTransform recognizes these functions and handles them differently. + // We could have a separate factory for window functions, and make all + // aggregate functions implement IWindowFunction interface and so on. This + // would be more logically correct, but more complex. We only have a handful + // of true window functions, so this hack-ish interface suffices. 
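+    // For example, a true window function such as rank() would override
+    // asWindowFunction() to return itself, while ordinary aggregate functions
+    // keep the default nullptr below.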
+ virtual IWindowFunction * asWindowFunction() { return nullptr; } + virtual const IWindowFunction * asWindowFunction() const + { return const_cast(this)->asWindowFunction(); } + protected: DataTypes argument_types; Array parameters; diff --git a/src/AggregateFunctions/registerAggregateFunctions.cpp b/src/AggregateFunctions/registerAggregateFunctions.cpp index 1900d5d46c6..ae26fdc5d40 100644 --- a/src/AggregateFunctions/registerAggregateFunctions.cpp +++ b/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -58,6 +58,8 @@ void registerAggregateFunctionCombinatorOrFill(AggregateFunctionCombinatorFactor void registerAggregateFunctionCombinatorResample(AggregateFunctionCombinatorFactory &); void registerAggregateFunctionCombinatorDistinct(AggregateFunctionCombinatorFactory &); +void registerWindowFunctions(AggregateFunctionFactory & factory); + void registerAggregateFunctions() { @@ -103,6 +105,8 @@ void registerAggregateFunctions() registerAggregateFunctionMannWhitney(factory); registerAggregateFunctionWelchTTest(factory); registerAggregateFunctionStudentTTest(factory); + + registerWindowFunctions(factory); } { diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 86db7742c97..d370016da00 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -60,6 +60,7 @@ add_subdirectory (Processors) add_subdirectory (Formats) add_subdirectory (Compression) add_subdirectory (Server) +add_subdirectory (Coordination) set(dbms_headers) @@ -192,6 +193,10 @@ add_object_library(clickhouse_processors_merges_algorithms Processors/Merges/Alg add_object_library(clickhouse_processors_queryplan Processors/QueryPlan) add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations) +if (USE_NURAFT) + add_object_library(clickhouse_coordination Coordination) +endif() + set (DBMS_COMMON_LIBRARIES) # libgcc_s does not provide an implementation of an atomics library. Instead, # GCC’s libatomic library can be used to supply these when using libgcc_s. @@ -314,7 +319,7 @@ if (USE_KRB5) endif() if (USE_NURAFT) - dbms_target_link_libraries(PRIVATE ${NURAFT_LIBRARY}) + dbms_target_link_libraries(PUBLIC ${NURAFT_LIBRARY}) endif() if(RE2_INCLUDE_DIR) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index e38a6b240a6..164b9565633 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -756,7 +756,11 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) Packet Connection::receivePacket(std::function async_callback) { in->setAsyncCallback(std::move(async_callback)); - SCOPE_EXIT(in->setAsyncCallback({})); + SCOPE_EXIT({ + /// disconnect() will reset "in". 
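+        /// (E.g. error handling inside this function may call disconnect(),
+        /// which destroys the buffer before this scope guard runs.)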
+        if (in)
+            in->setAsyncCallback({});
+    });
 
     try
     {
diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp
index f6261079287..ddc971032b6 100644
--- a/src/Columns/ColumnDecimal.cpp
+++ b/src/Columns/ColumnDecimal.cpp
@@ -30,6 +30,12 @@ namespace ErrorCodes
     extern const int LOGICAL_ERROR;
 }
 
+template class DecimalPaddedPODArray<Decimal32>;
+template class DecimalPaddedPODArray<Decimal64>;
+template class DecimalPaddedPODArray<Decimal128>;
+template class DecimalPaddedPODArray<Decimal256>;
+template class DecimalPaddedPODArray<DateTime64>;
+
 template <typename T>
 int ColumnDecimal<T>::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const
 {
@@ -370,4 +376,5 @@ template class ColumnDecimal<Decimal64>;
 template class ColumnDecimal<Decimal128>;
 template class ColumnDecimal<Decimal256>;
 template class ColumnDecimal<DateTime64>;
+
 }
diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h
index 1578633c13d..ef841292a7d 100644
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@@ -50,6 +50,14 @@ private:
     UInt32 scale;
 };
 
+/// Prevent implicit template instantiation of DecimalPaddedPODArray for common decimal types
+
+extern template class DecimalPaddedPODArray<Decimal32>;
+extern template class DecimalPaddedPODArray<Decimal64>;
+extern template class DecimalPaddedPODArray<Decimal128>;
+extern template class DecimalPaddedPODArray<Decimal256>;
+extern template class DecimalPaddedPODArray<DateTime64>;
+
 /// A ColumnVector for Decimals
 template <typename T>
 class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T>>
@@ -215,4 +223,14 @@ ColumnPtr ColumnDecimal<T>::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
     return res;
 }
 
+
+/// Prevent implicit template instantiation of ColumnDecimal for common decimal types
+
+extern template class ColumnDecimal<Decimal32>;
+extern template class ColumnDecimal<Decimal64>;
+extern template class ColumnDecimal<Decimal128>;
+extern template class ColumnDecimal<Decimal256>;
+extern template class ColumnDecimal<DateTime64>;
+
+
 }
diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp
index a075c10a8a9..ec26500d057 100644
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@@ -535,4 +535,5 @@ template class ColumnVector<Int128>;
 template class ColumnVector<Int256>;
 template class ColumnVector<Float32>;
 template class ColumnVector<Float64>;
+
 }
diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h
index 1b13859bdee..586fced88a6 100644
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@@ -345,4 +345,21 @@ ColumnPtr ColumnVector<T>::indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const
     return res;
 }
 
+/// Prevent implicit template instantiation of ColumnVector for common types
+
+extern template class ColumnVector<UInt8>;
+extern template class ColumnVector<UInt16>;
+extern template class ColumnVector<UInt32>;
+extern template class ColumnVector<UInt64>;
+extern template class ColumnVector<UInt128>;
+extern template class ColumnVector<UInt256>;
+extern template class ColumnVector<Int8>;
+extern template class ColumnVector<Int16>;
+extern template class ColumnVector<Int32>;
+extern template class ColumnVector<Int64>;
+extern template class ColumnVector<Int128>;
+extern template class ColumnVector<Int256>;
+extern template class ColumnVector<Float32>;
+extern template class ColumnVector<Float64>;
+
 }
diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp
index 08c275abfc2..5a66ddb63a2 100644
--- a/src/Common/Allocator.cpp
+++ b/src/Common/Allocator.cpp
@@ -19,3 +19,8 @@
  */
 __attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 16384;
 #endif
+
+template class Allocator<false, false>;
+template class Allocator<true, false>;
+template class Allocator<false, true>;
+template class Allocator<true, true>;
diff --git a/src/Common/Allocator.h b/src/Common/Allocator.h
index a499f4a442b..e3c6ddf9ff4 100644
--- a/src/Common/Allocator.h
+++ b/src/Common/Allocator.h
@@ -352,6 +352,12 @@ template <typename Base, size_t initial_bytes, size_t Alignment>
 constexpr size_t
 allocatorInitialBytes<AllocatorWithStackMemory<Base, initial_bytes, Alignment>> = initial_bytes;
 
+/// Prevent implicit template instantiation of Allocator
+
+extern template class Allocator<false, false>;
+extern template class Allocator<true, false>;
+extern template class Allocator<false, true>;
+extern template class Allocator<true, true>;
 
 #if !__clang__
 #pragma GCC diagnostic pop
diff --git a/src/Common/ColumnsHashing.h b/src/Common/ColumnsHashing.h
index b1d25c98955..1ac753fbae5 100644
--- a/src/Common/ColumnsHashing.h
+++ b/src/Common/ColumnsHashing.h
@@ -455,7 +455,14 @@ template <> struct LowCardinalityKeys<false> {};
 
 /// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
-template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true, bool need_offset = false>
+template <
+    typename Value,
+    typename Key,
+    typename Mapped,
+    bool has_nullable_keys_ = false,
+    bool has_low_cardinality_ = false,
+    bool use_cache = true,
+    bool need_offset = false>
 struct HashMethodKeysFixed
     : private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
     , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
@@ -471,6 +478,12 @@ struct HashMethodKeysFixed
     Sizes key_sizes;
     size_t keys_size;
 
+    /// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+    std::unique_ptr<uint8_t[]> masks;
+    std::unique_ptr<const char *[]> columns_data;
+#endif
+
     HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
         : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
     {
@@ -491,6 +504,58 @@ struct HashMethodKeysFixed
                 low_cardinality_keys.nested_columns[i] = key_columns[i];
         }
     }
+
+#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
+        if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
+        {
+            /** The task is to "pack" multiple fixed-size fields into a single larger Key.
+              * Example: pack UInt8, UInt32, UInt16, UInt64 into a UInt128 key:
+              * [- ---- -- -------- -] - the resulting uint128 key
+              *  ^  ^   ^     ^     ^
+              *  u8 u32 u16   u64   zero
+              *
+              * We can do it with the help of the SSSE3 shuffle instruction.
+              *
+              * There will be a mask for every GROUP BY element (keys_size masks in total).
+              * Every mask has 16 bytes but only sizeof(Key) bytes are used (the others we don't care about).
+              *
+              * Every byte in the mask has the following meaning:
+              * - if it is 0..15, take the element at this index from the source register and place it here in the result;
+              * - if it is 0xFF, set the element in the result to zero.
+              *
+              * Example:
+              * We want to copy a UInt32 to offset 1 in the destination and set other bytes in the destination to zero.
+              * The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
+              *
+              * The max size of the destination is 16 bytes, because we cannot process more with SSSE3.
+              *
+              * The method is disabled under MSan, because it's allowed
+              * to load into an SSE register and process up to 15 bytes of uninitialized memory in column padding.
+              * We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
+              *
+              * 16-byte masks can be placed overlapping, only the first sizeof(Key) bytes are relevant in each mask.
+              * We initialize them to 0xFF and then set the needed elements.
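+              * Illustrative example: packing keys (UInt8, UInt32) into a UInt64 key gives two masks,
+              * of which only the first sizeof(Key) = 8 bytes are relevant:
+              *   mask 0: 00 FF FF FF FF FF FF FF (source byte 0 of column 0 -> destination byte 0)
+              *   mask 1: FF 00 01 02 03 FF FF FF (source bytes 0..3 of column 1 -> destination bytes 1..4)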
+ */ + size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key)); + masks.reset(new uint8_t[total_masks_size]); + memset(masks.get(), 0xFF, total_masks_size); + + size_t offset = 0; + for (size_t i = 0; i < keys_size; ++i) + { + for (size_t j = 0; j < key_sizes[i]; ++j) + { + masks[i * sizeof(Key) + offset] = j; + ++offset; + } + } + + columns_data.reset(new const char*[keys_size]); + + for (size_t i = 0; i < keys_size; ++i) + columns_data[i] = Base::getActualColumns()[i]->getRawData().data; + } +#endif } ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const @@ -506,6 +571,10 @@ struct HashMethodKeysFixed return packFixed(row, keys_size, low_cardinality_keys.nested_columns, key_sizes, &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes); +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) + if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16) + return packFixedShuffle(columns_data.get(), keys_size, key_sizes.data(), row, masks.get()); +#endif return packFixed(row, keys_size, Base::getActualColumns(), key_sizes); } } diff --git a/src/Common/Dwarf.cpp b/src/Common/Dwarf.cpp index 7a697a2c9ef..d0b3244dac2 100644 --- a/src/Common/Dwarf.cpp +++ b/src/Common/Dwarf.cpp @@ -19,8 +19,6 @@ /** This file was edited for ClickHouse. */ -#include - #include #include @@ -43,6 +41,7 @@ #define DW_FORM_ref4 0x13 #define DW_FORM_data8 0x07 #define DW_FORM_ref8 0x14 +#define DW_FORM_ref_sig8 0x20 #define DW_FORM_sdata 0x0d #define DW_FORM_udata 0x0f #define DW_FORM_ref_udata 0x15 @@ -54,9 +53,24 @@ #define DW_FORM_strp 0x0e #define DW_FORM_indirect 0x16 #define DW_TAG_compile_unit 0x11 +#define DW_TAG_subprogram 0x2e +#define DW_TAG_try_block 0x32 +#define DW_TAG_catch_block 0x25 +#define DW_TAG_entry_point 0x03 +#define DW_TAG_common_block 0x1a +#define DW_TAG_lexical_block 0x0b #define DW_AT_stmt_list 0x10 #define DW_AT_comp_dir 0x1b #define DW_AT_name 0x03 +#define DW_AT_high_pc 0x12 +#define DW_AT_low_pc 0x11 +#define DW_AT_entry_pc 0x52 +#define DW_AT_ranges 0x55 +#define DW_AT_abstract_origin 0x31 +#define DW_AT_call_line 0x59 +#define DW_AT_call_file 0x58 +#define DW_AT_linkage_name 0x6e +#define DW_AT_specification 0x47 #define DW_LNE_define_file 0x03 #define DW_LNS_copy 0x01 #define DW_LNS_advance_pc 0x02 @@ -84,7 +98,7 @@ namespace ErrorCodes } -Dwarf::Dwarf(const Elf & elf) : elf_(&elf) +Dwarf::Dwarf(const std::shared_ptr & elf) : elf_(elf) { init(); } @@ -99,6 +113,10 @@ Dwarf::Section::Section(std::string_view d) : is64Bit_(false), data_(d) namespace { +// Maximum number of DIEAbbreviation to cache in a compilation unit. Used to +// speed up inline function lookup. +const uint32_t kMaxAbbreviationEntries = 1000; + // All following read* functions read from a std::string_view, advancing the // std::string_view, and aborting if there's not enough room. @@ -158,7 +176,7 @@ uint64_t readOffset(std::string_view & sp, bool is64Bit) // Read "len" bytes std::string_view readBytes(std::string_view & sp, uint64_t len) { - SAFE_CHECK(len >= sp.size(), "invalid string length"); + SAFE_CHECK(len <= sp.size(), "invalid string length: " + std::to_string(len) + " vs. " + std::to_string(sp.size())); std::string_view ret(sp.data(), len); sp.remove_prefix(len); return ret; @@ -364,15 +382,18 @@ void Dwarf::init() || !getSection(".debug_line", &line_) || !getSection(".debug_str", &strings_)) { - elf_ = nullptr; + elf_.reset(); return; } // Optional: fast address range lookup. If missing .debug_info can // be used - but it's much slower (linear scan). 
getSection(".debug_aranges", &aranges_); + + getSection(".debug_ranges", &ranges_); } +// static bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr) { // abbreviation code @@ -384,14 +405,14 @@ bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr) abbr.tag = readULEB(section); // does this entry have children? - abbr.hasChildren = (read(section) != DW_CHILDREN_no); + abbr.has_children = (read(section) != DW_CHILDREN_no); // attributes const char * attribute_begin = section.data(); for (;;) { SAFE_CHECK(!section.empty(), "invalid attribute section"); - auto attr = readAttribute(section); + auto attr = readAttributeSpec(section); if (attr.name == 0 && attr.form == 0) break; } @@ -400,11 +421,161 @@ bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr) return true; } -Dwarf::DIEAbbreviation::Attribute Dwarf::readAttribute(std::string_view & sp) +// static +void Dwarf::readCompilationUnitAbbrs(std::string_view abbrev, CompilationUnit & cu) +{ + abbrev.remove_prefix(cu.abbrev_offset); + + DIEAbbreviation abbr; + while (readAbbreviation(abbrev, abbr)) + { + // Abbreviation code 0 is reserved for null debugging information entries. + if (abbr.code != 0 && abbr.code <= kMaxAbbreviationEntries) + { + cu.abbr_cache[abbr.code - 1] = abbr; + } + } +} + +size_t Dwarf::forEachChild(const CompilationUnit & cu, const Die & die, std::function f) const +{ + size_t next_die_offset = forEachAttribute(cu, die, [&](const Attribute &) { return true; }); + if (!die.abbr.has_children) + { + return next_die_offset; + } + + auto child_die = getDieAtOffset(cu, next_die_offset); + while (child_die.code != 0) + { + if (!f(child_die)) + { + return child_die.offset; + } + + // NOTE: Don't run `f` over grandchildren, just skip over them. + size_t sibling_offset = forEachChild(cu, child_die, [](const Die &) { return true; }); + child_die = getDieAtOffset(cu, sibling_offset); + } + + // childDie is now a dummy die whose offset is to the code 0 marking the + // end of the children. Need to add one to get the offset of the next die. + return child_die.offset + 1; +} + +/* + * Iterate over all attributes of the given DIE, calling the given callable + * for each. Iteration is stopped early if any of the calls return false. 
+ */ +size_t Dwarf::forEachAttribute(const CompilationUnit & cu, const Die & die, std::function f) const +{ + auto attrs = die.abbr.attributes; + auto values = std::string_view{info_.data() + die.offset + die.attr_offset, cu.offset + cu.size - die.offset - die.attr_offset}; + while (auto spec = readAttributeSpec(attrs)) + { + auto attr = readAttribute(die, spec, values); + if (!f(attr)) + { + return static_cast(-1); + } + } + return values.data() - info_.data(); +} + +Dwarf::Attribute Dwarf::readAttribute(const Die & die, AttributeSpec spec, std::string_view & info) const +{ + switch (spec.form) + { + case DW_FORM_addr: + return {spec, die, read(info)}; + case DW_FORM_block1: + return {spec, die, readBytes(info, read(info))}; + case DW_FORM_block2: + return {spec, die, readBytes(info, read(info))}; + case DW_FORM_block4: + return {spec, die, readBytes(info, read(info))}; + case DW_FORM_block: + [[fallthrough]]; + case DW_FORM_exprloc: + return {spec, die, readBytes(info, readULEB(info))}; + case DW_FORM_data1: + [[fallthrough]]; + case DW_FORM_ref1: + return {spec, die, read(info)}; + case DW_FORM_data2: + [[fallthrough]]; + case DW_FORM_ref2: + return {spec, die, read(info)}; + case DW_FORM_data4: + [[fallthrough]]; + case DW_FORM_ref4: + return {spec, die, read(info)}; + case DW_FORM_data8: + [[fallthrough]]; + case DW_FORM_ref8: + [[fallthrough]]; + case DW_FORM_ref_sig8: + return {spec, die, read(info)}; + case DW_FORM_sdata: + return {spec, die, uint64_t(readSLEB(info))}; + case DW_FORM_udata: + [[fallthrough]]; + case DW_FORM_ref_udata: + return {spec, die, readULEB(info)}; + case DW_FORM_flag: + return {spec, die, read(info)}; + case DW_FORM_flag_present: + return {spec, die, 1u}; + case DW_FORM_sec_offset: + [[fallthrough]]; + case DW_FORM_ref_addr: + return {spec, die, readOffset(info, die.is64Bit)}; + case DW_FORM_string: + return {spec, die, readNullTerminated(info)}; + case DW_FORM_strp: + return {spec, die, getStringFromStringSection(readOffset(info, die.is64Bit))}; + case DW_FORM_indirect: // form is explicitly specified + // Update spec with the actual FORM. + spec.form = readULEB(info); + return readAttribute(die, spec, info); + default: + SAFE_CHECK(false, "invalid attribute form"); + } + + return {spec, die, 0u}; +} + +// static +Dwarf::AttributeSpec Dwarf::readAttributeSpec(std::string_view & sp) { return {readULEB(sp), readULEB(sp)}; } +// static +Dwarf::CompilationUnit Dwarf::getCompilationUnit(std::string_view info, uint64_t offset) +{ + SAFE_CHECK(offset < info.size(), "unexpected offset"); + CompilationUnit cu; + std::string_view chunk(info); + cu.offset = offset; + chunk.remove_prefix(offset); + + auto initial_length = read(chunk); + cu.is64Bit = (initial_length == uint32_t(-1)); + cu.size = cu.is64Bit ? read(chunk) : initial_length; + SAFE_CHECK(cu.size <= chunk.size(), "invalid chunk size"); + cu.size += cu.is64Bit ? 
12 : 4; + + cu.version = read(chunk); + SAFE_CHECK(cu.version >= 2 && cu.version <= 4, "invalid info version"); + cu.abbrev_offset = readOffset(chunk, cu.is64Bit); + cu.addr_size = read(chunk); + SAFE_CHECK(cu.addr_size == sizeof(uintptr_t), "invalid address size"); + + cu.first_die = chunk.data() - info.data(); + return cu; +} + Dwarf::DIEAbbreviation Dwarf::getAbbreviation(uint64_t code, uint64_t offset) const { // Linear search in the .debug_abbrev section, starting at offset @@ -516,104 +687,411 @@ bool Dwarf::findDebugInfoOffset(uintptr_t address, std::string_view aranges, uin return false; } +Dwarf::Die Dwarf::getDieAtOffset(const CompilationUnit & cu, uint64_t offset) const +{ + SAFE_CHECK(offset < info_.size(), "unexpected offset"); + Die die; + std::string_view sp{info_.data() + offset, cu.offset + cu.size - offset}; + die.offset = offset; + die.is64Bit = cu.is64Bit; + auto code = readULEB(sp); + die.code = code; + if (code == 0) + { + return die; + } + die.attr_offset = sp.data() - info_.data() - offset; + die.abbr = !cu.abbr_cache.empty() && die.code < kMaxAbbreviationEntries ? cu.abbr_cache[die.code - 1] + : getAbbreviation(die.code, cu.abbrev_offset); + + return die; +} + +Dwarf::Die Dwarf::findDefinitionDie(const CompilationUnit & cu, const Die & die) const +{ + // Find the real definition instead of declaration. + // DW_AT_specification: Incomplete, non-defining, or separate declaration + // corresponding to a declaration + auto offset = getAttribute(cu, die, DW_AT_specification); + if (!offset) + { + return die; + } + return getDieAtOffset(cu, cu.offset + offset.value()); +} + /** * Find the @locationInfo for @address in the compilation unit represented * by the @sp .debug_info entry. * Returns whether the address was found. * Advances @sp to the next entry in .debug_info. */ -bool Dwarf::findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & locationInfo) const +bool Dwarf::findLocation( + uintptr_t address, + const LocationInfoMode mode, + CompilationUnit & cu, + LocationInfo & info, + std::vector & inline_frames) const { - // For each compilation unit compiled with a DWARF producer, a - // contribution is made to the .debug_info section of the object - // file. Each such contribution consists of a compilation unit - // header (see Section 7.5.1.1) followed by a single - // DW_TAG_compile_unit or DW_TAG_partial_unit debugging information - // entry, together with its children. - - // 7.5.1.1 Compilation Unit Header - // 1. unit_length (4B or 12B): read by Section::next - // 2. version (2B) - // 3. debug_abbrev_offset (4B or 8B): offset into the .debug_abbrev section - // 4. address_size (1B) - - Section debug_info_section(infoEntry); - std::string_view chunk; - SAFE_CHECK(debug_info_section.next(chunk), "invalid debug info"); - - auto version = read(chunk); - SAFE_CHECK(version >= 2 && version <= 4, "invalid info version"); - uint64_t abbrev_offset = readOffset(chunk, debug_info_section.is64Bit()); - auto address_size = read(chunk); - SAFE_CHECK(address_size == sizeof(uintptr_t), "invalid address size"); - - // We survived so far. The first (and only) DIE should be DW_TAG_compile_unit - // NOTE: - binutils <= 2.25 does not issue DW_TAG_partial_unit. - // - dwarf compression tools like `dwz` may generate it. - // TODO(tudorb): Handle DW_TAG_partial_unit? 
- auto code = readULEB(chunk); - SAFE_CHECK(code != 0, "invalid code"); - auto abbr = getAbbreviation(code, abbrev_offset); - SAFE_CHECK(abbr.tag == DW_TAG_compile_unit, "expecting compile unit entry"); - // Skip children entries, remove_prefix to the next compilation unit entry. - infoEntry.remove_prefix(chunk.end() - infoEntry.begin()); + Die die = getDieAtOffset(cu, cu.first_die); + // Partial compilation unit (DW_TAG_partial_unit) is not supported. + SAFE_CHECK(die.abbr.tag == DW_TAG_compile_unit, "expecting compile unit entry"); // Read attributes, extracting the few we care about - bool found_line_offset = false; - uint64_t line_offset = 0; + std::optional line_offset = 0; std::string_view compilation_directory; - std::string_view main_file_name; + std::optional main_file_name; + std::optional base_addr_cu; - DIEAbbreviation::Attribute attr; - std::string_view attributes = abbr.attributes; - for (;;) + forEachAttribute(cu, die, [&](const Attribute & attr) { - attr = readAttribute(attributes); - if (attr.name == 0 && attr.form == 0) - { - break; - } - auto val = readAttributeValue(chunk, attr.form, debug_info_section.is64Bit()); - switch (attr.name) + switch (attr.spec.name) { case DW_AT_stmt_list: // Offset in .debug_line for the line number VM program for this // compilation unit - line_offset = std::get(val); - found_line_offset = true; + line_offset = std::get(attr.attr_value); break; case DW_AT_comp_dir: // Compilation directory - compilation_directory = std::get(val); + compilation_directory = std::get(attr.attr_value); break; case DW_AT_name: // File name of main file being compiled - main_file_name = std::get(val); + main_file_name = std::get(attr.attr_value); + break; + case DW_AT_low_pc: + case DW_AT_entry_pc: + // 2.17.1: historically DW_AT_low_pc was used. DW_AT_entry_pc was + // introduced in DWARF3. Support either to determine the base address of + // the CU. + base_addr_cu = std::get(attr.attr_value); break; } - } + // Iterate through all attributes until find all above. + return true; + }); - if (!main_file_name.empty()) + if (main_file_name) { - locationInfo.hasMainFile = true; - locationInfo.mainFile = Path(compilation_directory, "", main_file_name); + info.has_main_file = true; + info.main_file = Path(compilation_directory, "", *main_file_name); } - if (!found_line_offset) + if (!line_offset) { return false; } std::string_view line_section(line_); - line_section.remove_prefix(line_offset); + line_section.remove_prefix(*line_offset); LineNumberVM line_vm(line_section, compilation_directory); // Execute line number VM program to find file and line - locationInfo.hasFileAndLine = line_vm.findAddress(address, locationInfo.file, locationInfo.line); - return locationInfo.hasFileAndLine; + info.has_file_and_line = line_vm.findAddress(address, info.file, info.line); + + bool check_inline = (mode == LocationInfoMode::FULL_WITH_INLINE); + + if (info.has_file_and_line && check_inline) + { + // Re-get the compilation unit with abbreviation cached. + cu.abbr_cache.clear(); + cu.abbr_cache.resize(kMaxAbbreviationEntries); + readCompilationUnitAbbrs(abbrev_, cu); + + // Find the subprogram that matches the given address. + Die subprogram; + findSubProgramDieForAddress(cu, die, address, base_addr_cu, subprogram); + + // Subprogram is the DIE of caller function. 
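+    // E.g. if g() was inlined into f() and f() into main(), and `address` falls into
+    // the inlined code, `subprogram` here is the enclosing non-inlined function (main),
+    // and the inlined call chain is reconstructed below from DW_TAG_inlined_subroutine DIEs.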
+ if (check_inline && subprogram.abbr.has_children) + { + // Use an extra location and get its call file and call line, so that + // they can be used for the second last location when we don't have + // enough inline frames for all inline functions call stack. + const size_t max_size = Dwarf::kMaxInlineLocationInfoPerFrame + 1; + std::vector call_locations; + call_locations.reserve(Dwarf::kMaxInlineLocationInfoPerFrame + 1); + + findInlinedSubroutineDieForAddress(cu, subprogram, line_vm, address, base_addr_cu, call_locations, max_size); + size_t num_found = call_locations.size(); + + if (num_found > 0) + { + const auto inner_most_file = info.file; + const auto inner_most_line = info.line; + + // Earlier we filled in locationInfo: + // - mainFile: the path to the CU -- the file where the non-inlined + // call is made from. + // - file + line: the location of the inner-most inlined call. + // Here we already find inlined info so mainFile would be redundant. + info.has_main_file = false; + info.main_file = Path{}; + // @findInlinedSubroutineDieForAddress fills inlineLocations[0] with the + // file+line of the non-inlined outer function making the call. + // locationInfo.name is already set by the caller by looking up the + // non-inlined function @address belongs to. + info.has_file_and_line = true; + info.file = call_locations[0].file; + info.line = call_locations[0].line; + + // The next inlined subroutine's call file and call line is the current + // caller's location. + for (size_t i = 0; i < num_found - 1; i++) + { + call_locations[i].file = call_locations[i + 1].file; + call_locations[i].line = call_locations[i + 1].line; + } + // CallLocation for the inner-most inlined function: + // - will be computed if enough space was available in the passed + // buffer. + // - will have a .name, but no !.file && !.line + // - its corresponding file+line is the one returned by LineVM based + // on @address. + // Use the inner-most inlined file+line info we got from the LineVM. + call_locations[num_found - 1].file = inner_most_file; + call_locations[num_found - 1].line = inner_most_line; + + // Fill in inline frames in reverse order (as expected by the caller). + std::reverse(call_locations.begin(), call_locations.end()); + for (const auto & call_location : call_locations) + { + SymbolizedFrame inline_frame; + inline_frame.found = true; + inline_frame.addr = address; + inline_frame.name = call_location.name.data(); + inline_frame.location.has_file_and_line = true; + inline_frame.location.file = call_location.file; + inline_frame.location.line = call_location.line; + inline_frames.push_back(inline_frame); + } + } + } + } + + return info.has_file_and_line; } -bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, LocationInfoMode mode) const +void Dwarf::findSubProgramDieForAddress( + const CompilationUnit & cu, const Die & die, uint64_t address, std::optional base_addr_cu, Die & subprogram) const +{ + forEachChild(cu, die, [&](const Die & child_die) + { + if (child_die.abbr.tag == DW_TAG_subprogram) + { + std::optional low_pc; + std::optional high_pc; + std::optional is_high_pc_addr; + std::optional range_offset; + forEachAttribute(cu, child_die, [&](const Attribute & attr) + { + switch (attr.spec.name) + { + case DW_AT_ranges: + range_offset = std::get(attr.attr_value); + break; + case DW_AT_low_pc: + low_pc = std::get(attr.attr_value); + break; + case DW_AT_high_pc: + // Value of DW_AT_high_pc attribute can be an address + // (DW_FORM_addr) or an offset (DW_FORM_data). 
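+                        // Per the DWARF spec, an address-class value is the absolute end address,
+                        // while a constant-class value is an offset from DW_AT_low_pc.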
+ is_high_pc_addr = (attr.spec.form == DW_FORM_addr); + high_pc = std::get(attr.attr_value); + break; + } + // Iterate through all attributes until find all above. + return true; + }); + bool pc_match = low_pc && high_pc && is_high_pc_addr && address >= *low_pc + && (address < (*is_high_pc_addr ? *high_pc : *low_pc + *high_pc)); + bool range_match = range_offset && isAddrInRangeList(address, base_addr_cu, range_offset.value(), cu.addr_size); + if (pc_match || range_match) + { + subprogram = child_die; + return false; + } + } + + findSubProgramDieForAddress(cu, child_die, address, base_addr_cu, subprogram); + + // Iterates through children until find the inline subprogram. + return true; + }); +} + +/** + * Find DW_TAG_inlined_subroutine child DIEs that contain @address and + * then extract: + * - Where was it called from (DW_AT_call_file & DW_AT_call_line): + * the statement or expression that caused the inline expansion. + * - The inlined function's name. As a function may be inlined multiple + * times, common attributes like DW_AT_linkage_name or DW_AT_name + * are only stored in its "concrete out-of-line instance" (a + * DW_TAG_subprogram) which we find using DW_AT_abstract_origin. + */ +void Dwarf::findInlinedSubroutineDieForAddress( + const CompilationUnit & cu, + const Die & die, + const LineNumberVM & line_vm, + uint64_t address, + std::optional base_addr_cu, + std::vector & locations, + const size_t max_size) const +{ + if (locations.size() >= max_size) + { + return; + } + + forEachChild(cu, die, [&](const Die & child_die) + { + // Between a DW_TAG_subprogram and and DW_TAG_inlined_subroutine we might + // have arbitrary intermediary "nodes", including DW_TAG_common_block, + // DW_TAG_lexical_block, DW_TAG_try_block, DW_TAG_catch_block and + // DW_TAG_with_stmt, etc. + // We can't filter with locationhere since its range may be not specified. + // See section 2.6.2: A location list containing only an end of list entry + // describes an object that exists in the source code but not in the + // executable program. + if (child_die.abbr.tag == DW_TAG_try_block || child_die.abbr.tag == DW_TAG_catch_block || child_die.abbr.tag == DW_TAG_entry_point + || child_die.abbr.tag == DW_TAG_common_block || child_die.abbr.tag == DW_TAG_lexical_block) + { + findInlinedSubroutineDieForAddress(cu, child_die, line_vm, address, base_addr_cu, locations, max_size); + return true; + } + + std::optional low_pc; + std::optional high_pc; + std::optional is_high_pc_addr; + std::optional abstract_origin; + std::optional abstract_origin_ref_type; + std::optional call_file; + std::optional call_line; + std::optional range_offset; + forEachAttribute(cu, child_die, [&](const Attribute & attr) + { + switch (attr.spec.name) + { + case DW_AT_ranges: + range_offset = std::get(attr.attr_value); + break; + case DW_AT_low_pc: + low_pc = std::get(attr.attr_value); + break; + case DW_AT_high_pc: + // Value of DW_AT_high_pc attribute can be an address + // (DW_FORM_addr) or an offset (DW_FORM_data). + is_high_pc_addr = (attr.spec.form == DW_FORM_addr); + high_pc = std::get(attr.attr_value); + break; + case DW_AT_abstract_origin: + abstract_origin_ref_type = attr.spec.form; + abstract_origin = std::get(attr.attr_value); + break; + case DW_AT_call_line: + call_line = std::get(attr.attr_value); + break; + case DW_AT_call_file: + call_file = std::get(attr.attr_value); + break; + } + // Iterate through all until find all above attributes. 
+ return true; + }); + + // 2.17 Code Addresses and Ranges + // Any debugging information entry describing an entity that has a + // machine code address or range of machine code addresses, + // which includes compilation units, module initialization, subroutines, + // ordinary blocks, try/catch blocks, labels and the like, may have + // - A DW_AT_low_pc attribute for a single address, + // - A DW_AT_low_pc and DW_AT_high_pc pair of attributes for a + // single contiguous range of addresses, or + // - A DW_AT_ranges attribute for a non-contiguous range of addresses. + // TODO: Support DW_TAG_entry_point and DW_TAG_common_block that don't + // have DW_AT_low_pc/DW_AT_high_pc pairs and DW_AT_ranges. + // TODO: Support relocated address which requires lookup in relocation map. + bool pc_match + = low_pc && high_pc && is_high_pc_addr && address >= *low_pc && (address < (*is_high_pc_addr ? *high_pc : *low_pc + *high_pc)); + bool range_match = range_offset && isAddrInRangeList(address, base_addr_cu, range_offset.value(), cu.addr_size); + if (!pc_match && !range_match) + { + // Address doesn't match. Keep searching other children. + return true; + } + + if (!abstract_origin || !abstract_origin_ref_type || !call_line || !call_file) + { + // We expect a single sibling DIE to match on addr, but it's missing + // required fields. Stop searching for other DIEs. + return false; + } + + CallLocation location; + location.file = line_vm.getFullFileName(*call_file); + location.line = *call_line; + + auto get_function_name = [&](const CompilationUnit & srcu, uint64_t die_offset) + { + auto decl_die = getDieAtOffset(srcu, die_offset); + // Jump to the actual function definition instead of declaration for name + // and line info. + auto def_die = findDefinitionDie(srcu, decl_die); + + std::string_view name; + // The file and line will be set in the next inline subroutine based on + // its DW_AT_call_file and DW_AT_call_line. + forEachAttribute(srcu, def_die, [&](const Attribute & attr) + { + switch (attr.spec.name) + { + case DW_AT_linkage_name: + name = std::get(attr.attr_value); + break; + case DW_AT_name: + // NOTE: when DW_AT_linkage_name and DW_AT_name match, dwarf + // emitters omit DW_AT_linkage_name (to save space). If present + // DW_AT_linkage_name should always be preferred (mangled C++ name + // vs just the function name). + if (name.empty()) + { + name = std::get(attr.attr_value); + } + break; + } + return true; + }); + return name; + }; + + // DW_AT_abstract_origin is a reference. There a 3 types of references: + // - the reference can identify any debugging information entry within the + // compilation unit (DW_FORM_ref1, DW_FORM_ref2, DW_FORM_ref4, + // DW_FORM_ref8, DW_FORM_ref_udata). This type of reference is an offset + // from the first byte of the compilation header for the compilation unit + // containing the reference. + // - the reference can identify any debugging information entry within a + // .debug_info section; in particular, it may refer to an entry in a + // different compilation unit (DW_FORM_ref_addr) + // - the reference can identify any debugging information type entry that + // has been placed in its own type unit. + // Not applicable for DW_AT_abstract_origin. + location.name = (*abstract_origin_ref_type != DW_FORM_ref_addr) + ? 
get_function_name(cu, cu.offset + *abstract_origin) + : get_function_name(findCompilationUnit(info_, *abstract_origin), *abstract_origin); + + locations.push_back(location); + + findInlinedSubroutineDieForAddress(cu, child_die, line_vm, address, base_addr_cu, locations, max_size); + + return false; + }); +} + +bool Dwarf::findAddress( + uintptr_t address, LocationInfo & locationInfo, LocationInfoMode mode, std::vector & inline_frames) const { locationInfo = LocationInfo(); @@ -635,10 +1113,9 @@ bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, Location if (findDebugInfoOffset(address, aranges_, offset)) { // Read compilation unit header from .debug_info - std::string_view info_entry(info_); - info_entry.remove_prefix(offset); - findLocation(address, info_entry, locationInfo); - return locationInfo.hasFileAndLine; + auto unit = getCompilationUnit(info_, offset); + findLocation(address, mode, unit, locationInfo, inline_frames); + return locationInfo.has_file_and_line; } else if (mode == LocationInfoMode::FAST) { @@ -650,20 +1127,92 @@ bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, Location } else { - SAFE_CHECK(mode == LocationInfoMode::FULL, "unexpected mode"); + SAFE_CHECK(mode == LocationInfoMode::FULL || mode == LocationInfoMode::FULL_WITH_INLINE, "unexpected mode"); // Fall back to the linear scan. } } // Slow path (linear scan): Iterate over all .debug_info entries // and look for the address in each compilation unit. - std::string_view info_entry(info_); - while (!info_entry.empty() && !locationInfo.hasFileAndLine) - findLocation(address, info_entry, locationInfo); + uint64_t offset = 0; + while (offset < info_.size() && !locationInfo.has_file_and_line) + { + auto unit = getCompilationUnit(info_, offset); + offset += unit.size; + findLocation(address, mode, unit, locationInfo, inline_frames); + } - return locationInfo.hasFileAndLine; + return locationInfo.has_file_and_line; } +bool Dwarf::isAddrInRangeList(uint64_t address, std::optional base_addr, size_t offset, uint8_t addr_size) const +{ + SAFE_CHECK(addr_size == 4 || addr_size == 8, "wrong address size"); + if (ranges_.empty()) + { + return false; + } + + const bool is_64bit_addr = addr_size == 8; + std::string_view sp = ranges_; + sp.remove_prefix(offset); + const uint64_t max_addr = is_64bit_addr ? std::numeric_limits::max() : std::numeric_limits::max(); + while (!sp.empty()) + { + uint64_t begin = readOffset(sp, is_64bit_addr); + uint64_t end = readOffset(sp, is_64bit_addr); + // The range list entry is a base address selection entry. + if (begin == max_addr) + { + base_addr = end; + continue; + } + // The range list entry is an end of list entry. + if (begin == 0 && end == 0) + { + break; + } + // Check if the given address falls in the range list entry. + // 2.17.3 Non-Contiguous Address Ranges + // The applicable base address of a range list entry is determined by the + // closest preceding base address selection entry (see below) in the same + // range list. If there is no such selection entry, then the applicable base + // address defaults to the base address of the compilation unit. 
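+        // E.g. with base_addr = 0x400000 and an entry [begin = 0x10, end = 0x20),
+        // addresses 0x400010..0x40001f are inside the range.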
+ if (base_addr && address >= begin + *base_addr && address < end + *base_addr) + { + return true; + } + } + + return false; +} + +// static +Dwarf::CompilationUnit Dwarf::findCompilationUnit(std::string_view info, uint64_t targetOffset) +{ + SAFE_CHECK(targetOffset < info.size(), "unexpected target address"); + uint64_t offset = 0; + while (offset < info.size()) + { + std::string_view chunk(info); + chunk.remove_prefix(offset); + + auto initial_length = read(chunk); + auto is_64bit = (initial_length == uint32_t(-1)); + auto size = is_64bit ? read(chunk) : initial_length; + SAFE_CHECK(size <= chunk.size(), "invalid chunk size"); + size += is_64bit ? 12 : 4; + + if (offset + size > targetOffset) + { + break; + } + offset += size; + } + return getCompilationUnit(info, offset); +} + + Dwarf::LineNumberVM::LineNumberVM(std::string_view data, std::string_view compilationDirectory) : compilationDirectory_(compilationDirectory) { diff --git a/src/Common/Dwarf.h b/src/Common/Dwarf.h index 40badc1c5a4..9ea940c3380 100644 --- a/src/Common/Dwarf.h +++ b/src/Common/Dwarf.h @@ -21,9 +21,13 @@ /** This file was edited for ClickHouse. */ +#include +#include +#include #include #include #include +#include namespace DB @@ -61,7 +65,13 @@ class Dwarf final // be live for as long as the passed-in Elf is live. public: /** Create a DWARF parser around an ELF file. */ - explicit Dwarf(const Elf & elf); + explicit Dwarf(const std::shared_ptr & elf); + + /** + * More than one location info may exist if current frame is an inline + * function call. + */ + static constexpr uint32_t kMaxInlineLocationInfoPerFrame = 10; /** * Represent a file path a s collection of three parts (base directory, @@ -70,7 +80,7 @@ public: class Path { public: - Path() {} + Path() = default; Path(std::string_view baseDir, std::string_view subDir, std::string_view file); @@ -107,6 +117,14 @@ public: std::string_view file_; }; + // Indicates inline function `name` is called at `line@file`. + struct CallLocation + { + Path file = {}; + uint64_t line; + std::string_view name; + }; + enum class LocationInfoMode { // Don't resolve location info. @@ -115,30 +133,47 @@ public: FAST, // Scan all CU in .debug_info (slow!) on .debug_aranges lookup failure. FULL, + // Scan .debug_info (super slower, use with caution) for inline functions in + // addition to FULL. + FULL_WITH_INLINE, }; struct LocationInfo { - bool hasMainFile = false; - Path mainFile; + bool has_main_file = false; + Path main_file; - bool hasFileAndLine = false; + bool has_file_and_line = false; Path file; uint64_t line = 0; }; + /** + * Frame information: symbol name and location. + */ + struct SymbolizedFrame + { + bool found = false; + uintptr_t addr = 0; + // Mangled symbol name. Use `folly::demangle()` to demangle it. + const char * name = nullptr; + LocationInfo location; + std::shared_ptr file; + + void clear() { *this = SymbolizedFrame(); } + }; + /** Find the file and line number information corresponding to address. * The address must be physical - offset in object file without offset in virtual memory where the object is loaded. 
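+      * In FULL_WITH_INLINE mode the inlined call chain is additionally appended
+      * to `inline_frames`, innermost call first.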
*/ - bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode) const; + bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode, std::vector & inline_frames) const; private: static bool findDebugInfoOffset(uintptr_t address, std::string_view aranges, uint64_t & offset); void init(); - bool findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & info) const; - const Elf * elf_; + std::shared_ptr elf_; // DWARF section made up of chunks, each prefixed with a length header. // The length indicates whether the chunk is DWARF-32 or DWARF-64, which @@ -169,17 +204,81 @@ private: { uint64_t code; uint64_t tag; - bool hasChildren; - - struct Attribute - { - uint64_t name; - uint64_t form; - }; + bool has_children = false; std::string_view attributes; }; + // Debugging information entry to define a low-level representation of a + // source program. Each debugging information entry consists of an identifying + // tag and a series of attributes. An entry, or group of entries together, + // provide a description of a corresponding entity in the source program. + struct Die + { + bool is64Bit; + // Offset from start to first attribute + uint8_t attr_offset; + // Offset within debug info. + uint32_t offset; + uint64_t code; + DIEAbbreviation abbr; + }; + + struct AttributeSpec + { + uint64_t name = 0; + uint64_t form = 0; + + explicit operator bool() const { return name != 0 || form != 0; } + }; + + struct Attribute + { + AttributeSpec spec; + const Die & die; + std::variant attr_value; + }; + + struct CompilationUnit + { + bool is64Bit; + uint8_t version; + uint8_t addr_size; + // Offset in .debug_info of this compilation unit. + uint32_t offset; + uint32_t size; + // Offset in .debug_info for the first DIE in this compilation unit. + uint32_t first_die; + uint64_t abbrev_offset; + // Only the CompilationUnit that contains the caller functions needs this cache. + // Indexed by (abbr.code - 1) if (abbr.code - 1) < abbrCache.size(); + std::vector abbr_cache; + }; + + static CompilationUnit getCompilationUnit(std::string_view info, uint64_t offset); + + /** cu must exist during the life cycle of created detail::Die. */ + Die getDieAtOffset(const CompilationUnit & cu, uint64_t offset) const; + + /** + * Find the actual definition DIE instead of declaration for the given die. + */ + Die findDefinitionDie(const CompilationUnit & cu, const Die & die) const; + + bool findLocation( + uintptr_t address, + LocationInfoMode mode, + CompilationUnit & cu, + LocationInfo & info, + std::vector & inline_frames) const; + + /** + * Finds a subprogram debugging info entry that contains a given address among + * children of given die. Depth first search. + */ + void findSubProgramDieForAddress( + const CompilationUnit & cu, const Die & die, uint64_t address, std::optional base_addr_cu, Die & subprogram) const; + // Interpreter for the line number bytecode VM class LineNumberVM { @@ -188,6 +287,13 @@ private: bool findAddress(uintptr_t target, Path & file, uint64_t & line); + /** Gets full file name at given index including directory. */ + Path getFullFileName(uint64_t index) const + { + auto fn = getFileName(index); + return Path({}, getIncludeDirectory(fn.directoryIndex), fn.relativeName); + } + private: void init(); void reset(); @@ -259,18 +365,50 @@ private: uint64_t discriminator_; }; + /** + * Finds inlined subroutine DIEs and their caller lines that contains a given + * address among children of given die. Depth first search. 
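 * Editor's note: the search presumably stops once `locations` reaches
 * `max_size`; the intended cap appears to be kMaxInlineLocationInfoPerFrame
 * (10), declared above, so at most ten CallLocation entries per frame.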
+ */ + void findInlinedSubroutineDieForAddress( + const CompilationUnit & cu, + const Die & die, + const LineNumberVM & line_vm, + uint64_t address, + std::optional base_addr_cu, + std::vector & locations, + size_t max_size) const; + // Read an abbreviation from a std::string_view, return true if at end; remove_prefix section static bool readAbbreviation(std::string_view & section, DIEAbbreviation & abbr); + static void readCompilationUnitAbbrs(std::string_view abbrev, CompilationUnit & cu); + + /** + * Iterates over all children of a debugging info entry, calling the given + * callable for each. Iteration is stopped early if any of the calls return + * false. Returns the offset of next DIE after iterations. + */ + size_t forEachChild(const CompilationUnit & cu, const Die & die, std::function f) const; + // Get abbreviation corresponding to a code, in the chunk starting at // offset in the .debug_abbrev section DIEAbbreviation getAbbreviation(uint64_t code, uint64_t offset) const; + /** + * Iterates over all attributes of a debugging info entry, calling the given + * callable for each. If all attributes are visited, then return the offset of + * next DIE, or else iteration is stopped early and return size_t(-1) if any + * of the calls return false. + */ + size_t forEachAttribute(const CompilationUnit & cu, const Die & die, std::function f) const; + + Attribute readAttribute(const Die & die, AttributeSpec spec, std::string_view & info) const; + // Read one attribute pair, remove_prefix sp; returns <0, 0> at end. - static DIEAbbreviation::Attribute readAttribute(std::string_view & sp); + static AttributeSpec readAttributeSpec(std::string_view & sp); // Read one attribute value, remove_prefix sp - typedef std::variant AttributeValue; + using AttributeValue = std::variant; AttributeValue readAttributeValue(std::string_view & sp, uint64_t form, bool is64Bit) const; // Get an ELF section by name, return true if found @@ -279,11 +417,34 @@ private: // Get a string from the .debug_str section std::string_view getStringFromStringSection(uint64_t offset) const; + template + std::optional getAttribute(const CompilationUnit & cu, const Die & die, uint64_t attr_name) const + { + std::optional result; + forEachAttribute(cu, die, [&](const Attribute & attr) + { + if (attr.spec.name == attr_name) + { + result = std::get(attr.attr_value); + return false; + } + return true; + }); + return result; + } + + // Check if the given address is in the range list at the given offset in .debug_ranges. + bool isAddrInRangeList(uint64_t address, std::optional base_addr, size_t offset, uint8_t addr_size) const; + + // Finds the Compilation Unit starting at offset. 
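// Editor's note: the matching definition earlier in this patch walks unit
// headers by reading the initial length (with 0xffffffff escaping to a 64-bit
// DWARF-64 length) and advancing by that length plus the header bytes
// themselves (12 for DWARF-64, 4 for DWARF-32) until it reaches the unit
// containing targetOffset.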
+ static CompilationUnit findCompilationUnit(std::string_view info, uint64_t targetOffset); + std::string_view info_; // .debug_info std::string_view abbrev_; // .debug_abbrev std::string_view aranges_; // .debug_aranges std::string_view line_; // .debug_line std::string_view strings_; // .debug_str + std::string_view ranges_; // .debug_ranges }; } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index cf758691cec..d0d83448b68 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -534,6 +534,7 @@ M(565, TOO_MANY_PARTITIONS) \ M(566, CANNOT_RMDIR) \ M(567, DUPLICATED_PART_UUIDS) \ + M(568, RAFT_ERROR) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/HashTable/HashTable.h b/src/Common/HashTable/HashTable.h index bf159e27731..892bd0b2ba9 100644 --- a/src/Common/HashTable/HashTable.h +++ b/src/Common/HashTable/HashTable.h @@ -539,7 +539,8 @@ protected: * after transferring all the elements from the old halves you need to [ o x ] * process tail from the collision resolution chain immediately after it [ o x ] */ - for (; !buf[i].isZero(*this); ++i) + size_t new_size = grower.bufSize(); + for (; i < new_size && !buf[i].isZero(*this); ++i) { size_t updated_place_value = reinsert(buf[i], buf[i].getHash(*this)); diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index d037142fbfb..a584885cf0f 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -24,8 +24,8 @@ namespace /// /// - when it is explicitly blocked with LockExceptionInThread /// -/// - to avoid std::terminate(), when stack unwinding is current in progress in -/// this thread. +/// - to avoid std::terminate(), when stack unwinding is currently in progress +/// in this thread. /// /// NOTE: that since C++11 destructor marked with noexcept by default, and /// this means that any throw from destructor (that is not marked with diff --git a/src/Common/PODArray.cpp b/src/Common/PODArray.cpp index e0b17c8125c..c1edc5bafad 100644 --- a/src/Common/PODArray.cpp +++ b/src/Common/PODArray.cpp @@ -6,4 +6,14 @@ namespace DB /// Used for left padding of PODArray when empty const char empty_pod_array[empty_pod_array_size]{}; +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; + +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; +template class PODArray, 15, 16>; + } diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index f0cc9df11cd..8e05dfea8b3 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -725,4 +725,16 @@ void swap(PODArray & lhs, PODArray, 15, 16>; +extern template class PODArray, 15, 16>; +extern template class PODArray, 15, 16>; +extern template class PODArray, 15, 16>; + +extern template class PODArray, 15, 16>; +extern template class PODArray, 15, 16>; +extern template class PODArray, 15, 16>; +extern template class PODArray, 15, 16>; + } diff --git a/src/Common/PODArray_fwd.h b/src/Common/PODArray_fwd.h index f817d2f6dde..22f9230c01c 100644 --- a/src/Common/PODArray_fwd.h +++ b/src/Common/PODArray_fwd.h @@ -3,8 +3,8 @@ * This file contains some using-declarations that define various kinds of * PODArray. 
*/ -#pragma once +#include #include namespace DB diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp index 44f6b9e5443..c4cf7f11e68 100644 --- a/src/Common/StackTrace.cpp +++ b/src/Common/StackTrace.cpp @@ -217,10 +217,12 @@ void StackTrace::symbolize(const StackTrace::FramePointers & frame_pointers, siz current_frame.object = object->name; if (std::filesystem::exists(current_frame.object.value())) { - auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first; + auto dwarf_it = dwarfs.try_emplace(object->name, object->elf).first; DB::Dwarf::LocationInfo location; - if (dwarf_it->second.findAddress(uintptr_t(current_frame.physical_addr), location, DB::Dwarf::LocationInfoMode::FAST)) + std::vector inline_frames; + if (dwarf_it->second.findAddress( + uintptr_t(current_frame.physical_addr), location, DB::Dwarf::LocationInfoMode::FAST, inline_frames)) { current_frame.file = location.file.toString(); current_frame.line = location.line; @@ -314,7 +316,11 @@ const StackTrace::FramePointers & StackTrace::getFramePointers() const } static void toStringEveryLineImpl( - const StackTrace::FramePointers & frame_pointers, size_t offset, size_t size, std::function callback) + bool fatal, + const StackTrace::FramePointers & frame_pointers, + size_t offset, + size_t size, + std::function callback) { if (size == 0) return callback(""); @@ -324,11 +330,12 @@ static void toStringEveryLineImpl( const DB::SymbolIndex & symbol_index = *symbol_index_ptr; std::unordered_map dwarfs; - std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM out.exceptions(std::ios::failbit); for (size_t i = offset; i < size; ++i) { + std::vector inline_frames; const void * virtual_addr = frame_pointers[i]; const auto * object = symbol_index.findObject(virtual_addr); uintptr_t virtual_offset = object ? uintptr_t(object->address_begin) : 0; @@ -340,10 +347,11 @@ static void toStringEveryLineImpl( { if (std::filesystem::exists(object->name)) { - auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first; + auto dwarf_it = dwarfs.try_emplace(object->name, object->elf).first; DB::Dwarf::LocationInfo location; - if (dwarf_it->second.findAddress(uintptr_t(physical_addr), location, DB::Dwarf::LocationInfoMode::FAST)) + auto mode = fatal ? DB::Dwarf::LocationInfoMode::FULL_WITH_INLINE : DB::Dwarf::LocationInfoMode::FAST; + if (dwarf_it->second.findAddress(uintptr_t(physical_addr), location, mode, inline_frames)) out << location.file.toString() << ":" << location.line << ": "; } } @@ -360,11 +368,20 @@ static void toStringEveryLineImpl( out << " @ " << physical_addr; out << " in " << (object ? object->name : "?"); + for (size_t j = 0; j < inline_frames.size(); ++j) + { + const auto & frame = inline_frames[j]; + int status = 0; + callback(fmt::format("{}.{}. 
inlined from {}:{}: {}", + i, j+1, frame.location.file.toString(), frame.location.line, demangle(frame.name, status))); + } + callback(out.str()); out.str({}); } #else - std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + UNUSED(fatal); + std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM out.exceptions(std::ios::failbit); for (size_t i = offset; i < size; ++i) @@ -382,13 +399,13 @@ static std::string toStringImpl(const StackTrace::FramePointers & frame_pointers { std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM out.exceptions(std::ios::failbit); - toStringEveryLineImpl(frame_pointers, offset, size, [&](const std::string & str) { out << str << '\n'; }); + toStringEveryLineImpl(false, frame_pointers, offset, size, [&](const std::string & str) { out << str << '\n'; }); return out.str(); } void StackTrace::toStringEveryLine(std::function callback) const { - toStringEveryLineImpl(frame_pointers, offset, size, std::move(callback)); + toStringEveryLineImpl(true, frame_pointers, offset, size, std::move(callback)); } diff --git a/src/Common/StackTrace.h b/src/Common/StackTrace.h index b2e14a01f03..58660f9e4da 100644 --- a/src/Common/StackTrace.h +++ b/src/Common/StackTrace.h @@ -51,10 +51,10 @@ public: /// Tries to capture stack trace. Fallbacks on parsing caller address from /// signal context if no stack trace could be captured - StackTrace(const ucontext_t & signal_context); + explicit StackTrace(const ucontext_t & signal_context); /// Creates empty object for deferred initialization - StackTrace(NoCapture); + explicit StackTrace(NoCapture); size_t getSize() const; size_t getOffset() const; @@ -65,6 +65,7 @@ public: static void symbolize(const FramePointers & frame_pointers, size_t offset, size_t size, StackTrace::Frames & frames); void toStringEveryLine(std::function callback) const; + protected: void tryCapture(); diff --git a/src/Common/SymbolIndex.h b/src/Common/SymbolIndex.h index b310f90988e..65e446a7fc4 100644 --- a/src/Common/SymbolIndex.h +++ b/src/Common/SymbolIndex.h @@ -36,7 +36,7 @@ public: const void * address_begin; const void * address_end; std::string name; - std::unique_ptr elf; + std::shared_ptr elf; }; /// Address in virtual memory should be passed. These addresses include offset where the object is loaded in memory. diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 9d4a2ebb16a..c53ea60ec7c 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -331,7 +331,7 @@ public: class IKeeper { public: - virtual ~IKeeper() {} + virtual ~IKeeper() = default; /// If expired, you can only destroy the object. All other methods will throw exception. 
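/// Editor's usage sketch (hypothetical caller code; makeNewSession() is not a
/// real API here): an expired session cannot be reused, so callers recreate it:
///     if (keeper->isExpired())
///         keeper = makeNewSession();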
virtual bool isExpired() const = 0; diff --git a/src/Common/ZooKeeper/TestKeeperStorageDispatcher.cpp b/src/Common/ZooKeeper/TestKeeperStorageDispatcher.cpp deleted file mode 100644 index 434a6a2e747..00000000000 --- a/src/Common/ZooKeeper/TestKeeperStorageDispatcher.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - - extern const int LOGICAL_ERROR; - extern const int TIMEOUT_EXCEEDED; -} - -} -namespace zkutil -{ - -void TestKeeperStorageDispatcher::processingThread() -{ - setThreadName("TestKeeperSProc"); - - while (!shutdown) - { - RequestInfo info; - - UInt64 max_wait = UInt64(operation_timeout.totalMilliseconds()); - - if (requests_queue.tryPop(info, max_wait)) - { - if (shutdown) - break; - - try - { - auto responses = storage.processRequest(info.request, info.session_id); - for (const auto & response_for_session : responses) - setResponse(response_for_session.session_id, response_for_session.response); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - } - } -} - -void TestKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) -{ - std::lock_guard lock(session_to_response_callback_mutex); - auto session_writer = session_to_response_callback.find(session_id); - if (session_writer == session_to_response_callback.end()) - return; - - session_writer->second(response); - /// Session closed, no more writes - if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) - session_to_response_callback.erase(session_writer); -} - -void TestKeeperStorageDispatcher::finalize() -{ - { - std::lock_guard lock(push_request_mutex); - - if (shutdown) - return; - - shutdown = true; - - if (processing_thread.joinable()) - processing_thread.join(); - } - - RequestInfo info; - TestKeeperStorage::RequestsForSessions expired_requests; - while (requests_queue.tryPop(info)) - expired_requests.push_back(TestKeeperStorage::RequestForSession{info.session_id, info.request}); - - auto expired_responses = storage.finalize(expired_requests); - - for (const auto & response_for_session : expired_responses) - setResponse(response_for_session.session_id, response_for_session.response); -} - -void TestKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id) -{ - - { - std::lock_guard lock(session_to_response_callback_mutex); - if (session_to_response_callback.count(session_id) == 0) - throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown session id {}", session_id); - } - - RequestInfo request_info; - request_info.time = clock::now(); - request_info.request = request; - request_info.session_id = session_id; - - std::lock_guard lock(push_request_mutex); - /// Put close requests without timeouts - if (request->getOpNum() == Coordination::OpNum::Close) - requests_queue.push(std::move(request_info)); - else if (!requests_queue.tryPush(std::move(request_info), operation_timeout.totalMilliseconds())) - throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); -} - -TestKeeperStorageDispatcher::TestKeeperStorageDispatcher() -{ - processing_thread = ThreadFromGlobalPool([this] { processingThread(); }); -} - -TestKeeperStorageDispatcher::~TestKeeperStorageDispatcher() -{ - try - { - finalize(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } -} - -void TestKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback) -{ - std::lock_guard lock(session_to_response_callback_mutex); - if (!session_to_response_callback.try_emplace(session_id, callback).second) - throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id); -} - -void TestKeeperStorageDispatcher::finishSession(int64_t session_id) -{ - std::lock_guard lock(session_to_response_callback_mutex); - auto session_it = session_to_response_callback.find(session_id); - if (session_it != session_to_response_callback.end()) - session_to_response_callback.erase(session_it); -} - -} diff --git a/src/Common/ZooKeeper/TestKeeperStorageDispatcher.h b/src/Common/ZooKeeper/TestKeeperStorageDispatcher.h deleted file mode 100644 index a86895b5be1..00000000000 --- a/src/Common/ZooKeeper/TestKeeperStorageDispatcher.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace zkutil -{ - -using ZooKeeperResponseCallback = std::function; - -class TestKeeperStorageDispatcher -{ -private: - Poco::Timespan operation_timeout{0, Coordination::DEFAULT_OPERATION_TIMEOUT_MS * 1000}; - - using clock = std::chrono::steady_clock; - - struct RequestInfo - { - Coordination::ZooKeeperRequestPtr request; - clock::time_point time; - int64_t session_id; - }; - - std::mutex push_request_mutex; - - using RequestsQueue = ConcurrentBoundedQueue; - RequestsQueue requests_queue{1}; - std::atomic shutdown{false}; - using SessionToResponseCallback = std::unordered_map; - - std::mutex session_to_response_callback_mutex; - SessionToResponseCallback session_to_response_callback; - - ThreadFromGlobalPool processing_thread; - - TestKeeperStorage storage; - -private: - void processingThread(); - void finalize(); - void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); - -public: - TestKeeperStorageDispatcher(); - ~TestKeeperStorageDispatcher(); - - void putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); - int64_t getSessionID() - { - return storage.getSessionID(); - } - void registerSession(int64_t session_id, ZooKeeperResponseCallback callback); - /// Call if we don't need any responses for this session no more (session was expired) - void finishSession(int64_t session_id); -}; - -} diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 4537d5ad8cd..a1c6eb9b481 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -602,7 +602,7 @@ void ZooKeeper::removeChildren(const std::string & path) } -void ZooKeeper::removeChildrenRecursive(const std::string & path) +void ZooKeeper::removeChildrenRecursive(const std::string & path, const String & keep_child_node) { Strings children = getChildren(path); while (!children.empty()) @@ -611,14 +611,15 @@ void ZooKeeper::removeChildrenRecursive(const std::string & path) for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { removeChildrenRecursive(path + "/" + children.back()); - ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); children.pop_back(); } multi(ops); } } -void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) +void ZooKeeper::tryRemoveChildrenRecursive(const 
std::string & path, const String & keep_child_node) { Strings children; if (tryGetChildren(path, children) != Coordination::Error::ZOK) @@ -629,14 +630,14 @@ void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) Strings batch; for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { - batch.push_back(path + "/" + children.back()); + String child_path = path + "/" + children.back(); + tryRemoveChildrenRecursive(child_path); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + { + batch.push_back(child_path); + ops.emplace_back(zkutil::makeRemoveRequest(child_path, -1)); + } children.pop_back(); - tryRemoveChildrenRecursive(batch.back()); - - Coordination::RemoveRequest request; - request.path = batch.back(); - - ops.emplace_back(std::make_shared(std::move(request))); } /// Try to remove the children with a faster method - in bulk. If this fails, diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 0d9dc104c48..90d15e2ac4a 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -184,6 +184,12 @@ public: /// result would be the same as for the single call. void tryRemoveRecursive(const std::string & path); + /// Similar to removeRecursive(...) and tryRemoveRecursive(...), but does not remove path itself. + /// If keep_child_node is not empty, this method will not remove path/keep_child_node (but will remove its subtree). + /// It can be useful to keep some child node as a flag which indicates that path is currently removing. + void removeChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + void tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); @@ -246,9 +252,6 @@ private: void init(const std::string & implementation_, const std::string & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_); - void removeChildrenRecursive(const std::string & path); - void tryRemoveChildrenRecursive(const std::string & path); - /// The following methods don't throw exceptions but return error codes. Coordination::Error createImpl(const std::string & path, const std::string & data, int32_t mode, std::string & path_created); Coordination::Error removeImpl(const std::string & path, int32_t version); @@ -320,7 +323,7 @@ public: catch (...) 
{ ProfileEvents::increment(ProfileEvents::CannotRemoveEphemeralNode); - DB::tryLogCurrentException(__PRETTY_FUNCTION__); + DB::tryLogCurrentException(__PRETTY_FUNCTION__, "Cannot remove " + path + ": "); } } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 9c699ee298a..56f9de31ec8 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -37,6 +37,26 @@ void ZooKeeperRequest::write(WriteBuffer & out) const out.next(); } +void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const +{ + Coordination::write(path, out); +} + +void ZooKeeperSyncRequest::readImpl(ReadBuffer & in) +{ + Coordination::read(path, in); +} + +void ZooKeeperSyncResponse::readImpl(ReadBuffer & in) +{ + Coordination::read(path, in); +} + +void ZooKeeperSyncResponse::writeImpl(WriteBuffer & out) const +{ + Coordination::write(path, out); +} + void ZooKeeperWatchResponse::readImpl(ReadBuffer & in) { Coordination::read(type, in); @@ -51,6 +71,13 @@ void ZooKeeperWatchResponse::writeImpl(WriteBuffer & out) const Coordination::write(path, out); } +void ZooKeeperWatchResponse::write(WriteBuffer & out) const +{ + if (error == Error::ZOK) + ZooKeeperResponse::write(out); + /// skip bad responses for watches +} + void ZooKeeperAuthRequest::writeImpl(WriteBuffer & out) const { Coordination::write(type, out); @@ -326,6 +353,12 @@ void ZooKeeperMultiRequest::readImpl(ReadBuffer & in) } } +bool ZooKeeperMultiRequest::isReadRequest() const +{ + /// Possibly we can do better + return false; +} + void ZooKeeperMultiResponse::readImpl(ReadBuffer & in) { for (auto & response : responses) @@ -410,6 +443,7 @@ void ZooKeeperMultiResponse::writeImpl(WriteBuffer & out) const } ZooKeeperResponsePtr ZooKeeperHeartbeatRequest::makeResponse() const { return std::make_shared(); } +ZooKeeperResponsePtr ZooKeeperSyncRequest::makeResponse() const { return std::make_shared(); } ZooKeeperResponsePtr ZooKeeperAuthRequest::makeResponse() const { return std::make_shared(); } ZooKeeperResponsePtr ZooKeeperCreateRequest::makeResponse() const { return std::make_shared(); } ZooKeeperResponsePtr ZooKeeperRemoveRequest::makeResponse() const { return std::make_shared(); } @@ -465,6 +499,7 @@ void registerZooKeeperRequest(ZooKeeperRequestFactory & factory) ZooKeeperRequestFactory::ZooKeeperRequestFactory() { registerZooKeeperRequest(*this); + registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 9adb0c06e4c..92b1e7c9858 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -30,7 +30,7 @@ struct ZooKeeperResponse : virtual Response virtual ~ZooKeeperResponse() override = default; virtual void readImpl(ReadBuffer &) = 0; virtual void writeImpl(WriteBuffer &) const = 0; - void write(WriteBuffer & out) const; + virtual void write(WriteBuffer & out) const; virtual OpNum getOpNum() const = 0; }; @@ -60,6 +60,7 @@ struct ZooKeeperRequest : virtual Request static std::shared_ptr read(ReadBuffer & in); virtual ZooKeeperResponsePtr makeResponse() const = 0; + virtual bool isReadRequest() const = 0; }; using ZooKeeperRequestPtr = std::shared_ptr; @@ -71,6 +72,26 @@ struct ZooKeeperHeartbeatRequest final : ZooKeeperRequest void writeImpl(WriteBuffer &) const override {} void readImpl(ReadBuffer &) override {} ZooKeeperResponsePtr makeResponse() const 
override; + bool isReadRequest() const override { return false; } +}; + +struct ZooKeeperSyncRequest final : ZooKeeperRequest +{ + String path; + String getPath() const override { return path; } + OpNum getOpNum() const override { return OpNum::Sync; } + void writeImpl(WriteBuffer & out) const override; + void readImpl(ReadBuffer & in) override; + ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } +}; + +struct ZooKeeperSyncResponse final : ZooKeeperResponse +{ + String path; + void readImpl(ReadBuffer & in) override; + void writeImpl(WriteBuffer & out) const override; + OpNum getOpNum() const override { return OpNum::Sync; } }; struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse @@ -86,6 +107,8 @@ struct ZooKeeperWatchResponse final : WatchResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; + void write(WriteBuffer & out) const override; + OpNum getOpNum() const override { throw Exception("OpNum for watch response doesn't exist", Error::ZRUNTIMEINCONSISTENCY); @@ -104,6 +127,7 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } }; struct ZooKeeperAuthResponse final : ZooKeeperResponse @@ -122,6 +146,7 @@ struct ZooKeeperCloseRequest final : ZooKeeperRequest void readImpl(ReadBuffer &) override {} ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } }; struct ZooKeeperCloseResponse final : ZooKeeperResponse @@ -146,6 +171,7 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } }; struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse @@ -167,6 +193,7 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } }; struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse @@ -183,6 +210,7 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return !has_watch; } }; struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse @@ -199,6 +227,7 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return !has_watch; } }; struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse @@ -217,6 +246,7 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest void writeImpl(WriteBuffer & out) const override; void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } }; struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse @@ -232,6 +262,7 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest void writeImpl(WriteBuffer & out) const override; void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return !has_watch; } }; struct 
ZooKeeperSimpleListRequest final : ZooKeeperListRequest @@ -261,6 +292,7 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return !has_watch; } }; struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse @@ -290,6 +322,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override; }; struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index b4cb9feb518..295094b336b 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -15,6 +15,7 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::Get), static_cast(OpNum::Set), static_cast(OpNum::SimpleList), + static_cast(OpNum::Sync), static_cast(OpNum::Heartbeat), static_cast(OpNum::List), static_cast(OpNum::Check), @@ -48,6 +49,8 @@ std::string toString(OpNum op_num) return "Check"; case OpNum::Multi: return "Multi"; + case OpNum::Sync: + return "Sync"; case OpNum::Heartbeat: return "Heartbeat"; case OpNum::Auth: diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 8a20330a2d7..81ca6c6a460 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -24,6 +24,7 @@ enum class OpNum : int32_t Get = 4, Set = 5, SimpleList = 8, + Sync = 9, Heartbeat = 11, List = 12, Check = 13, diff --git a/src/Common/ZooKeeper/ZooKeeperIO.cpp b/src/Common/ZooKeeper/ZooKeeperIO.cpp index a0e4161f111..3f0905ea186 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.cpp +++ b/src/Common/ZooKeeper/ZooKeeperIO.cpp @@ -3,6 +3,13 @@ namespace Coordination { + +void write(size_t x, WriteBuffer & out) +{ + x = __builtin_bswap64(x); + writeBinary(x, out); +} + void write(int64_t x, WriteBuffer & out) { x = __builtin_bswap64(x); @@ -57,6 +64,12 @@ void write(const Error & x, WriteBuffer & out) write(static_cast(x), out); } +void read(size_t & x, ReadBuffer & in) +{ + readBinary(x, in); + x = __builtin_bswap64(x); +} + void read(int64_t & x, ReadBuffer & in) { readBinary(x, in); diff --git a/src/Common/ZooKeeper/ZooKeeperIO.h b/src/Common/ZooKeeper/ZooKeeperIO.h index edeb995f27b..fd47e324664 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.h +++ b/src/Common/ZooKeeper/ZooKeeperIO.h @@ -13,6 +13,7 @@ namespace Coordination using namespace DB; +void write(size_t x, WriteBuffer & out); void write(int64_t x, WriteBuffer & out); void write(int32_t x, WriteBuffer & out); void write(OpNum x, WriteBuffer & out); @@ -37,6 +38,7 @@ void write(const std::vector & arr, WriteBuffer & out) write(elem, out); } +void read(size_t & x, ReadBuffer & in); void read(int64_t & x, ReadBuffer & in); void read(int32_t & x, ReadBuffer & in); void read(OpNum & x, ReadBuffer & in); diff --git a/src/Common/tests/compact_array.cpp b/src/Common/tests/compact_array.cpp index 91fb59d543f..a63859ac712 100644 --- a/src/Common/tests/compact_array.cpp +++ b/src/Common/tests/compact_array.cpp @@ -50,6 +50,7 @@ struct Test { DB::WriteBufferFromFile wb(filename); wb.write(reinterpret_cast(&store), sizeof(store)); + wb.close(); } { diff --git a/src/Common/tests/gtest_hash_table.cpp b/src/Common/tests/gtest_hash_table.cpp index 
41255dcbba1..1c673166ca9 100644 --- a/src/Common/tests/gtest_hash_table.cpp +++ b/src/Common/tests/gtest_hash_table.cpp @@ -317,3 +317,51 @@ TEST(HashTable, SerializationDeserialization) ASSERT_EQ(convertToSet(cont), convertToSet(deserialized)); } } + +template +struct IdentityHash +{ + size_t operator()(T x) const { return x; } +}; + +struct OneElementResizeGrower +{ + /// If collision resolution chains are contiguous, we can implement erase operation by moving the elements. + static constexpr auto performs_linear_probing_with_single_step = true; + + static constexpr size_t initial_count = 1; + + size_t bufSize() const { return buf_size; } + + size_t place(size_t x) const { return x % buf_size; } + + size_t next(size_t pos) const { return (pos + 1) % buf_size; } + + bool overflow(size_t elems) const { return elems >= buf_size; } + + void increaseSize() { ++buf_size; } + + void set(size_t) { } + + void setBufSize(size_t buf_size_) { buf_size = buf_size_; } + + size_t buf_size = initial_count; +}; + +TEST(HashTable, Resize) +{ + { + /// Test edge case if after resize all cells are resized in end of buf and will take half of + /// hash table place. + using HashSet = HashSet, OneElementResizeGrower>; + HashSet cont; + + cont.insert(3); + cont.insert(1); + + std::set expected = {1, 3}; + std::set actual = convertToSet(cont); + + ASSERT_EQ(actual, expected); + } +} diff --git a/src/Common/tests/symbol_index.cpp b/src/Common/tests/symbol_index.cpp index 3811bbbdd71..496fa7dc3fe 100644 --- a/src/Common/tests/symbol_index.cpp +++ b/src/Common/tests/symbol_index.cpp @@ -47,10 +47,11 @@ int main(int argc, char ** argv) std::cerr << "dladdr: Not found\n"; const auto * object = symbol_index.findObject(getAddress()); - Dwarf dwarf(*object->elf); + Dwarf dwarf(object->elf); Dwarf::LocationInfo location; - if (dwarf.findAddress(uintptr_t(address) - uintptr_t(info.dli_fbase), location, Dwarf::LocationInfoMode::FAST)) + std::vector frames; + if (dwarf.findAddress(uintptr_t(address) - uintptr_t(info.dli_fbase), location, Dwarf::LocationInfoMode::FAST, frames)) std::cerr << location.file.toString() << ":" << location.line << "\n"; else std::cerr << "Dwarf: Not found\n"; diff --git a/src/Common/ya.make b/src/Common/ya.make index a8cac313a76..64dd628c457 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -83,8 +83,6 @@ SRCS( WeakHash.cpp ZooKeeper/IKeeper.cpp ZooKeeper/TestKeeper.cpp - ZooKeeper/TestKeeperStorage.cpp - ZooKeeper/TestKeeperStorageDispatcher.cpp ZooKeeper/ZooKeeper.cpp ZooKeeper/ZooKeeperCommon.cpp ZooKeeper/ZooKeeperConstants.cpp diff --git a/src/Compression/CompressedWriteBuffer.cpp b/src/Compression/CompressedWriteBuffer.cpp index 02f418dcdf7..8d146e8de23 100644 --- a/src/Compression/CompressedWriteBuffer.cpp +++ b/src/Compression/CompressedWriteBuffer.cpp @@ -8,6 +8,7 @@ #include #include +#include namespace DB @@ -49,14 +50,9 @@ CompressedWriteBuffer::CompressedWriteBuffer( CompressedWriteBuffer::~CompressedWriteBuffer() { - try - { - next(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + next(); } } diff --git a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory.reference b/src/Coordination/CMakeLists.txt similarity index 100% rename from tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory.reference rename to src/Coordination/CMakeLists.txt diff --git a/src/Coordination/CoordinationSettings.cpp b/src/Coordination/CoordinationSettings.cpp new file mode 100644 index 00000000000..cd46817e82f --- /dev/null +++ b/src/Coordination/CoordinationSettings.cpp @@ -0,0 +1,35 @@ +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; +} + +IMPLEMENT_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) + +void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config) +{ + if (!config.has(config_elem)) + return; + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_elem, config_keys); + + try + { + for (const String & key : config_keys) + set(key, config.getString(config_elem + "." + key)); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + e.addMessage("in Coordination settings config"); + throw; + } +} + +} diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h new file mode 100644 index 00000000000..441e1a5936f --- /dev/null +++ b/src/Coordination/CoordinationSettings.h @@ -0,0 +1,43 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace DB +{ + +struct Settings; + +/** These settings represent fine-tuning knobs for internal details of Coordination storages + * and should not be changed by the user without a reason. + */ + +#define LIST_OF_COORDINATION_SETTINGS(M) \ + M(Milliseconds, session_timeout_ms, Coordination::DEFAULT_SESSION_TIMEOUT_MS, "Default client session timeout", 0) \ + M(Milliseconds, operation_timeout_ms, Coordination::DEFAULT_OPERATION_TIMEOUT_MS, "Default client operation timeout", 0) \ + M(Milliseconds, dead_session_check_period_ms, 500, "How often the leader will check sessions to consider them dead and remove them", 0) \ + M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \ + M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoids overly frequent leader elections)", 0) \ + M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Upper bound of election timer (avoids overly frequent leader elections)", 0) \ + M(UInt64, reserved_log_items, 5000, "How many log items to store (don't remove during compaction)", 0) \ + M(UInt64, snapshot_distance, 5000, "How many log items we have to collect to write a new snapshot", 0) \ + M(UInt64, max_stored_snapshots, 3, "How many snapshots we want to store", 0) \ + M(Bool, auto_forwarding, true, "Allow forwarding write requests from followers to the leader", 0) \ + M(Milliseconds, shutdown_timeout, 5000, "How long we will wait for RAFT to shut down", 0) \ + M(Milliseconds, startup_timeout, 30000, "How long we will wait for RAFT to start", 0) \ + M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. 
Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) + +DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) + + +struct CoordinationSettings : public BaseSettings +{ + void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config); +}; + +using CoordinationSettingsPtr = std::shared_ptr; + +} diff --git a/src/Coordination/InMemoryLogStore.cpp b/src/Coordination/InMemoryLogStore.cpp new file mode 100644 index 00000000000..101458891e7 --- /dev/null +++ b/src/Coordination/InMemoryLogStore.cpp @@ -0,0 +1,194 @@ +#include + +namespace DB +{ + +namespace +{ +using namespace nuraft; +ptr makeClone(const ptr & entry) +{ + ptr clone = cs_new(entry->get_term(), buffer::clone(entry->get_buf()), entry->get_val_type()); + return clone; +} +} + +InMemoryLogStore::InMemoryLogStore() + : start_idx(1) +{ + nuraft::ptr buf = nuraft::buffer::alloc(sizeof(size_t)); + logs[0] = nuraft::cs_new(0, buf); +} + +size_t InMemoryLogStore::start_index() const +{ + return start_idx; +} + +size_t InMemoryLogStore::next_slot() const +{ + std::lock_guard l(logs_lock); + // Exclude the dummy entry. + return start_idx + logs.size() - 1; +} + +nuraft::ptr InMemoryLogStore::last_entry() const +{ + size_t next_idx = next_slot(); + std::lock_guard lock(logs_lock); + auto entry = logs.find(next_idx - 1); + if (entry == logs.end()) + entry = logs.find(0); + + return makeClone(entry->second); +} + +size_t InMemoryLogStore::append(nuraft::ptr & entry) +{ + ptr clone = makeClone(entry); + + std::lock_guard l(logs_lock); + size_t idx = start_idx + logs.size() - 1; + logs[idx] = clone; + return idx; +} + +void InMemoryLogStore::write_at(size_t index, nuraft::ptr & entry) +{ + nuraft::ptr clone = makeClone(entry); + + // Discard all logs equal to or greater than `index. 
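// Editor's worked example: with entries {1, 2, 3, 4, 5} in `logs`,
// write_at(3, e) erases 3..5 via lower_bound + erase and then stores e at
// index 3, leaving {1, 2, 3 -> e}. This is Raft's conflict-truncation rule:
// an overwrite at index N invalidates every later entry.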
+ std::lock_guard l(logs_lock); + auto itr = logs.lower_bound(index); + while (itr != logs.end()) + itr = logs.erase(itr); + logs[index] = clone; +} + +nuraft::ptr>> InMemoryLogStore::log_entries(size_t start, size_t end) +{ + nuraft::ptr>> ret = + nuraft::cs_new>>(); + + ret->resize(end - start); + size_t cc = 0; + for (size_t ii = start; ii < end; ++ii) + { + nuraft::ptr src = nullptr; + { + std::lock_guard l(logs_lock); + auto entry = logs.find(ii); + if (entry == logs.end()) + { + entry = logs.find(0); + assert(0); + } + src = entry->second; + } + (*ret)[cc++] = makeClone(src); + } + return ret; +} + +nuraft::ptr InMemoryLogStore::entry_at(size_t index) +{ + nuraft::ptr src = nullptr; + { + std::lock_guard l(logs_lock); + auto entry = logs.find(index); + if (entry == logs.end()) + entry = logs.find(0); + src = entry->second; + } + return makeClone(src); +} + +size_t InMemoryLogStore::term_at(size_t index) +{ + size_t term = 0; + { + std::lock_guard l(logs_lock); + auto entry = logs.find(index); + if (entry == logs.end()) + entry = logs.find(0); + term = entry->second->get_term(); + } + return term; +} + +nuraft::ptr InMemoryLogStore::pack(size_t index, Int32 cnt) +{ + std::vector> returned_logs; + + size_t size_total = 0; + for (size_t ii = index; ii < index + cnt; ++ii) + { + ptr le = nullptr; + { + std::lock_guard l(logs_lock); + le = logs[ii]; + } + assert(le.get()); + nuraft::ptr buf = le->serialize(); + size_total += buf->size(); + returned_logs.push_back(buf); + } + + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32) + cnt * sizeof(int32) + size_total); + buf_out->pos(0); + buf_out->put(static_cast(cnt)); + + for (auto & entry : returned_logs) + { + nuraft::ptr & bb = entry; + buf_out->put(static_cast(bb->size())); + buf_out->put(*bb); + } + return buf_out; +} + +void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack) +{ + pack.pos(0); + Int32 num_logs = pack.get_int(); + + for (Int32 ii = 0; ii < num_logs; ++ii) + { + size_t cur_idx = index + ii; + Int32 buf_size = pack.get_int(); + + nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); + pack.get(buf_local); + + nuraft::ptr le = nuraft::log_entry::deserialize(*buf_local); + { + std::lock_guard l(logs_lock); + logs[cur_idx] = le; + } + } + + { + std::lock_guard l(logs_lock); + auto entry = logs.upper_bound(0); + if (entry != logs.end()) + start_idx = entry->first; + else + start_idx = 1; + } +} + +bool InMemoryLogStore::compact(size_t last_log_index) +{ + std::lock_guard l(logs_lock); + for (size_t ii = start_idx; ii <= last_log_index; ++ii) + { + auto entry = logs.find(ii); + if (entry != logs.end()) + logs.erase(entry); + } + + start_idx = last_log_index + 1; + return true; +} + +} diff --git a/src/Coordination/InMemoryLogStore.h b/src/Coordination/InMemoryLogStore.h new file mode 100644 index 00000000000..425b056a81d --- /dev/null +++ b/src/Coordination/InMemoryLogStore.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include +#include +#include // Y_IGNORE + +namespace DB +{ + +class InMemoryLogStore : public nuraft::log_store +{ +public: + InMemoryLogStore(); + + size_t start_index() const override; + + size_t next_slot() const override; + + nuraft::ptr last_entry() const override; + + size_t append(nuraft::ptr & entry) override; + + void write_at(size_t index, nuraft::ptr & entry) override; + + nuraft::ptr>> log_entries(size_t start, size_t end) override; + + nuraft::ptr entry_at(size_t index) override; + + size_t term_at(size_t index) override; + + nuraft::ptr pack(size_t index, Int32 
cnt) override; + + void apply_pack(size_t index, nuraft::buffer & pack) override; + + bool compact(size_t last_log_index) override; + + bool flush() override { return true; } + +private: + std::map<size_t, nuraft::ptr<nuraft::log_entry>> logs; + mutable std::mutex logs_lock; + std::atomic<size_t> start_idx; +}; + +} diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp new file mode 100644 index 00000000000..69e93578cc1 --- /dev/null +++ b/src/Coordination/InMemoryStateManager.cpp @@ -0,0 +1,78 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int RAFT_ERROR; +} + +InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port) + : my_server_id(server_id_) + , my_port(port) + , log_store(nuraft::cs_new<InMemoryLogStore>()) + , cluster_config(nuraft::cs_new<nuraft::cluster_config>()) +{ + auto peer_config = nuraft::cs_new<nuraft::srv_config>(my_server_id, host + ":" + std::to_string(port)); + cluster_config->get_servers().push_back(peer_config); +} + +InMemoryStateManager::InMemoryStateManager( + int my_server_id_, + const std::string & config_prefix, + const Poco::Util::AbstractConfiguration & config) + : my_server_id(my_server_id_) + , log_store(nuraft::cs_new<InMemoryLogStore>()) + , cluster_config(nuraft::cs_new<nuraft::cluster_config>()) +{ + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_prefix, keys); + + for (const auto & server_key : keys) + { + std::string full_prefix = config_prefix + "." + server_key; + int server_id = config.getInt(full_prefix + ".id"); + std::string hostname = config.getString(full_prefix + ".hostname"); + int port = config.getInt(full_prefix + ".port"); + bool can_become_leader = config.getBool(full_prefix + ".can_become_leader", true); + int32_t priority = config.getInt(full_prefix + ".priority", 1); + bool start_as_follower = config.getBool(full_prefix + ".start_as_follower", false); + if (start_as_follower) + start_as_follower_servers.insert(server_id); + + auto endpoint = hostname + ":" + std::to_string(port); + auto peer_config = nuraft::cs_new<nuraft::srv_config>(server_id, 0, endpoint, "", !can_become_leader, priority); + if (server_id == my_server_id) + { + my_server_config = peer_config; + my_port = port; + } + + cluster_config->get_servers().push_back(peer_config); + } + if (!my_server_config) + throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id); + + if (start_as_follower_servers.size() == cluster_config->get_servers().size()) + throw Exception(ErrorCodes::RAFT_ERROR, "At least one of the servers should be able to start as leader (without <start_as_follower>)"); +} + +void InMemoryStateManager::save_config(const nuraft::cluster_config & config) +{ + // Just keep in memory in this example. + // Need to write to disk here, if you want to make it durable. + nuraft::ptr<nuraft::buffer> buf = config.serialize(); + cluster_config = nuraft::cluster_config::deserialize(*buf); +} + +void InMemoryStateManager::save_state(const nuraft::srv_state & state) +{ + // Just keep in memory in this example. + // Need to write to disk here, if you want to make it durable.
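// Editor's sketch of a durable variant (hypothetical, not in this patch):
// serialize exactly as below, then write the buffer to a file and fsync it
// before returning, e.g. with DB::WriteBufferFromFile followed by sync(),
// so an acknowledged vote/term survives a crash.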
+ nuraft::ptr buf = state.serialize(); + server_state = nuraft::srv_state::deserialize(*buf); + } + +} diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/InMemoryStateManager.h new file mode 100644 index 00000000000..2a5c2f00dba --- /dev/null +++ b/src/Coordination/InMemoryStateManager.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include // Y_IGNORE +#include + +namespace DB +{ + +class InMemoryStateManager : public nuraft::state_mgr +{ +public: + InMemoryStateManager( + int server_id_, + const std::string & config_prefix, + const Poco::Util::AbstractConfiguration & config); + + InMemoryStateManager( + int server_id_, + const std::string & host, + int port); + + nuraft::ptr load_config() override { return cluster_config; } + + void save_config(const nuraft::cluster_config & config) override; + + void save_state(const nuraft::srv_state & state) override; + + nuraft::ptr read_state() override { return server_state; } + + nuraft::ptr load_log_store() override { return log_store; } + + Int32 server_id() override { return my_server_id; } + + nuraft::ptr get_srv_config() const { return my_server_config; } + + void system_exit(const int /* exit_code */) override {} + + int getPort() const { return my_port; } + + bool shouldStartAsFollower() const + { + return start_as_follower_servers.count(my_server_id); + } + +private: + int my_server_id; + int my_port; + std::unordered_set start_as_follower_servers; + nuraft::ptr log_store; + nuraft::ptr my_server_config; + nuraft::ptr cluster_config; + nuraft::ptr server_state; +}; + +} diff --git a/src/Coordination/LoggerWrapper.h b/src/Coordination/LoggerWrapper.h new file mode 100644 index 00000000000..755b72c06cc --- /dev/null +++ b/src/Coordination/LoggerWrapper.h @@ -0,0 +1,47 @@ +#pragma once + +#include // Y_IGNORE +#include +#include + +namespace DB +{ + +class LoggerWrapper : public nuraft::logger +{ +public: + LoggerWrapper(const std::string & name, LogsLevel level_) + : log(&Poco::Logger::get(name)) + , level(static_cast(level_)) + { + log->setLevel(level); + } + + void put_details( + int level_, + const char * /* source_file */, + const char * /* func_name */, + size_t /* line_number */, + const std::string & msg) override + { + LOG_IMPL(log, static_cast(level_), static_cast(level_), msg); + } + + void set_level(int level_) override + { + level_ = std::min(6, std::max(1, level_)); + log->setLevel(level_); + level = level_; + } + + int get_level() override + { + return level; + } + +private: + Poco::Logger * log; + std::atomic level; +}; + +} diff --git a/src/Coordination/NuKeeperCommon.h b/src/Coordination/NuKeeperCommon.h new file mode 100644 index 00000000000..14fc612093c --- /dev/null +++ b/src/Coordination/NuKeeperCommon.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace DB +{ + +struct NuKeeperRequest +{ + int64_t session_id; + Coordination::ZooKeeperRequestPtr request; +}; + +using NuKeeperRequests = std::vector; + +struct NuKeeperResponse +{ + int64_t session_id; + Coordination::ZooKeeperRequestPtr response; +}; + +using NuKeeperResponses = std::vector; + +} diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp new file mode 100644 index 00000000000..7464a06e86f --- /dev/null +++ b/src/Coordination/NuKeeperServer.cpp @@ -0,0 +1,182 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int RAFT_ERROR; +} + +NuKeeperServer::NuKeeperServer( + 
int server_id_, + const CoordinationSettingsPtr & coordination_settings_, + const Poco::Util::AbstractConfiguration & config, + ResponsesQueue & responses_queue_) + : server_id(server_id_) + , coordination_settings(coordination_settings_) + , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) + , state_manager(nuraft::cs_new(server_id, "test_keeper_server.raft_configuration", config)) + , responses_queue(responses_queue_) +{ +} + +void NuKeeperServer::startup() +{ + nuraft::raft_params params; + params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); + params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); + params.election_timeout_upper_bound_ = coordination_settings->election_timeout_upper_bound_ms.totalMilliseconds(); + params.reserved_log_items_ = coordination_settings->reserved_log_items; + params.snapshot_distance_ = coordination_settings->snapshot_distance; + params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds(); + params.auto_forwarding_ = coordination_settings->auto_forwarding; + params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; + + params.return_method_ = nuraft::raft_params::blocking; + + nuraft::asio_service::options asio_opts{}; + nuraft::raft_server::init_options init_options; + init_options.skip_initial_election_timeout_ = state_manager->shouldStartAsFollower(); + init_options.raft_callback_ = [this] (nuraft::cb_func::Type type, nuraft::cb_func::Param * param) + { + return callbackFunc(type, param); + }; + + raft_instance = launcher.init( + state_machine, state_manager, nuraft::cs_new("RaftInstance", coordination_settings->raft_logs_level), state_manager->getPort(), + asio_opts, params, init_options); + + if (!raft_instance) + throw Exception(ErrorCodes::RAFT_ERROR, "Cannot allocate RAFT instance"); +} + +void NuKeeperServer::shutdown() +{ + state_machine->shutdownStorage(); + if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds())) + LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5); +} + +namespace +{ + +nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) +{ + DB::WriteBufferFromNuraftBuffer buf; + DB::writeIntBinary(session_id, buf); + request->write(buf); + return buf.getBuffer(); +} + +} + +void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session) +{ + auto [session_id, request] = request_for_session; + if (isLeaderAlive() && request->isReadRequest()) + { + state_machine->processReadRequest(request_for_session); + } + else + { + std::vector> entries; + entries.push_back(getZooKeeperLogEntry(session_id, request)); + + std::lock_guard lock(append_entries_mutex); + + auto result = raft_instance->append_entries(entries); + if (!result->get_accepted()) + { + NuKeeperStorage::ResponsesForSessions responses; + auto response = request->makeResponse(); + response->xid = request->xid; + response->zxid = 0; + response->error = Coordination::Error::ZOPERATIONTIMEOUT; + responses_queue.push(DB::NuKeeperStorage::ResponseForSession{session_id, response}); + } + + if (result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) + { + NuKeeperStorage::ResponsesForSessions responses; + auto response = request->makeResponse(); + response->xid = request->xid; + response->zxid = 0; + response->error = 
Coordination::Error::ZOPERATIONTIMEOUT; + responses_queue.push(DB::NuKeeperStorage::ResponseForSession{session_id, response}); + } + else if (result->get_result_code() != nuraft::cmd_result_code::OK) + throw Exception(ErrorCodes::RAFT_ERROR, "Requests result failed with code {} and message: '{}'", result->get_result_code(), result->get_result_str()); + } +} + +int64_t NuKeeperServer::getSessionID(int64_t session_timeout_ms) +{ + auto entry = nuraft::buffer::alloc(sizeof(int64_t)); + /// Just special session request + nuraft::buffer_serializer bs(entry); + bs.put_i64(session_timeout_ms); + + std::lock_guard lock(append_entries_mutex); + + auto result = raft_instance->append_entries({entry}); + + if (!result->get_accepted()) + throw Exception(ErrorCodes::RAFT_ERROR, "Cannot send session_id request to RAFT"); + + if (result->get_result_code() != nuraft::cmd_result_code::OK) + throw Exception(ErrorCodes::RAFT_ERROR, "session_id request failed to RAFT"); + + auto resp = result->get(); + if (resp == nullptr) + throw Exception(ErrorCodes::RAFT_ERROR, "Received nullptr as session_id"); + + nuraft::buffer_serializer bs_resp(resp); + return bs_resp.get_i64(); +} + +bool NuKeeperServer::isLeader() const +{ + return raft_instance->is_leader(); +} + +bool NuKeeperServer::isLeaderAlive() const +{ + return raft_instance->is_leader_alive(); +} + +nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */) +{ + if (type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader) + { + std::unique_lock lock(initialized_mutex); + initialized_flag = true; + initialized_cv.notify_all(); + } + return nuraft::cb_func::ReturnCode::Ok; +} + +void NuKeeperServer::waitInit() +{ + std::unique_lock lock(initialized_mutex); + int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); + if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) + throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); +} + +std::unordered_set NuKeeperServer::getDeadSessions() +{ + return state_machine->getDeadSessions(); +} + +} diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h new file mode 100644 index 00000000000..a8d269eb9eb --- /dev/null +++ b/src/Coordination/NuKeeperServer.h @@ -0,0 +1,63 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +class NuKeeperServer +{ +private: + int server_id; + + CoordinationSettingsPtr coordination_settings; + + nuraft::ptr state_machine; + + nuraft::ptr state_manager; + + nuraft::raft_launcher launcher; + + nuraft::ptr raft_instance; + + std::mutex append_entries_mutex; + + ResponsesQueue & responses_queue; + + std::mutex initialized_mutex; + bool initialized_flag = false; + std::condition_variable initialized_cv; + + nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); + +public: + NuKeeperServer( + int server_id_, + const CoordinationSettingsPtr & coordination_settings_, + const Poco::Util::AbstractConfiguration & config, + ResponsesQueue & responses_queue_); + + void startup(); + + void putRequest(const NuKeeperStorage::RequestForSession & request); + + int64_t getSessionID(int64_t session_timeout_ms); + + std::unordered_set getDeadSessions(); + + bool isLeader() const; + + bool isLeaderAlive() const; + + void waitInit(); + + void shutdown(); +}; + +} diff --git 
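// A hedged end-to-end sketch of the server lifecycle declared above, assuming
// CoordinationSettingsPtr is a shared_ptr alias and that `config` and `request`
// are already built (both hypothetical here):
DB::ResponsesQueue responses_queue;
DB::NuKeeperServer server(1, std::make_shared<DB::CoordinationSettings>(), config, responses_queue);
server.startup();                                  // starts Raft via the launcher
server.waitInit();                                 // blocks until a leader exists
int64_t session_id = server.getSessionID(30000);   // replicated 8-byte log entry
server.putRequest({session_id, request});          // replicated, or served locally if a read
server.shutdown();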
a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp new file mode 100644 index 00000000000..0061645c75c --- /dev/null +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -0,0 +1,262 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data) +{ + ReadBufferFromNuraftBuffer buffer(data); + NuKeeperStorage::RequestForSession request_for_session; + readIntBinary(request_for_session.session_id, buffer); + + int32_t length; + Coordination::read(length, buffer); + + int32_t xid; + Coordination::read(xid, buffer); + + Coordination::OpNum opnum; + Coordination::read(opnum, buffer); + + request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum); + request_for_session.request->xid = xid; + request_for_session.request->readImpl(buffer); + return request_for_session; +} + +nuraft::ptr writeResponses(NuKeeperStorage::ResponsesForSessions & responses) +{ + WriteBufferFromNuraftBuffer buffer; + for (const auto & response_and_session : responses) + { + writeIntBinary(response_and_session.session_id, buffer); + response_and_session.response->write(buffer); + } + return buffer.getBuffer(); +} + + +NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_) + : coordination_settings(coordination_settings_) + , storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()) + , responses_queue(responses_queue_) + , last_committed_idx(0) + , log(&Poco::Logger::get("NuRaftStateMachine")) +{ + LOG_DEBUG(log, "Created nukeeper state machine"); +} + +nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data) +{ + if (data.size() == sizeof(int64_t)) + { + nuraft::buffer_serializer timeout_data(data); + int64_t session_timeout_ms = timeout_data.get_i64(); + auto response = nuraft::buffer::alloc(sizeof(int64_t)); + int64_t session_id; + nuraft::buffer_serializer bs(response); + { + std::lock_guard lock(storage_lock); + session_id = storage.getSessionID(session_timeout_ms); + bs.put_i64(session_id); + } + LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms); + last_committed_idx = log_idx; + return response; + } + else + { + auto request_for_session = parseRequest(data); + NuKeeperStorage::ResponsesForSessions responses_for_sessions; + { + std::lock_guard lock(storage_lock); + responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id); + for (auto & response_for_session : responses_for_sessions) + responses_queue.push(response_for_session); + } + + last_committed_idx = log_idx; + return nullptr; + } +} + +bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s) +{ + LOG_DEBUG(log, "Applying snapshot {}", s.get_last_log_idx()); + StorageSnapshotPtr snapshot; + { + std::lock_guard lock(snapshots_lock); + auto entry = snapshots.find(s.get_last_log_idx()); + if (entry == snapshots.end()) + return false; + snapshot = entry->second; + } + std::lock_guard lock(storage_lock); + storage = snapshot->storage; + last_committed_idx = s.get_last_log_idx(); + return true; +} + +nuraft::ptr NuKeeperStateMachine::last_snapshot() +{ + // Just return the latest snapshot. 
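// commit() above distinguishes the two entry kinds purely by size: a session
// request is exactly the 8-byte timeout, anything longer is a ZooKeeper request
// with its own framing. Sketch of the special entry, mirroring
// NuKeeperServer::getSessionID():
auto entry = nuraft::buffer::alloc(sizeof(int64_t));
nuraft::buffer_serializer bs(entry);
bs.put_i64(30000);   // session timeout in ms; commit() replies with the new id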
+ std::lock_guard lock(snapshots_lock); + auto entry = snapshots.rbegin(); + if (entry == snapshots.rend()) + return nullptr; + + return entry->second->snapshot; +} + +NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::createSnapshotInternal(nuraft::snapshot & s) +{ + nuraft::ptr snp_buf = s.serialize(); + nuraft::ptr ss = nuraft::snapshot::deserialize(*snp_buf); + std::lock_guard lock(storage_lock); + return std::make_shared(ss, storage); +} + +NuKeeperStateMachine::StorageSnapshotPtr NuKeeperStateMachine::readSnapshot(nuraft::snapshot & s, nuraft::buffer & in) +{ + nuraft::ptr snp_buf = s.serialize(); + nuraft::ptr ss = nuraft::snapshot::deserialize(*snp_buf); + NuKeeperStorageSerializer serializer; + + ReadBufferFromNuraftBuffer reader(in); + NuKeeperStorage new_storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()); + serializer.deserialize(new_storage, reader); + return std::make_shared(ss, new_storage); +} + + +void NuKeeperStateMachine::writeSnapshot(const NuKeeperStateMachine::StorageSnapshotPtr & snapshot, nuraft::ptr & out) +{ + NuKeeperStorageSerializer serializer; + + WriteBufferFromNuraftBuffer writer; + serializer.serialize(snapshot->storage, writer); + out = writer.getBuffer(); +} + +void NuKeeperStateMachine::create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) +{ + LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx()); + auto snapshot = createSnapshotInternal(s); + { + std::lock_guard lock(snapshots_lock); + snapshots[s.get_last_log_idx()] = snapshot; + size_t num = snapshots.size(); + if (num > coordination_settings->max_stored_snapshots) + { + auto entry = snapshots.begin(); + + for (size_t i = 0; i < num - coordination_settings->max_stored_snapshots; ++i) + { + if (entry == snapshots.end()) + break; + entry = snapshots.erase(entry); + } + } + + } + + LOG_DEBUG(log, "Created snapshot {}", s.get_last_log_idx()); + nuraft::ptr except(nullptr); + bool ret = true; + when_done(ret, except); +} + +void NuKeeperStateMachine::save_logical_snp_obj( + nuraft::snapshot & s, + size_t & obj_id, + nuraft::buffer & data, + bool /*is_first_obj*/, + bool /*is_last_obj*/) +{ + LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); + + if (obj_id == 0) + { + auto new_snapshot = createSnapshotInternal(s); + std::lock_guard lock(snapshots_lock); + snapshots.try_emplace(s.get_last_log_idx(), std::move(new_snapshot)); + } + else + { + auto received_snapshot = readSnapshot(s, data); + + std::lock_guard lock(snapshots_lock); + snapshots[s.get_last_log_idx()] = std::move(received_snapshot); + } + + obj_id++; +} + +int NuKeeperStateMachine::read_logical_snp_obj( + nuraft::snapshot & s, + void* & /*user_snp_ctx*/, + ulong obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) +{ + + LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id); + StorageSnapshotPtr required_snapshot; + { + std::lock_guard lock(snapshots_lock); + auto entry = snapshots.find(s.get_last_log_idx()); + if (entry == snapshots.end()) + { + // Snapshot doesn't exist. 
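// The logical snapshot exchange above is a two-object protocol: obj_id 0
// carries only the snapshot metadata, obj_id 1 carries the whole serialized
// NuKeeperStorage, and is_last_obj is set only on the second object. Schematic
// receiver-side order (machine, meta, dummy and payload are hypothetical names):
size_t obj_id = 0;
machine.save_logical_snp_obj(meta, obj_id, dummy, true, false);     // obj_id becomes 1
machine.save_logical_snp_obj(meta, obj_id, payload, false, true);   // stores the storage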
+ data_out = nullptr; + is_last_obj = true; + return 0; + } + required_snapshot = entry->second; + } + + if (obj_id == 0) + { + auto new_snapshot = createSnapshotInternal(s); + writeSnapshot(new_snapshot, data_out); + is_last_obj = false; + } + else + { + writeSnapshot(required_snapshot, data_out); + is_last_obj = true; + } + return 0; +} + +void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session) +{ + NuKeeperStorage::ResponsesForSessions responses; + { + std::lock_guard lock(storage_lock); + responses = storage.processRequest(request_for_session.request, request_for_session.session_id); + } + for (const auto & response : responses) + responses_queue.push(response); +} + +std::unordered_set NuKeeperStateMachine::getDeadSessions() +{ + std::lock_guard lock(storage_lock); + return storage.getDeadSessions(); +} + +void NuKeeperStateMachine::shutdownStorage() +{ + std::lock_guard lock(storage_lock); + storage.finalize(); +} + +} diff --git a/src/Coordination/NuKeeperStateMachine.h b/src/Coordination/NuKeeperStateMachine.h new file mode 100644 index 00000000000..87748db20a5 --- /dev/null +++ b/src/Coordination/NuKeeperStateMachine.h @@ -0,0 +1,99 @@ +#pragma once + +#include +#include // Y_IGNORE +#include +#include +#include + +namespace DB +{ + +using ResponsesQueue = ThreadSafeQueue; + +class NuKeeperStateMachine : public nuraft::state_machine +{ +public: + NuKeeperStateMachine(ResponsesQueue & responses_queue_, const CoordinationSettingsPtr & coordination_settings_); + + nuraft::ptr pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } + + nuraft::ptr commit(const size_t log_idx, nuraft::buffer & data) override; + + void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {} + + size_t last_commit_index() override { return last_committed_idx; } + + bool apply_snapshot(nuraft::snapshot & s) override; + + nuraft::ptr last_snapshot() override; + + void create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) override; + + void save_logical_snp_obj( + nuraft::snapshot & s, + size_t & obj_id, + nuraft::buffer & data, + bool is_first_obj, + bool is_last_obj) override; + + int read_logical_snp_obj( + nuraft::snapshot & s, + void* & user_snp_ctx, + ulong obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) override; + + NuKeeperStorage & getStorage() + { + return storage; + } + + void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session); + + std::unordered_set getDeadSessions(); + + void shutdownStorage(); + +private: + struct StorageSnapshot + { + StorageSnapshot(const nuraft::ptr & s, const NuKeeperStorage & storage_) + : snapshot(s) + , storage(storage_) + {} + + nuraft::ptr snapshot; + NuKeeperStorage storage; + }; + + using StorageSnapshotPtr = std::shared_ptr; + + StorageSnapshotPtr createSnapshotInternal(nuraft::snapshot & s); + + StorageSnapshotPtr readSnapshot(nuraft::snapshot & s, nuraft::buffer & in); + + static void writeSnapshot(const StorageSnapshotPtr & snapshot, nuraft::ptr & out); + + CoordinationSettingsPtr coordination_settings; + + NuKeeperStorage storage; + + ResponsesQueue & responses_queue; + /// Mutex for snapshots + std::mutex snapshots_lock; + + /// Lock for storage + std::mutex storage_lock; + + /// Fake snapshot storage + std::map snapshots; + + /// Last committed Raft log number. 
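// Read requests answered by processReadRequest() above never touch the Raft
// log, but their responses still travel through the shared ResponsesQueue, so
// the dispatcher consumes reads and writes uniformly. A hedged consumer sketch
// (sendToClient is hypothetical; `queue` is the one passed to the machine):
DB::NuKeeperStorage::ResponseForSession item;
while (queue.tryPop(item, /* timeout_ms */ 100))
    sendToClient(item.session_id, item.response);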
+ std::atomic last_committed_idx; + Poco::Logger * log; +}; + +} diff --git a/src/Common/ZooKeeper/TestKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp similarity index 69% rename from src/Common/ZooKeeper/TestKeeperStorage.cpp rename to src/Coordination/NuKeeperStorage.cpp index e364b0efca9..631f975cddc 100644 --- a/src/Common/ZooKeeper/TestKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -17,13 +17,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -} - -namespace zkutil -{ - -using namespace DB; - static String parentPath(const String & path) { auto rslash_pos = path.rfind('/'); @@ -38,20 +31,20 @@ static String baseName(const String & path) return path.substr(rslash_pos + 1); } -static TestKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches, Coordination::Event event_type) +static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) { - TestKeeperStorage::ResponsesForSessions result; + NuKeeperStorage::ResponsesForSessions result; auto it = watches.find(path); if (it != watches.end()) { std::shared_ptr watch_response = std::make_shared(); watch_response->path = path; - watch_response->xid = -1; + watch_response->xid = Coordination::WATCH_XID; watch_response->zxid = -1; watch_response->type = event_type; watch_response->state = Coordination::State::CONNECTED; for (auto watcher_session : it->second) - result.push_back(TestKeeperStorage::ResponseForSession{watcher_session, watch_response}); + result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_response}); watches.erase(it); } @@ -62,58 +55,69 @@ static TestKeeperStorage::ResponsesForSessions processWatchesImpl(const String & { std::shared_ptr watch_list_response = std::make_shared(); watch_list_response->path = parent_path; - watch_list_response->xid = -1; + watch_list_response->xid = Coordination::WATCH_XID; watch_list_response->zxid = -1; watch_list_response->type = Coordination::Event::CHILD; watch_list_response->state = Coordination::State::CONNECTED; for (auto watcher_session : it->second) - result.push_back(TestKeeperStorage::ResponseForSession{watcher_session, watch_list_response}); + result.push_back(NuKeeperStorage::ResponseForSession{watcher_session, watch_list_response}); list_watches.erase(it); } return result; } -TestKeeperStorage::TestKeeperStorage() +NuKeeperStorage::NuKeeperStorage(int64_t tick_time_ms) + : session_expiry_queue(tick_time_ms) { container.emplace("/", Node()); } using Undo = std::function; -struct TestKeeperStorageRequest +struct NuKeeperStorageRequest { Coordination::ZooKeeperRequestPtr zk_request; - explicit TestKeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) + explicit NuKeeperStorageRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) : zk_request(zk_request_) {} - virtual std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const = 0; - virtual TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & /*watches*/, TestKeeperStorage::Watches & /*list_watches*/) const { return {}; } + virtual std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, 
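// The two path helpers above follow ZooKeeper conventions; in particular the
// parent of a top-level node is the root itself:
assert(parentPath("/a/b/c") == "/a/b" && baseName("/a/b/c") == "c");
assert(parentPath("/node") == "/" && baseName("/node") == "node");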
int64_t session_id) const = 0; + virtual NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & /*watches*/, NuKeeperStorage::Watches & /*list_watches*/) const { return {}; } - virtual ~TestKeeperStorageRequest() = default; + virtual ~NuKeeperStorageRequest() = default; }; -struct TestKeeperStorageHeartbeatRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageHeartbeatRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & /* container */, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override { return {zk_request->makeResponse(), {}}; } }; - -struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageSyncRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & /* container */, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + { + auto response = zk_request->makeResponse(); + dynamic_cast(response.get())->path = dynamic_cast(zk_request.get())->path; + return {response, {}}; + } +}; - TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override +struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest +{ + using NuKeeperStorageRequest::NuKeeperStorageRequest; + + NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CREATED); } - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Undo undo; @@ -138,8 +142,7 @@ struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest } else { - TestKeeperStorage::Node created_node; - created_node.seq_num = 0; + NuKeeperStorage::Node created_node; created_node.stat.czxid = zxid; created_node.stat.mzxid = zxid; created_node.stat.ctime = std::chrono::system_clock::now().time_since_epoch() / std::chrono::milliseconds(1); @@ -193,10 +196,10 @@ struct TestKeeperStorageCreateRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageGetRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* 
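// Each process() above returns the response paired with an Undo closure that
// can revert the mutation; this is what lets a failing multi-request roll back
// the sub-requests already applied. Hedged shape of a mutating handler:
Undo undo;
if (applied)   // hypothetical "mutation succeeded" flag
    undo = [&container, path] { container.erase(path); };
return { response_ptr, undo };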
ephemerals */, int64_t /* zxid */, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperGetResponse & response = dynamic_cast(*response_ptr); @@ -218,10 +221,10 @@ struct TestKeeperStorageGetRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageRemoveRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperRemoveResponse & response = dynamic_cast(*response_ptr); @@ -268,16 +271,16 @@ struct TestKeeperStorageRemoveRequest final : public TestKeeperStorageRequest return { response_ptr, undo }; } - TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override + NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::DELETED); } }; -struct TestKeeperStorageExistsRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageExistsRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperExistsResponse & response = dynamic_cast(*response_ptr); @@ -298,10 +301,10 @@ struct TestKeeperStorageExistsRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageSetRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t zxid, int64_t /* session_id */) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperSetResponse & response = dynamic_cast(*response_ptr); @@ -341,17 +344,17 @@ struct TestKeeperStorageSetRequest final : public TestKeeperStorageRequest return { response_ptr, undo }; } - TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override + 
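// Set and Check follow ZooKeeper's optimistic concurrency rule: version == -1
// means unconditional, any other value must equal the node's current
// stat.version or the request fails with ZBADVERSION. The check reduces to:
static bool versionMatches(int32_t requested, int32_t current)
{
    return requested == -1 || requested == current;
}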
NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED); } }; -struct TestKeeperStorageListRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperListResponse & response = dynamic_cast(*response_ptr); @@ -387,10 +390,10 @@ struct TestKeeperStorageListRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageCheckRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageCheckRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & /* ephemerals */, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperCheckResponse & response = dynamic_cast(*response_ptr); @@ -413,11 +416,11 @@ struct TestKeeperStorageCheckRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageMultiRequest final : public NuKeeperStorageRequest { - std::vector concrete_requests; - explicit TestKeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) - : TestKeeperStorageRequest(zk_request_) + std::vector concrete_requests; + explicit NuKeeperStorageMultiRequest(const Coordination::ZooKeeperRequestPtr & zk_request_) + : NuKeeperStorageRequest(zk_request_) { Coordination::ZooKeeperMultiRequest & request = dynamic_cast(*zk_request); concrete_requests.reserve(request.requests.size()); @@ -427,26 +430,26 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest auto sub_zk_request = std::dynamic_pointer_cast(sub_request); if (sub_zk_request->getOpNum() == Coordination::OpNum::Create) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Remove) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Set) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + concrete_requests.push_back(std::make_shared(sub_zk_request)); } else if (sub_zk_request->getOpNum() == Coordination::OpNum::Check) { - concrete_requests.push_back(std::make_shared(sub_zk_request)); + 
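// Because all nodes live in one flat ordered map keyed by full path, listing
// children can be a range scan. A simplified, hedged sketch of the idea (not
// necessarily the exact strategy of the List handler above):
std::vector<std::string> listChildren(const std::map<std::string, DB::NuKeeperStorage::Node> & container, const std::string & path)
{
    std::vector<std::string> result;
    const std::string prefix = (path == "/") ? path : path + "/";
    for (auto it = container.upper_bound(path); it != container.end(); ++it)
    {
        if (it->first.compare(0, prefix.size(), prefix) != 0)
            break;                                                  // left the subtree
        if (it->first.find('/', prefix.size()) == std::string::npos)
            result.push_back(it->first.substr(prefix.size()));      // direct child only
    }
    return result;
}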
concrete_requests.push_back(std::make_shared(sub_zk_request)); } else throw DB::Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal command as part of multi ZooKeeper request {}", sub_zk_request->getOpNum()); } } - std::pair process(TestKeeperStorage::Container & container, TestKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t zxid, int64_t session_id) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperMultiResponse & response = dynamic_cast(*response_ptr); @@ -499,9 +502,9 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest } } - TestKeeperStorage::ResponsesForSessions processWatches(TestKeeperStorage::Watches & watches, TestKeeperStorage::Watches & list_watches) const override + NuKeeperStorage::ResponsesForSessions processWatches(NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches) const override { - TestKeeperStorage::ResponsesForSessions result; + NuKeeperStorage::ResponsesForSessions result; for (const auto & generic_request : concrete_requests) { auto responses = generic_request->processWatches(watches, list_watches); @@ -511,75 +514,49 @@ struct TestKeeperStorageMultiRequest final : public TestKeeperStorageRequest } }; -struct TestKeeperStorageCloseRequest final : public TestKeeperStorageRequest +struct NuKeeperStorageCloseRequest final : public NuKeeperStorageRequest { - using TestKeeperStorageRequest::TestKeeperStorageRequest; - std::pair process(TestKeeperStorage::Container &, TestKeeperStorage::Ephemerals &, int64_t, int64_t) const override + using NuKeeperStorageRequest::NuKeeperStorageRequest; + std::pair process(NuKeeperStorage::Container &, NuKeeperStorage::Ephemerals &, int64_t, int64_t) const override { throw DB::Exception("Called process on close request", ErrorCodes::LOGICAL_ERROR); } }; -TestKeeperStorage::ResponsesForSessions TestKeeperStorage::finalize(const RequestsForSessions & expired_requests) +void NuKeeperStorage::finalize() { if (finalized) throw DB::Exception("Testkeeper storage already finalized", ErrorCodes::LOGICAL_ERROR); finalized = true; - ResponsesForSessions finalize_results; - auto finish_watch = [] (const auto & watch_pair) -> ResponsesForSessions - { - ResponsesForSessions results; - std::shared_ptr response = std::make_shared(); - response->type = Coordination::SESSION; - response->state = Coordination::EXPIRED_SESSION; - response->error = Coordination::Error::ZSESSIONEXPIRED; + for (const auto & [session_id, ephemerals_paths] : ephemerals) + for (const String & ephemeral_path : ephemerals_paths) + container.erase(ephemeral_path); - for (auto & watcher_session : watch_pair.second) - results.push_back(ResponseForSession{watcher_session, response}); - return results; - }; - - for (auto & path_watch : watches) - { - auto watch_responses = finish_watch(path_watch); - finalize_results.insert(finalize_results.end(), watch_responses.begin(), watch_responses.end()); - } + ephemerals.clear(); watches.clear(); - for (auto & path_watch : list_watches) - { - auto list_watch_responses = finish_watch(path_watch); - finalize_results.insert(finalize_results.end(), list_watch_responses.begin(), list_watch_responses.end()); - } list_watches.clear(); sessions_and_watchers.clear(); - - for (const auto & [session_id, zk_request] : expired_requests) - { - auto response = zk_request->makeResponse(); - 
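// The multi handler applies its sub-requests in order and, on the first error,
// replays the collected undo closures in reverse before returning. Schematic
// core of that loop:
std::vector<Undo> undo_actions;
for (const auto & concrete : concrete_requests)
{
    auto [cur_response, undo] = concrete->process(container, ephemerals, zxid, session_id);
    if (cur_response->error != Coordination::Error::ZOK)
    {
        for (auto it = undo_actions.rbegin(); it != undo_actions.rend(); ++it)
            if (*it)
                (*it)();
        break;
    }
    undo_actions.emplace_back(std::move(undo));
}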
response->error = Coordination::Error::ZSESSIONEXPIRED; - finalize_results.push_back(ResponseForSession{session_id, response}); - } - return finalize_results; + session_expiry_queue.clear(); } -class TestKeeperWrapperFactory final : private boost::noncopyable +class NuKeeperWrapperFactory final : private boost::noncopyable { public: - using Creator = std::function; + using Creator = std::function; using OpNumToRequest = std::unordered_map; - static TestKeeperWrapperFactory & instance() + static NuKeeperWrapperFactory & instance() { - static TestKeeperWrapperFactory factory; + static NuKeeperWrapperFactory factory; return factory; } - TestKeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const + NuKeeperStorageRequestPtr get(const Coordination::ZooKeeperRequestPtr & zk_request) const { auto it = op_num_to_request.find(zk_request->getOpNum()); if (it == op_num_to_request.end()) @@ -596,36 +573,37 @@ public: private: OpNumToRequest op_num_to_request; - TestKeeperWrapperFactory(); + NuKeeperWrapperFactory(); }; template -void registerTestKeeperRequestWrapper(TestKeeperWrapperFactory & factory) +void registerNuKeeperRequestWrapper(NuKeeperWrapperFactory & factory) { factory.registerRequest(num, [] (const Coordination::ZooKeeperRequestPtr & zk_request) { return std::make_shared(zk_request); }); } -TestKeeperWrapperFactory::TestKeeperWrapperFactory() +NuKeeperWrapperFactory::NuKeeperWrapperFactory() { - registerTestKeeperRequestWrapper(*this); - //registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); - registerTestKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + //registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); + registerNuKeeperRequestWrapper(*this); } -TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id) +NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coordination::ZooKeeperRequestPtr & zk_request, int64_t session_id) { - TestKeeperStorage::ResponsesForSessions results; + NuKeeperStorage::ResponsesForSessions results; if (zk_request->getOpNum() == Coordination::OpNum::Close) { auto it = ephemerals.find(session_id); @@ -645,12 +623,24 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const auto response = std::make_shared(); response->xid = zk_request->xid; response->zxid = getZXID(); + session_expiry_queue.remove(session_id); + session_and_timeout.erase(session_id); + results.push_back(ResponseForSession{session_id, response}); + } + else if (zk_request->getOpNum() == Coordination::OpNum::Heartbeat) + { + session_expiry_queue.update(session_id, session_and_timeout[session_id]); + NuKeeperStorageRequestPtr storage_request = 
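// The factory above is a plain OpNum -> creator map filled once in its
// constructor, and get() refuses unknown opnums. Each registration pins the
// opnum and wrapper type at compile time; a likely shape of one registration,
// inferred from the Creator signature:
registerNuKeeperRequestWrapper<Coordination::OpNum::Create, NuKeeperStorageCreateRequest>(*this);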
NuKeeperWrapperFactory::instance().get(zk_request); + auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id); + response->xid = zk_request->xid; + response->zxid = getZXID(); + results.push_back(ResponseForSession{session_id, response}); } else { - TestKeeperStorageRequestPtr storage_request = TestKeeperWrapperFactory::instance().get(zk_request); + NuKeeperStorageRequestPtr storage_request = NuKeeperWrapperFactory::instance().get(zk_request); auto [response, _] = storage_request->process(container, ephemerals, zxid, session_id); if (zk_request->has_watch) @@ -669,15 +659,6 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const watches[zk_request->getPath()].emplace_back(session_id); sessions_and_watchers[session_id].emplace(zk_request->getPath()); } - else - { - std::shared_ptr watch_response = std::make_shared(); - watch_response->path = zk_request->getPath(); - watch_response->xid = -1; - watch_response->error = response->error; - watch_response->type = Coordination::Event::NOTWATCHING; - results.push_back(ResponseForSession{session_id, watch_response}); - } } if (response->error == Coordination::Error::ZOK) @@ -696,7 +677,7 @@ TestKeeperStorage::ResponsesForSessions TestKeeperStorage::processRequest(const } -void TestKeeperStorage::clearDeadWatches(int64_t session_id) +void NuKeeperStorage::clearDeadWatches(int64_t session_id) { auto watches_it = sessions_and_watchers.find(session_id); if (watches_it != sessions_and_watchers.end()) diff --git a/src/Common/ZooKeeper/TestKeeperStorage.h b/src/Coordination/NuKeeperStorage.h similarity index 65% rename from src/Common/ZooKeeper/TestKeeperStorage.h rename to src/Coordination/NuKeeperStorage.h index 21b1ce16c32..20ab1982b4e 100644 --- a/src/Common/ZooKeeper/TestKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -4,27 +4,28 @@ #include #include #include +#include #include #include #include -namespace zkutil +namespace DB { using namespace DB; -struct TestKeeperStorageRequest; -using TestKeeperStorageRequestPtr = std::shared_ptr; +struct NuKeeperStorageRequest; +using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; -class TestKeeperStorage +class NuKeeperStorage { public: - std::atomic session_id_counter{0}; + int64_t session_id_counter{0}; struct Node { String data; - Coordination::ACLs acls; + Coordination::ACLs acls{}; bool is_ephemeral = false; bool is_sequental = false; Coordination::Stat stat{}; @@ -50,6 +51,7 @@ public: using Container = std::map; using Ephemerals = std::unordered_map>; using SessionAndWatcher = std::unordered_map>; + using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; using Watches = std::map; @@ -57,9 +59,11 @@ public: Container container; Ephemerals ephemerals; SessionAndWatcher sessions_and_watchers; + SessionExpiryQueue session_expiry_queue; + SessionAndTimeout session_and_timeout; - std::atomic zxid{0}; - std::atomic finalized{false}; + int64_t zxid{0}; + bool finalized{false}; Watches watches; Watches list_watches; /// Watches for 'list' request (watches on children). 
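// Watches here are one-shot, as in ZooKeeper: processWatchesImpl() erases the
// path entry after queueing the notifications, so a client must re-arm the
// watch after every event. Schematic:
auto fired = processWatchesImpl("/a/b", watches, list_watches, Coordination::Event::CHANGED);
// watches["/a/b"] no longer exists; `fired` holds one response per watcher,
// each with xid == Coordination::WATCH_XID so clients recognize notifications.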
@@ -68,18 +72,27 @@ public: int64_t getZXID() { - return zxid.fetch_add(1); + return zxid++; } public: - TestKeeperStorage(); + NuKeeperStorage(int64_t tick_time_ms); + + int64_t getSessionID(int64_t session_timeout_ms) + { + auto result = session_id_counter++; + session_and_timeout.emplace(result, session_timeout_ms); + session_expiry_queue.update(result, session_timeout_ms); + return result; + } ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); - ResponsesForSessions finalize(const RequestsForSessions & expired_requests); - int64_t getSessionID() + void finalize(); + + std::unordered_set getDeadSessions() { - return session_id_counter.fetch_add(1); + return session_expiry_queue.getExpiredSessions(); } }; diff --git a/src/Coordination/NuKeeperStorageDispatcher.cpp b/src/Coordination/NuKeeperStorageDispatcher.cpp new file mode 100644 index 00000000000..570087757ad --- /dev/null +++ b/src/Coordination/NuKeeperStorageDispatcher.cpp @@ -0,0 +1,237 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + + extern const int LOGICAL_ERROR; + extern const int TIMEOUT_EXCEEDED; +} + +NuKeeperStorageDispatcher::NuKeeperStorageDispatcher() + : coordination_settings(std::make_shared()) + , log(&Poco::Logger::get("NuKeeperDispatcher")) +{ +} + +void NuKeeperStorageDispatcher::requestThread() +{ + setThreadName("NuKeeperReqT"); + while (!shutdown_called) + { + NuKeeperStorage::RequestForSession request; + + UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); + + if (requests_queue.tryPop(request, max_wait)) + { + if (shutdown_called) + break; + + try + { + server->putRequest(request); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + } +} + +void NuKeeperStorageDispatcher::responseThread() +{ + setThreadName("NuKeeperRspT"); + while (!shutdown_called) + { + NuKeeperStorage::ResponseForSession response_for_session; + + UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); + + if (responses_queue.tryPop(response_for_session, max_wait)) + { + if (shutdown_called) + break; + + try + { + setResponse(response_for_session.session_id, response_for_session.response); + } + catch (...) 
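// Note on the members above: zxid and session_id_counter lost their std::atomic
// wrappers because every access now happens under the state machine's
// storage_lock on the Raft commit path. Session creation in one line
// (`storage` is a hypothetical instance):
int64_t id = storage.getSessionID(30000);   // assigns the id and arms its expiry
// getZXID() simply returns zxid++, so zxid order equals commit order.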
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + } + } +} + +void NuKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) +{ + std::lock_guard lock(session_to_response_callback_mutex); + auto session_writer = session_to_response_callback.find(session_id); + if (session_writer == session_to_response_callback.end()) + return; + + session_writer->second(response); + /// Session closed, no more writes + if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) + session_to_response_callback.erase(session_writer); +} + +bool NuKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id) +{ + { + std::lock_guard lock(session_to_response_callback_mutex); + if (session_to_response_callback.count(session_id) == 0) + return false; + } + + NuKeeperStorage::RequestForSession request_info; + request_info.request = request; + request_info.session_id = session_id; + + std::lock_guard lock(push_request_mutex); + /// Put close requests without timeouts + if (request->getOpNum() == Coordination::OpNum::Close) + requests_queue.push(std::move(request_info)); + else if (!requests_queue.tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds())) + throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); + return true; +} + +void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration & config) +{ + LOG_DEBUG(log, "Initializing storage dispatcher"); + int myid = config.getInt("test_keeper_server.server_id"); + + coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config); + + server = std::make_unique(myid, coordination_settings, config, responses_queue); + try + { + LOG_DEBUG(log, "Waiting server to initialize"); + server->startup(); + LOG_DEBUG(log, "Server initialized, waiting for quorum"); + + server->waitInit(); + LOG_DEBUG(log, "Quorum initialized"); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } + + request_thread = ThreadFromGlobalPool([this] { requestThread(); }); + responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); + + LOG_DEBUG(log, "Dispatcher initialized"); +} + +void NuKeeperStorageDispatcher::shutdown() +{ + try + { + { + std::lock_guard lock(push_request_mutex); + + if (shutdown_called) + return; + + LOG_DEBUG(log, "Shutting down storage dispatcher"); + shutdown_called = true; + + if (session_cleaner_thread.joinable()) + session_cleaner_thread.join(); + + if (request_thread.joinable()) + request_thread.join(); + + if (responses_thread.joinable()) + responses_thread.join(); + } + + if (server) + server->shutdown(); + + NuKeeperStorage::RequestForSession request_for_session; + while (requests_queue.tryPop(request_for_session)) + { + auto response = request_for_session.request->makeResponse(); + response->error = Coordination::Error::ZSESSIONEXPIRED; + setResponse(request_for_session.session_id, response); + } + session_to_response_callback.clear(); + } + catch (...) 
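// putRequest() above applies backpressure through a bounded queue: ordinary
// requests wait up to operation_timeout_ms for a slot, while Close is pushed
// unconditionally so sessions can always terminate. Hedged caller-side sketch:
dispatcher.registerSession(session_id, [](const Coordination::ZooKeeperResponsePtr & response)
{
    /// deliver the response to the client connection (hypothetical callback)
});
if (!dispatcher.putRequest(request, session_id))
{
    /// false means the session is not registered (expired or never created)
}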
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + LOG_DEBUG(log, "Dispatcher shut down"); +} + +NuKeeperStorageDispatcher::~NuKeeperStorageDispatcher() +{ + shutdown(); +} + +void NuKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback) +{ + std::lock_guard lock(session_to_response_callback_mutex); + if (!session_to_response_callback.try_emplace(session_id, callback).second) + throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id); +} + +void NuKeeperStorageDispatcher::sessionCleanerTask() +{ + while (true) + { + if (shutdown_called) + return; + + try + { + if (isLeader()) + { + auto dead_sessions = server->getDeadSessions(); + for (int64_t dead_session : dead_sessions) + { + LOG_INFO(log, "Found dead session {}, will try to close it", dead_session); + Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close); + request->xid = Coordination::CLOSE_XID; + NuKeeperStorage::RequestForSession request_info; + request_info.request = request; + request_info.session_id = dead_session; + { + std::lock_guard lock(push_request_mutex); + requests_queue.push(std::move(request_info)); + } + finishSession(dead_session); + LOG_INFO(log, "Dead session close request pushed"); + } + } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(coordination_settings->dead_session_check_period_ms.totalMilliseconds())); + } +} + +void NuKeeperStorageDispatcher::finishSession(int64_t session_id) +{ + std::lock_guard lock(session_to_response_callback_mutex); + auto session_it = session_to_response_callback.find(session_id); + if (session_it != session_to_response_callback.end()) + session_to_response_callback.erase(session_it); +} + +} diff --git a/src/Coordination/NuKeeperStorageDispatcher.h b/src/Coordination/NuKeeperStorageDispatcher.h new file mode 100644 index 00000000000..62144b92a7a --- /dev/null +++ b/src/Coordination/NuKeeperStorageDispatcher.h @@ -0,0 +1,89 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +# include +# include "config_core.h" +#endif + +#if USE_NURAFT + +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +using ZooKeeperResponseCallback = std::function; + +class NuKeeperStorageDispatcher +{ + +private: + std::mutex push_request_mutex; + + CoordinationSettingsPtr coordination_settings; + using RequestsQueue = ConcurrentBoundedQueue; + RequestsQueue requests_queue{1}; + ResponsesQueue responses_queue; + std::atomic shutdown_called{false}; + using SessionToResponseCallback = std::unordered_map; + + std::mutex session_to_response_callback_mutex; + SessionToResponseCallback session_to_response_callback; + + ThreadFromGlobalPool request_thread; + ThreadFromGlobalPool responses_thread; + + ThreadFromGlobalPool session_cleaner_thread; + + std::unique_ptr server; + + Poco::Logger * log; + +private: + void requestThread(); + void responseThread(); + void sessionCleanerTask(); + void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); + +public: + NuKeeperStorageDispatcher(); + + void initialize(const Poco::Util::AbstractConfiguration & config); + + void shutdown(); + + ~NuKeeperStorageDispatcher(); + + bool putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id); + + bool isLeader() const + { + return server->isLeader(); + } + + bool hasLeader() 
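// sessionCleanerTask() above makes session expiry itself go through Raft: only
// the current leader forges a Close request for each dead session, so replicas
// never race to expire the same session and followers learn about it from the
// replicated log. The forged request is marked with a reserved xid:
Coordination::ZooKeeperRequestPtr close_request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close);
close_request->xid = Coordination::CLOSE_XID;   // distinguishes it from client closes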
const + { + return server->isLeaderAlive(); + } + + int64_t getSessionID(long session_timeout_ms) + { + return server->getSessionID(session_timeout_ms); + } + + void registerSession(int64_t session_id, ZooKeeperResponseCallback callback); + /// Call if we don't need any responses for this session no more (session was expired) + void finishSession(int64_t session_id); +}; + +} + +#endif diff --git a/src/Coordination/NuKeeperStorageSerializer.cpp b/src/Coordination/NuKeeperStorageSerializer.cpp new file mode 100644 index 00000000000..298df45cde0 --- /dev/null +++ b/src/Coordination/NuKeeperStorageSerializer.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + void writeNode(const NuKeeperStorage::Node & node, WriteBuffer & out) + { + Coordination::write(node.data, out); + Coordination::write(node.acls, out); + Coordination::write(node.is_ephemeral, out); + Coordination::write(node.is_sequental, out); + Coordination::write(node.stat, out); + Coordination::write(node.seq_num, out); + } + + void readNode(NuKeeperStorage::Node & node, ReadBuffer & in) + { + Coordination::read(node.data, in); + Coordination::read(node.acls, in); + Coordination::read(node.is_ephemeral, in); + Coordination::read(node.is_sequental, in); + Coordination::read(node.stat, in); + Coordination::read(node.seq_num, in); + } +} + +void NuKeeperStorageSerializer::serialize(const NuKeeperStorage & storage, WriteBuffer & out) +{ + Coordination::write(storage.zxid, out); + Coordination::write(storage.session_id_counter, out); + Coordination::write(storage.container.size(), out); + for (const auto & [path, node] : storage.container) + { + Coordination::write(path, out); + writeNode(node, out); + } + Coordination::write(storage.ephemerals.size(), out); + for (const auto & [session_id, paths] : storage.ephemerals) + { + Coordination::write(session_id, out); + Coordination::write(paths.size(), out); + for (const auto & path : paths) + Coordination::write(path, out); + } +} + +void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffer & in) +{ + int64_t session_id_counter, zxid; + Coordination::read(zxid, in); + Coordination::read(session_id_counter, in); + storage.zxid = zxid; + storage.session_id_counter = session_id_counter; + + size_t container_size; + Coordination::read(container_size, in); + while (storage.container.size() < container_size) + { + std::string path; + Coordination::read(path, in); + NuKeeperStorage::Node node; + readNode(node, in); + storage.container[path] = node; + } + size_t ephemerals_size; + Coordination::read(ephemerals_size, in); + while (storage.ephemerals.size() < ephemerals_size) + { + int64_t session_id; + size_t ephemerals_for_session; + Coordination::read(session_id, in); + Coordination::read(ephemerals_for_session, in); + while (storage.ephemerals[session_id].size() < ephemerals_for_session) + { + std::string ephemeral_path; + Coordination::read(ephemeral_path, in); + storage.ephemerals[session_id].emplace(ephemeral_path); + } + } +} + +} diff --git a/src/Coordination/NuKeeperStorageSerializer.h b/src/Coordination/NuKeeperStorageSerializer.h new file mode 100644 index 00000000000..e54c65a739d --- /dev/null +++ b/src/Coordination/NuKeeperStorageSerializer.h @@ -0,0 +1,17 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class NuKeeperStorageSerializer +{ +public: + static void serialize(const NuKeeperStorage & storage, WriteBuffer & out); + + static void deserialize(NuKeeperStorage & storage, ReadBuffer & in); +}; 
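// Snapshot layout written above, in order: zxid, session_id_counter, the
// container size followed by (path, node) pairs, then the ephemerals map as
// session_id + path count + paths. A hedged in-memory round trip (`storage`
// built elsewhere):
DB::WriteBufferFromOwnString out;
DB::NuKeeperStorageSerializer::serialize(storage, out);
DB::ReadBufferFromString in(out.str());
DB::NuKeeperStorage restored(/* tick_time_ms */ 500);
DB::NuKeeperStorageSerializer::deserialize(restored, in);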
+ +} diff --git a/src/Coordination/ReadBufferFromNuraftBuffer.h b/src/Coordination/ReadBufferFromNuraftBuffer.h new file mode 100644 index 00000000000..3817e217881 --- /dev/null +++ b/src/Coordination/ReadBufferFromNuraftBuffer.h @@ -0,0 +1,20 @@ +#pragma once +#include + +#include // Y_IGNORE + +namespace DB +{ + +class ReadBufferFromNuraftBuffer : public ReadBufferFromMemory +{ +public: + explicit ReadBufferFromNuraftBuffer(nuraft::ptr buffer) + : ReadBufferFromMemory(buffer->data_begin(), buffer->size()) + {} + explicit ReadBufferFromNuraftBuffer(nuraft::buffer & buffer) + : ReadBufferFromMemory(buffer.data_begin(), buffer.size()) + {} +}; + +} diff --git a/src/Coordination/SessionExpiryQueue.cpp b/src/Coordination/SessionExpiryQueue.cpp new file mode 100644 index 00000000000..51837087af5 --- /dev/null +++ b/src/Coordination/SessionExpiryQueue.cpp @@ -0,0 +1,83 @@ +#include +#include +namespace DB +{ + +bool SessionExpiryQueue::remove(int64_t session_id) +{ + auto session_it = session_to_timeout.find(session_id); + if (session_it != session_to_timeout.end()) + { + auto set_it = expiry_to_sessions.find(session_it->second); + if (set_it != expiry_to_sessions.end()) + set_it->second.erase(session_id); + + return true; + } + + return false; +} + +bool SessionExpiryQueue::update(int64_t session_id, int64_t timeout_ms) +{ + auto session_it = session_to_timeout.find(session_id); + int64_t now = getNowMilliseconds(); + int64_t new_expiry_time = roundToNextInterval(now + timeout_ms); + + if (session_it != session_to_timeout.end()) + { + if (new_expiry_time == session_it->second) + return false; + + auto set_it = expiry_to_sessions.find(new_expiry_time); + if (set_it == expiry_to_sessions.end()) + std::tie(set_it, std::ignore) = expiry_to_sessions.emplace(new_expiry_time, std::unordered_set()); + + set_it->second.insert(session_id); + int64_t prev_expiry_time = session_it->second; + + if (prev_expiry_time != new_expiry_time) + { + auto prev_set_it = expiry_to_sessions.find(prev_expiry_time); + if (prev_set_it != expiry_to_sessions.end()) + prev_set_it->second.erase(session_id); + } + session_it->second = new_expiry_time; + return true; + } + else + { + session_to_timeout[session_id] = new_expiry_time; + auto set_it = expiry_to_sessions.find(new_expiry_time); + if (set_it == expiry_to_sessions.end()) + std::tie(set_it, std::ignore) = expiry_to_sessions.emplace(new_expiry_time, std::unordered_set()); + set_it->second.insert(session_id); + return false; + } +} + +std::unordered_set SessionExpiryQueue::getExpiredSessions() +{ + int64_t now = getNowMilliseconds(); + if (now < next_expiration_time) + return {}; + + auto set_it = expiry_to_sessions.find(next_expiration_time); + int64_t new_expiration_time = next_expiration_time + expiration_interval; + next_expiration_time = new_expiration_time; + if (set_it != expiry_to_sessions.end()) + { + auto result = set_it->second; + expiry_to_sessions.erase(set_it); + return result; + } + return {}; +} + +void SessionExpiryQueue::clear() +{ + session_to_timeout.clear(); + expiry_to_sessions.clear(); +} + +} diff --git a/src/Coordination/SessionExpiryQueue.h b/src/Coordination/SessionExpiryQueue.h new file mode 100644 index 00000000000..dff629a2432 --- /dev/null +++ b/src/Coordination/SessionExpiryQueue.h @@ -0,0 +1,45 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class SessionExpiryQueue +{ +private: + std::unordered_map session_to_timeout; + std::unordered_map> expiry_to_sessions; + + int64_t expiration_interval; + int64_t 
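// SessionExpiryQueue rounds every deadline up to the next expiration_interval
// boundary, so sessions expiring close together share one bucket and
// getExpiredSessions() retires a whole bucket per call. With a 500 ms interval:
//   roundToNextInterval(1234) == 1500
//   roundToNextInterval(1500) == 2000   (an exact boundary still moves forward)
// The forward move guarantees a session gets at least its full timeout.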
next_expiration_time; + + static int64_t getNowMilliseconds() + { + using namespace std::chrono; + return duration_cast(system_clock::now().time_since_epoch()).count(); + } + + int64_t roundToNextInterval(int64_t time) const + { + return (time / expiration_interval + 1) * expiration_interval; + } + +public: + explicit SessionExpiryQueue(int64_t expiration_interval_) + : expiration_interval(expiration_interval_) + , next_expiration_time(roundToNextInterval(getNowMilliseconds())) + { + } + + bool remove(int64_t session_id); + + bool update(int64_t session_id, int64_t timeout_ms); + + std::unordered_set getExpiredSessions(); + + void clear(); +}; + +} diff --git a/src/Coordination/SummingStateMachine.cpp b/src/Coordination/SummingStateMachine.cpp new file mode 100644 index 00000000000..0cb7a7da6c3 --- /dev/null +++ b/src/Coordination/SummingStateMachine.cpp @@ -0,0 +1,167 @@ +#include +#include +#include + +namespace DB +{ + +static constexpr int MAX_SNAPSHOTS = 3; + +static int64_t deserializeValue(nuraft::buffer & buffer) +{ + nuraft::buffer_serializer bs(buffer); + int64_t result; + memcpy(&result, bs.get_raw(buffer.size()), sizeof(result)); + return result; +} + +SummingStateMachine::SummingStateMachine() + : value(0) + , last_committed_idx(0) +{ +} + +nuraft::ptr SummingStateMachine::commit(const size_t log_idx, nuraft::buffer & data) +{ + int64_t value_to_add = deserializeValue(data); + + value += value_to_add; + last_committed_idx = log_idx; + + // Return Raft log number as a return result. + nuraft::ptr ret = nuraft::buffer::alloc(sizeof(log_idx)); + nuraft::buffer_serializer bs(ret); + bs.put_u64(log_idx); + return ret; +} + +bool SummingStateMachine::apply_snapshot(nuraft::snapshot & s) +{ + std::lock_guard ll(snapshots_lock); + auto entry = snapshots.find(s.get_last_log_idx()); + if (entry == snapshots.end()) + return false; + + auto ctx = entry->second; + value = ctx->value; + return true; +} + +nuraft::ptr SummingStateMachine::last_snapshot() +{ + // Just return the latest snapshot. + std::lock_guard ll(snapshots_lock); + auto entry = snapshots.rbegin(); + if (entry == snapshots.rend()) + return nullptr; + + auto ctx = entry->second; + return ctx->snapshot; +} + + +void SummingStateMachine::createSnapshotInternal(nuraft::snapshot & s) +{ + // Clone snapshot from `s`. + nuraft::ptr snp_buf = s.serialize(); + nuraft::ptr ss = nuraft::snapshot::deserialize(*snp_buf); + + // Put into snapshot map. + auto ctx = cs_new(ss, value); + snapshots[s.get_last_log_idx()] = ctx; + + // Maintain last 3 snapshots only. + int num = snapshots.size(); + auto entry = snapshots.begin(); + + for (int ii = 0; ii < num - MAX_SNAPSHOTS; ++ii) + { + if (entry == snapshots.end()) + break; + entry = snapshots.erase(entry); + } +} + +void SummingStateMachine::save_logical_snp_obj( + nuraft::snapshot & s, + size_t & obj_id, + nuraft::buffer & data, + bool /*is_first_obj*/, + bool /*is_last_obj*/) +{ + if (obj_id == 0) + { + // Object ID == 0: it contains dummy value, create snapshot context. + createSnapshotInternal(s); + } + else + { + // Object ID > 0: actual snapshot value. + nuraft::buffer_serializer bs(data); + int64_t local_value = static_cast(bs.get_u64()); + + std::lock_guard ll(snapshots_lock); + auto entry = snapshots.find(s.get_last_log_idx()); + assert(entry != snapshots.end()); + entry->second->value = local_value; + } + // Request next object. 
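// Every log entry for this machine is one raw int64 delta, read back by
// deserializeValue() with a plain memcpy. Producer-side sketch (delta and
// raft_instance are hypothetical):
int64_t delta = 7;
auto entry = nuraft::buffer::alloc(sizeof(delta));
nuraft::buffer_serializer bs(entry);
bs.put_raw(&delta, sizeof(delta));          // raw bytes, matching the memcpy above
raft_instance->append_entries({entry});     // commit() then does value += delta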
+ obj_id++; +} + +int SummingStateMachine::read_logical_snp_obj( + nuraft::snapshot & s, + void* & /*user_snp_ctx*/, + size_t obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) +{ + nuraft::ptr ctx = nullptr; + { + std::lock_guard ll(snapshots_lock); + auto entry = snapshots.find(s.get_last_log_idx()); + if (entry == snapshots.end()) + { + // Snapshot doesn't exist. + data_out = nullptr; + is_last_obj = true; + return 0; + } + ctx = entry->second; + } + + if (obj_id == 0) + { + // Object ID == 0: first object, put dummy data. + data_out = nuraft::buffer::alloc(sizeof(Int32)); + nuraft::buffer_serializer bs(data_out); + bs.put_i32(0); + is_last_obj = false; + + } + else + { + // Object ID > 0: second object, put actual value. + data_out = nuraft::buffer::alloc(sizeof(size_t)); + nuraft::buffer_serializer bs(data_out); + bs.put_u64(ctx->value); + is_last_obj = true; + } + return 0; +} + +void SummingStateMachine::create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) +{ + { + std::lock_guard ll(snapshots_lock); + createSnapshotInternal(s); + } + nuraft::ptr except(nullptr); + bool ret = true; + when_done(ret, except); +} + + +} diff --git a/src/Coordination/SummingStateMachine.h b/src/Coordination/SummingStateMachine.h new file mode 100644 index 00000000000..c8594ba7e8d --- /dev/null +++ b/src/Coordination/SummingStateMachine.h @@ -0,0 +1,78 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include + +namespace DB +{ + +/// Example trivial state machine. +class SummingStateMachine : public nuraft::state_machine +{ +public: + SummingStateMachine(); + + nuraft::ptr pre_commit(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; } + + nuraft::ptr commit(const size_t log_idx, nuraft::buffer & data) override; + + void rollback(const size_t /*log_idx*/, nuraft::buffer & /*data*/) override {} + + size_t last_commit_index() override { return last_committed_idx; } + + bool apply_snapshot(nuraft::snapshot & s) override; + + nuraft::ptr last_snapshot() override; + + void create_snapshot( + nuraft::snapshot & s, + nuraft::async_result::handler_type & when_done) override; + + void save_logical_snp_obj( + nuraft::snapshot & s, + size_t & obj_id, + nuraft::buffer & data, + bool is_first_obj, + bool is_last_obj) override; + + int read_logical_snp_obj( + nuraft::snapshot & s, + void* & user_snp_ctx, + size_t obj_id, + nuraft::ptr & data_out, + bool & is_last_obj) override; + + int64_t getValue() const { return value; } + +private: + struct SingleValueSnapshotContext + { + SingleValueSnapshotContext(nuraft::ptr & s, int64_t v) + : snapshot(s) + , value(v) + {} + + nuraft::ptr snapshot; + int64_t value; + }; + + void createSnapshotInternal(nuraft::snapshot & s); + + // State machine's current value. + std::atomic value; + + // Last committed Raft log number. + std::atomic last_committed_idx; + + // Keeps the last 3 snapshots, by their Raft log numbers. + std::map> snapshots; + + // Mutex for `snapshots_`. + std::mutex snapshots_lock; + +}; + +} diff --git a/src/Coordination/ThreadSafeQueue.h b/src/Coordination/ThreadSafeQueue.h new file mode 100644 index 00000000000..d36e25244bb --- /dev/null +++ b/src/Coordination/ThreadSafeQueue.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Queue with mutex and condvar. As simple as possible. 
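+/// A minimal usage sketch (hypothetical names, for illustration only):
+///
+///     ThreadSafeQueue<int> queue;
+///     queue.push(42);                /// producer side, wakes one waiting consumer
+///     int value;
+///     if (queue.tryPop(value, 100))  /// consumer side, waits up to 100 ms
+///         consume(value);            /// `consume` is a placeholder
+///
+/// Note that tryPop() with the default timeout_ms = 0 only polls: it returns false
+/// immediately if the queue is empty.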
+template <typename T>
+class ThreadSafeQueue
+{
+private:
+    mutable std::mutex queue_mutex;
+    std::condition_variable cv;
+    std::queue<T> queue;
+public:
+
+    void push(const T & response)
+    {
+        std::lock_guard lock(queue_mutex);
+        queue.push(response);
+        cv.notify_one();
+    }
+
+    bool tryPop(T & response, int64_t timeout_ms = 0)
+    {
+        std::unique_lock lock(queue_mutex);
+        if (!cv.wait_for(lock,
+                         std::chrono::milliseconds(timeout_ms), [this] { return !queue.empty(); }))
+            return false;
+
+        response = queue.front();
+        queue.pop();
+        return true;
+    }
+
+    size_t size() const
+    {
+        std::lock_guard lock(queue_mutex);
+        return queue.size();
+    }
+};
+
+}
diff --git a/src/Coordination/WriteBufferFromNuraftBuffer.cpp b/src/Coordination/WriteBufferFromNuraftBuffer.cpp
new file mode 100644
index 00000000000..1a16b7cef24
--- /dev/null
+++ b/src/Coordination/WriteBufferFromNuraftBuffer.cpp
@@ -0,0 +1,71 @@
+#include <Coordination/WriteBufferFromNuraftBuffer.h>
+#include <Common/Exception.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_WRITE_AFTER_END_OF_BUFFER;
+}
+
+void WriteBufferFromNuraftBuffer::nextImpl()
+{
+    if (is_finished)
+        throw Exception("WriteBufferFromNuraftBuffer is finished", ErrorCodes::CANNOT_WRITE_AFTER_END_OF_BUFFER);
+
+    /// pos may not be equal to vector.data() + old_size, because WriteBuffer::next() can be used to flush data
+    size_t pos_offset = pos - reinterpret_cast<Position>(buffer->data_begin());
+    size_t old_size = buffer->size();
+    if (pos_offset == old_size)
+    {
+        nuraft::ptr<nuraft::buffer> new_buffer = nuraft::buffer::alloc(old_size * size_multiplier);
+        memcpy(new_buffer->data_begin(), buffer->data_begin(), buffer->size());
+        buffer = new_buffer;
+    }
+    internal_buffer = Buffer(reinterpret_cast<Position>(buffer->data_begin() + pos_offset), reinterpret_cast<Position>(buffer->data_begin() + buffer->size()));
+    working_buffer = internal_buffer;
+
+}
+
+WriteBufferFromNuraftBuffer::WriteBufferFromNuraftBuffer()
+    : WriteBuffer(nullptr, 0)
+{
+    buffer = nuraft::buffer::alloc(initial_size);
+    set(reinterpret_cast<Position>(buffer->data_begin()), buffer->size());
+}
+
+void WriteBufferFromNuraftBuffer::finalize()
+{
+    if (is_finished)
+        return;
+
+    is_finished = true;
+    size_t real_size = pos - reinterpret_cast<Position>(buffer->data_begin());
+    nuraft::ptr<nuraft::buffer> new_buffer = nuraft::buffer::alloc(real_size);
+    memcpy(new_buffer->data_begin(), buffer->data_begin(), real_size);
+    buffer = new_buffer;
+
+    /// Prevent further writes.
+    set(nullptr, 0);
+}
+
+nuraft::ptr<nuraft::buffer> WriteBufferFromNuraftBuffer::getBuffer()
+{
+    finalize();
+    return buffer;
+}
+
+WriteBufferFromNuraftBuffer::~WriteBufferFromNuraftBuffer()
+{
+    try
+    {
+        finalize();
+    }
+    catch (...)
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + +} diff --git a/src/Coordination/WriteBufferFromNuraftBuffer.h b/src/Coordination/WriteBufferFromNuraftBuffer.h new file mode 100644 index 00000000000..d037a0e6a27 --- /dev/null +++ b/src/Coordination/WriteBufferFromNuraftBuffer.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include // Y_IGNORE + +namespace DB +{ + +class WriteBufferFromNuraftBuffer : public WriteBuffer +{ +private: + nuraft::ptr buffer; + bool is_finished = false; + + static constexpr size_t initial_size = 32; + static constexpr size_t size_multiplier = 2; + + void nextImpl() override; + +public: + WriteBufferFromNuraftBuffer(); + + void finalize() override final; + nuraft::ptr getBuffer(); + bool isFinished() const { return is_finished; } + + ~WriteBufferFromNuraftBuffer() override; +}; + +} diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp new file mode 100644 index 00000000000..ed9777350c5 --- /dev/null +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -0,0 +1,336 @@ +#include + +#if !defined(ARCADIA_BUILD) +# include +# include "config_core.h" +#endif + +#if USE_NURAFT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // Y_IGNORE +#include + + +TEST(CoordinationTest, BuildTest) +{ + DB::InMemoryLogStore store; + DB::SummingStateMachine machine; + EXPECT_EQ(1, 1); +} + +TEST(CoordinationTest, BufferSerde) +{ + Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Get); + request->xid = 3; + dynamic_cast(request.get())->path = "/path/value"; + + DB::WriteBufferFromNuraftBuffer wbuf; + request->write(wbuf); + auto nuraft_buffer = wbuf.getBuffer(); + EXPECT_EQ(nuraft_buffer->size(), 28); + + DB::ReadBufferFromNuraftBuffer rbuf(nuraft_buffer); + + int32_t length; + Coordination::read(length, rbuf); + EXPECT_EQ(length + sizeof(length), nuraft_buffer->size()); + + int32_t xid; + Coordination::read(xid, rbuf); + EXPECT_EQ(xid, request->xid); + + Coordination::OpNum opnum; + Coordination::read(opnum, rbuf); + + Coordination::ZooKeeperRequestPtr request_read = Coordination::ZooKeeperRequestFactory::instance().get(opnum); + request_read->xid = xid; + request_read->readImpl(rbuf); + + EXPECT_EQ(request_read->getOpNum(), Coordination::OpNum::Get); + EXPECT_EQ(request_read->xid, 3); + EXPECT_EQ(dynamic_cast(request_read.get())->path, "/path/value"); +} + +template +struct SimpliestRaftServer +{ + SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_) + : server_id(server_id_) + , hostname(hostname_) + , port(port_) + , endpoint(hostname + ":" + std::to_string(port)) + , state_machine(nuraft::cs_new()) + , state_manager(nuraft::cs_new(server_id, hostname, port)) + { + nuraft::raft_params params; + params.heart_beat_interval_ = 100; + params.election_timeout_lower_bound_ = 200; + params.election_timeout_upper_bound_ = 400; + params.reserved_log_items_ = 5; + params.snapshot_distance_ = 1; /// forcefully send snapshots + params.client_req_timeout_ = 3000; + params.return_method_ = nuraft::raft_params::blocking; + + raft_instance = launcher.init( + state_machine, state_manager, nuraft::cs_new("ToyRaftLogger", DB::LogsLevel::trace), port, + nuraft::asio_service::options{}, params); + + if (!raft_instance) + { + std::cerr << "Failed to initialize launcher (see the message " + "in the log file)." 
<< std::endl; + exit(-1); + } + std::cout << "init Raft instance " << server_id; + for (size_t ii = 0; ii < 20; ++ii) + { + if (raft_instance->is_initialized()) + { + std::cout << " done" << std::endl; + break; + } + std::cout << "."; + fflush(stdout); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + } + + // Server ID. + int server_id; + + // Server address. + std::string hostname; + + // Server port. + int port; + + std::string endpoint; + + // State machine. + nuraft::ptr state_machine; + + // State manager. + nuraft::ptr state_manager; + + // Raft launcher. + nuraft::raft_launcher launcher; + + // Raft server instance. + nuraft::ptr raft_instance; +}; + +using SummingRaftServer = SimpliestRaftServer; + +nuraft::ptr getLogEntry(int64_t number) +{ + nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); + nuraft::buffer_serializer bs(ret); + // WARNING: We don't consider endian-safety in this example. + bs.put_raw(&number, sizeof(number)); + return ret; +} + + +TEST(CoordinationTest, TestSummingRaft1) +{ + SummingRaftServer s1(1, "localhost", 44444); + + /// Single node is leader + EXPECT_EQ(s1.raft_instance->get_leader(), 1); + + auto entry1 = getLogEntry(143); + auto ret = s1.raft_instance->append_entries({entry1}); + EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); + EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); + + while (s1.state_machine->getValue() != 143) + { + std::cout << "Waiting s1 to apply entry\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + EXPECT_EQ(s1.state_machine->getValue(), 143); + + s1.launcher.shutdown(5); +} + +TEST(CoordinationTest, TestSummingRaft3) +{ + SummingRaftServer s1(1, "localhost", 44444); + SummingRaftServer s2(2, "localhost", 44445); + SummingRaftServer s3(3, "localhost", 44446); + + nuraft::srv_config first_config(1, "localhost:44444"); + auto ret1 = s2.raft_instance->add_srv(first_config); + if (!ret1->get_accepted()) + { + std::cout << "failed to add server: " + << ret1->get_result_str() << std::endl; + EXPECT_TRUE(false); + } + + while (s1.raft_instance->get_leader() != 2) + { + std::cout << "Waiting s1 to join to s2 quorum\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + nuraft::srv_config third_config(3, "localhost:44446"); + auto ret3 = s2.raft_instance->add_srv(third_config); + if (!ret3->get_accepted()) + { + std::cout << "failed to add server: " + << ret3->get_result_str() << std::endl; + EXPECT_TRUE(false); + } + + while (s3.raft_instance->get_leader() != 2) + { + std::cout << "Waiting s3 to join to s2 quorum\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + /// S2 is leader + EXPECT_EQ(s1.raft_instance->get_leader(), 2); + EXPECT_EQ(s2.raft_instance->get_leader(), 2); + EXPECT_EQ(s3.raft_instance->get_leader(), 2); + + std::cerr << "Starting to add entries\n"; + auto entry = getLogEntry(1); + auto ret = s2.raft_instance->append_entries({entry}); + EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); + EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); + + while (s1.state_machine->getValue() != 1) + { + std::cout << "Waiting s1 to apply entry\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + while (s2.state_machine->getValue() != 1) + { + std::cout << "Waiting s2 to apply entry\n"; + 
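+        // (Note: with raft_params::blocking, append_entries() returns once the entry
+        // is committed on the leader; followers still apply it asynchronously, hence
+        // these polling loops.)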
std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + while (s3.state_machine->getValue() != 1) + { + std::cout << "Waiting s3 to apply entry\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + EXPECT_EQ(s1.state_machine->getValue(), 1); + EXPECT_EQ(s2.state_machine->getValue(), 1); + EXPECT_EQ(s3.state_machine->getValue(), 1); + + auto non_leader_entry = getLogEntry(3); + auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry}); + + EXPECT_FALSE(ret_non_leader1->get_accepted()); + + auto ret_non_leader3 = s3.raft_instance->append_entries({non_leader_entry}); + + EXPECT_FALSE(ret_non_leader3->get_accepted()); + + auto leader_entry = getLogEntry(77); + auto ret_leader = s2.raft_instance->append_entries({leader_entry}); + EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code(); + EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code(); + + while (s1.state_machine->getValue() != 78) + { + std::cout << "Waiting s1 to apply entry\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + while (s3.state_machine->getValue() != 78) + { + std::cout << "Waiting s3 to apply entry\n"; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + + EXPECT_EQ(s1.state_machine->getValue(), 78); + EXPECT_EQ(s2.state_machine->getValue(), 78); + EXPECT_EQ(s3.state_machine->getValue(), 78); + + s1.launcher.shutdown(5); + s2.launcher.shutdown(5); + s3.launcher.shutdown(5); +} + +nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) +{ + DB::WriteBufferFromNuraftBuffer buf; + DB::writeIntBinary(session_id, buf); + request->write(buf); + return buf.getBuffer(); +} + +DB::NuKeeperStorage::ResponsesForSessions getZooKeeperResponses(nuraft::ptr & buffer, const Coordination::ZooKeeperRequestPtr & request) +{ + DB::NuKeeperStorage::ResponsesForSessions results; + DB::ReadBufferFromNuraftBuffer buf(buffer); + while (!buf.eof()) + { + int64_t session_id; + DB::readIntBinary(session_id, buf); + + int32_t length; + Coordination::XID xid; + int64_t zxid; + Coordination::Error err; + + Coordination::read(length, buf); + Coordination::read(xid, buf); + Coordination::read(zxid, buf); + Coordination::read(err, buf); + auto response = request->makeResponse(); + response->readImpl(buf); + results.push_back(DB::NuKeeperStorage::ResponseForSession{session_id, response}); + } + return results; +} + +TEST(CoordinationTest, TestStorageSerialization) +{ + DB::NuKeeperStorage storage(500); + storage.container["/hello"] = DB::NuKeeperStorage::Node{.data="world"}; + storage.container["/hello/somepath"] = DB::NuKeeperStorage::Node{.data="somedata"}; + storage.session_id_counter = 5; + storage.zxid = 156; + storage.ephemerals[3] = {"/hello", "/"}; + storage.ephemerals[1] = {"/hello/somepath"}; + + DB::WriteBufferFromOwnString buffer; + DB::NuKeeperStorageSerializer serializer; + serializer.serialize(storage, buffer); + std::string serialized = buffer.str(); + EXPECT_NE(serialized.size(), 0); + DB::ReadBufferFromString read(serialized); + DB::NuKeeperStorage new_storage(500); + serializer.deserialize(new_storage, read); + + EXPECT_EQ(new_storage.container.size(), 3); + EXPECT_EQ(new_storage.container["/hello"].data, "world"); + EXPECT_EQ(new_storage.container["/hello/somepath"].data, "somedata"); + EXPECT_EQ(new_storage.session_id_counter, 5); + EXPECT_EQ(new_storage.zxid, 156); 
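+    // (Clarifying note on container.size() == 3 above: NuKeeperStorage always keeps
+    // the root node "/", so the two nodes inserted by the test make three in total.)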
+ EXPECT_EQ(new_storage.ephemerals.size(), 2); + EXPECT_EQ(new_storage.ephemerals[3].size(), 2); + EXPECT_EQ(new_storage.ephemerals[1].size(), 1); +} + +#endif diff --git a/src/Coordination/ya.make b/src/Coordination/ya.make new file mode 100644 index 00000000000..f3eae68806c --- /dev/null +++ b/src/Coordination/ya.make @@ -0,0 +1,13 @@ +# This file is generated automatically, do not edit. See 'ya.make.in' and use 'utils/generate-ya-make' to regenerate it. +OWNER(g:clickhouse) + +LIBRARY() + +PEERDIR( + clickhouse/src/Common +) + +SRCS( +) + +END() diff --git a/src/Coordination/ya.make.in b/src/Coordination/ya.make.in new file mode 100644 index 00000000000..ba5f8bcbea4 --- /dev/null +++ b/src/Coordination/ya.make.in @@ -0,0 +1,12 @@ +OWNER(g:clickhouse) + +LIBRARY() + +PEERDIR( + clickhouse/src/Common +) + +SRCS( +) + +END() diff --git a/src/Core/DecimalComparison.h b/src/Core/DecimalComparison.h index aaf471cefd8..8279d01d35a 100644 --- a/src/Core/DecimalComparison.h +++ b/src/Core/DecimalComparison.h @@ -21,7 +21,7 @@ namespace ErrorCodes extern const int DECIMAL_OVERFLOW; } -/// + inline bool allowDecimalComparison(const DataTypePtr & left_type, const DataTypePtr & right_type) { if (isColumnedAsDecimal(left_type)) @@ -30,7 +30,9 @@ inline bool allowDecimalComparison(const DataTypePtr & left_type, const DataType return true; } else if (isNotDecimalButComparableToDecimal(left_type) && isColumnedAsDecimal(right_type)) + { return true; + } return false; } @@ -252,9 +254,9 @@ private: else { if constexpr (scale_left) - x *= scale; + x = common::mulIgnoreOverflow(x, scale); if constexpr (scale_right) - y *= scale; + y = common::mulIgnoreOverflow(y, scale); } return Op::apply(x, y); diff --git a/src/Core/config_core.h.in b/src/Core/config_core.h.in index 6c7a35abd7c..666ef32efdf 100644 --- a/src/Core/config_core.h.in +++ b/src/Core/config_core.h.in @@ -13,3 +13,4 @@ #cmakedefine01 USE_LDAP #cmakedefine01 USE_ROCKSDB #cmakedefine01 USE_LIBPQXX +#cmakedefine01 USE_NURAFT diff --git a/src/DataStreams/IBlockOutputStream.h b/src/DataStreams/IBlockOutputStream.h index 4cc1257e955..79c13b6fa47 100644 --- a/src/DataStreams/IBlockOutputStream.h +++ b/src/DataStreams/IBlockOutputStream.h @@ -57,7 +57,7 @@ public: */ virtual std::string getContentType() const { return "text/plain; charset=UTF-8"; } - virtual ~IBlockOutputStream() {} + virtual ~IBlockOutputStream() = default; /** Don't let to alter table while instance of stream is alive. 
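      * In practice the stream is expected to keep the table lock (a TableLockHolder)
      * alive for its whole lifetime; see addTableLock() in this header (explanatory note).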
*/ diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index d9128151403..c861b3bcac0 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -120,14 +120,17 @@ public: return DecimalUtils::getFractionalPart(x, scale); } - T maxWholeValue() const { return getScaleMultiplier(maxPrecision() - scale) - T(1); } + T maxWholeValue() const { return getScaleMultiplier(precision - scale) - T(1); } - bool canStoreWhole(T x) const + template + bool canStoreWhole(U x) const { + static_assert(std::is_signed_v); T max = maxWholeValue(); - if (x > max || x < -max) - return false; - return true; + if constexpr (std::is_signed_v) + return -max <= x && x <= max; + else + return x <= static_cast>(max.value); } /// @returns multiplier for U to become T with correct scale diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index cbbc203bf4f..1491eabfbd5 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -70,4 +71,21 @@ public: bool canBeInsideLowCardinality() const override { return true; } }; +/// Prevent implicit template instantiation of DataTypeNumberBase for common numeric types + +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; // base for UUID +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; +extern template class DataTypeNumberBase; + } diff --git a/src/Dictionaries/RangeDictionaryBlockInputStream.h b/src/Dictionaries/RangeDictionaryBlockInputStream.h index 3da43c85c45..ccd77d49e0f 100644 --- a/src/Dictionaries/RangeDictionaryBlockInputStream.h +++ b/src/Dictionaries/RangeDictionaryBlockInputStream.h @@ -47,7 +47,8 @@ private: const std::string & default_name, const std::unordered_set & column_names_set, const PaddedPODArray & values, - ColumnsWithTypeAndName & columns) const; + ColumnsWithTypeAndName & columns, + bool force = false) const; Block fillBlock( const PaddedPODArray & ids_to_fill, @@ -121,13 +122,14 @@ void RangeDictionaryBlockInputStream::addSpecial const std::string & default_name, const std::unordered_set & column_names_set, const PaddedPODArray & values, - ColumnsWithTypeAndName & columns) const + ColumnsWithTypeAndName & columns, + bool force) const { std::string name = default_name; if (attribute) name = attribute->name; - if (column_names_set.find(name) != column_names_set.end()) + if (force || column_names_set.find(name) != column_names_set.end()) columns.emplace_back(getColumnFromPODArray(values), type, name); } @@ -159,7 +161,7 @@ Block RangeDictionaryBlockInputStream::fillBlock std::unordered_set names(column_names.begin(), column_names.end()); - addSpecialColumn(structure.id, std::make_shared(), "ID", names, ids_to_fill, columns); + addSpecialColumn(structure.id, std::make_shared(), "ID", names, ids_to_fill, columns, true); auto ids_column = columns.back().column; addSpecialColumn(structure.range_min, structure.range_max->type, "Range Start", names, block_start_dates, columns); addSpecialColumn(structure.range_max, 
structure.range_max->type, "Range End", names, block_end_dates, columns); diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index d094d228bae..3ce2f909760 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -7,6 +7,7 @@ #include "DiskS3.h" #include "Disks/DiskCacheWrapper.h" #include "Disks/DiskFactory.h" +#include "Storages/StorageS3Settings.h" #include "ProxyConfiguration.h" #include "ProxyListConfiguration.h" #include "ProxyResolverConfiguration.h" @@ -137,6 +138,8 @@ void registerDiskS3(DiskFactory & factory) uri.is_virtual_hosted_style, config.getString(config_prefix + ".access_key_id", ""), config.getString(config_prefix + ".secret_access_key", ""), + config.getString(config_prefix + ".server_side_encryption_customer_key_base64", ""), + {}, config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false)) ); diff --git a/src/Formats/JSONEachRowUtils.cpp b/src/Formats/JSONEachRowUtils.cpp index 6017f3983c6..28ba625d9fb 100644 --- a/src/Formats/JSONEachRowUtils.cpp +++ b/src/Formats/JSONEachRowUtils.cpp @@ -3,6 +3,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; +} std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, DB::Memory<> & memory, size_t min_chunk_size) { @@ -15,10 +20,18 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D while (loadAtPosition(in, memory, pos) && (balance || memory.size() + static_cast(pos - in.position()) < min_chunk_size)) { + const auto current_object_size = memory.size() + static_cast(pos - in.position()); + if (current_object_size > 10 * min_chunk_size) + throw ParsingException("Size of JSON object is extremely large. Expected not greater than " + + std::to_string(min_chunk_size) + " bytes, but current is " + std::to_string(current_object_size) + + " bytes per row. Increase the value setting 'min_chunk_bytes_for_parallel_parsing' or check your data manually, most likely JSON is malformed", ErrorCodes::INCORRECT_DATA); + if (quotes) { pos = find_first_symbols<'\\', '"'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; if (*pos == '\\') { @@ -35,9 +48,11 @@ std::pair fileSegmentationEngineJSONEachRowImpl(ReadBuffer & in, D else { pos = find_first_symbols<'{', '}', '\\', '"'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; - if (*pos == '{') + else if (*pos == '{') { ++balance; ++pos; diff --git a/src/Functions/DivisionUtils.h b/src/Functions/DivisionUtils.h index d0df7e41af1..2b4c07b1cff 100644 --- a/src/Functions/DivisionUtils.h +++ b/src/Functions/DivisionUtils.h @@ -6,11 +6,11 @@ #include #include - #if !defined(ARCADIA_BUILD) # include #endif + namespace DB { @@ -90,17 +90,26 @@ struct DivideIntegralImpl } else { + /// Comparisons are not strict to avoid rounding issues when operand is implicitly casted to float. 
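+        /// (Worked example: std::numeric_limits<Int64>::max() == 2^63 - 1 is not
+        /// representable as a double and rounds up to exactly 2^63. A floating-point
+        /// operand equal to 2^63 passes a strict `a > max` check, yet still overflows
+        /// when cast to Int64, so the boundary value itself must be rejected too.)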
+ if constexpr (std::is_floating_point_v) - if (isNaN(a) || a > std::numeric_limits::max() || a < std::numeric_limits::lowest()) + if (isNaN(a) || a >= std::numeric_limits::max() || a <= std::numeric_limits::lowest()) throw Exception("Cannot perform integer division on infinite or too large floating point numbers", ErrorCodes::ILLEGAL_DIVISION); if constexpr (std::is_floating_point_v) - if (isNaN(b) || b > std::numeric_limits::max() || b < std::numeric_limits::lowest()) + if (isNaN(b) || b >= std::numeric_limits::max() || b <= std::numeric_limits::lowest()) throw Exception("Cannot perform integer division on infinite or too large floating point numbers", ErrorCodes::ILLEGAL_DIVISION); - return static_cast(checkedDivision(CastA(a), CastB(b))); + auto res = checkedDivision(CastA(a), CastB(b)); + + if constexpr (std::is_floating_point_v) + if (isNaN(res) || res >= std::numeric_limits::max() || res <= std::numeric_limits::lowest()) + throw Exception("Cannot perform integer division, because it will produce infinite or too large number", + ErrorCodes::ILLEGAL_DIVISION); + + return static_cast(res); } } diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index f61c9c91d00..bb85ae32622 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -894,9 +894,8 @@ class FunctionBinaryArithmetic : public IFunction const NativeResultType const_b = helperGetOrConvert(col_right_const, right); const ResultType res = check_decimal_overflow - // the arguments are already scaled after conversion - ? OpImplCheck::template process(const_a, const_b, 1, 1) - : OpImpl::template process(const_a, const_b, 1, 1); + ? OpImplCheck::template process(const_a, const_b, scale_a, scale_b) + : OpImpl::template process(const_a, const_b, scale_a, scale_b); if constexpr (result_is_decimal) return ResultDataType(type.getPrecision(), type.getScale()).createColumnConst( diff --git a/src/Functions/FunctionFile.cpp b/src/Functions/FunctionFile.cpp new file mode 100644 index 00000000000..f477f6123c3 --- /dev/null +++ b/src/Functions/FunctionFile.cpp @@ -0,0 +1,134 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_FILE_NAME; + extern const int DATABASE_ACCESS_DENIED; + extern const int FILE_DOESNT_EXIST; +} + +/// A function to read file as a string. 
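+/// Relative paths are resolved against the server's user_files_path, e.g.:
+///
+///     SELECT file('a.txt');  -- hypothetical example, assumes a.txt exists under user_files_path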
+class FunctionFile : public IFunction +{ +public: + static constexpr auto name = "file"; + static FunctionPtr create(const Context &context) { return std::make_shared(context); } + explicit FunctionFile(const Context &context_) : context(context_) {} + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 1; } + bool isInjective(const ColumnsWithTypeAndName &) const override { return true; } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + if (!isString(arguments[0].type)) + throw Exception(getName() + " is only implemented for types String", ErrorCodes::NOT_IMPLEMENTED); + return std::make_shared(); + } + + bool useDefaultImplementationForConstants() const override { return true; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const ColumnPtr column = arguments[0].column; + const ColumnString * expected = checkAndGetColumn(column.get()); + if (!expected) + throw Exception( + fmt::format("Illegal column {} of argument of function {}", arguments[0].column->getName(), getName()), + ErrorCodes::ILLEGAL_COLUMN); + + const ColumnString::Chars & chars = expected->getChars(); + const ColumnString::Offsets & offsets = expected->getOffsets(); + + std::vector checked_filenames(input_rows_count); + + auto result = ColumnString::create(); + auto & res_chars = result->getChars(); + auto & res_offsets = result->getOffsets(); + + res_offsets.resize(input_rows_count); + + size_t source_offset = 0; + size_t result_offset = 0; + for (size_t row = 0; row < input_rows_count; ++row) + { + const char * filename = reinterpret_cast(&chars[source_offset]); + + const String user_files_path = context.getUserFilesPath(); + String user_files_absolute_path = Poco::Path(user_files_path).makeAbsolute().makeDirectory().toString(); + Poco::Path poco_filepath = Poco::Path(filename); + if (poco_filepath.isRelative()) + poco_filepath = Poco::Path(user_files_absolute_path, poco_filepath); + const String file_absolute_path = poco_filepath.absolute().toString(); + checkReadIsAllowedOrThrow(user_files_absolute_path, file_absolute_path); + + checked_filenames[row] = file_absolute_path; + auto file = Poco::File(file_absolute_path); + + if (!file.exists()) + throw Exception(fmt::format("File {} doesn't exist.", file_absolute_path), ErrorCodes::FILE_DOESNT_EXIST); + + const auto current_file_size = Poco::File(file_absolute_path).getSize(); + + result_offset += current_file_size + 1; + res_offsets[row] = result_offset; + source_offset = offsets[row]; + } + + res_chars.resize(result_offset); + + size_t prev_offset = 0; + + for (size_t row = 0; row < input_rows_count; ++row) + { + auto file_absolute_path = checked_filenames[row]; + ReadBufferFromFile in(file_absolute_path); + char * res_buf = reinterpret_cast(&res_chars[prev_offset]); + + const size_t file_lenght = res_offsets[row] - prev_offset - 1; + prev_offset = res_offsets[row]; + in.readStrict(res_buf, file_lenght); + res_buf[file_lenght] = '\0'; + } + + return result; + } + +private: + + void checkReadIsAllowedOrThrow(const std::string & user_files_absolute_path, const std::string & file_absolute_path) const + { + // If run in Local mode, no need for path checking. 
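+        // (Explanatory note: in server mode the prefix check below is what confines
+        // reads to the user files directory -- the resolved absolute path must start
+        // with user_files_absolute_path, otherwise we throw DATABASE_ACCESS_DENIED.)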
+ if (context.getApplicationType() != Context::ApplicationType::LOCAL) + if (file_absolute_path.find(user_files_absolute_path) != 0) + throw Exception("File is not inside " + user_files_absolute_path, ErrorCodes::DATABASE_ACCESS_DENIED); + + Poco::File path_poco_file = Poco::File(file_absolute_path); + if (path_poco_file.exists() && path_poco_file.isDirectory()) + throw Exception("File can't be a directory", ErrorCodes::INCORRECT_FILE_NAME); + } + + const Context & context; +}; + + +void registerFunctionFile(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/FunctionHelpers.cpp b/src/Functions/FunctionHelpers.cpp index d64646ecaf1..17c28ee3343 100644 --- a/src/Functions/FunctionHelpers.cpp +++ b/src/Functions/FunctionHelpers.cpp @@ -70,8 +70,19 @@ ColumnsWithTypeAndName createBlockWithNestedColumns(const ColumnsWithTypeAndName } else if (const auto * const_column = checkAndGetColumn(*col.column)) { - const auto & nested_col = checkAndGetColumn(const_column->getDataColumn())->getNestedColumnPtr(); - res.emplace_back(ColumnWithTypeAndName{ ColumnConst::create(nested_col, col.column->size()), nested_type, col.name}); + const auto * nullable_column = checkAndGetColumn(const_column->getDataColumn()); + + ColumnPtr nullable_res; + if (nullable_column) + { + const auto & nested_col = nullable_column->getNestedColumnPtr(); + nullable_res = ColumnConst::create(nested_col, col.column->size()); + } + else + { + nullable_res = makeNullable(col.column); + } + res.emplace_back(ColumnWithTypeAndName{ nullable_res, nested_type, col.name }); } else throw Exception("Illegal column for DataTypeNullable", ErrorCodes::ILLEGAL_COLUMN); diff --git a/src/Functions/addressToLine.cpp b/src/Functions/addressToLine.cpp index 59e347dd348..a115b13e54a 100644 --- a/src/Functions/addressToLine.cpp +++ b/src/Functions/addressToLine.cpp @@ -111,12 +111,13 @@ private: if (const auto * object = symbol_index.findObject(reinterpret_cast(addr))) { - auto dwarf_it = cache.dwarfs.try_emplace(object->name, *object->elf).first; + auto dwarf_it = cache.dwarfs.try_emplace(object->name, object->elf).first; if (!std::filesystem::exists(object->name)) return {}; Dwarf::LocationInfo location; - if (dwarf_it->second.findAddress(addr - uintptr_t(object->address_begin), location, Dwarf::LocationInfoMode::FAST)) + std::vector frames; // NOTE: not used in FAST mode. + if (dwarf_it->second.findAddress(addr - uintptr_t(object->address_begin), location, Dwarf::LocationInfoMode::FAST, frames)) { const char * arena_begin = nullptr; WriteBufferFromArena out(cache.arena, arena_begin); diff --git a/src/Functions/array/arrayDifference.cpp b/src/Functions/array/arrayDifference.cpp index 2c71c58867f..b4b30079a4e 100644 --- a/src/Functions/array/arrayDifference.cpp +++ b/src/Functions/array/arrayDifference.cpp @@ -47,6 +47,29 @@ struct ArrayDifferenceImpl } + template + static void NO_SANITIZE_UNDEFINED impl(const Element * __restrict src, Result * __restrict dst, size_t begin, size_t end) + { + /// First element is zero, then the differences of ith and i-1th elements. 
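+        /// For example, src = [1, 4, 9, 16] gives dst = [0, 3, 5, 7].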
+ + Element prev{}; + for (size_t pos = begin; pos < end; ++pos) + { + if (pos == begin) + { + dst[pos] = 0; + prev = src[pos]; + } + else + { + Element curr = src[pos]; + dst[pos] = curr - prev; + prev = curr; + } + } + } + + template static bool executeType(const ColumnPtr & mapped, const ColumnArray & array, ColumnPtr & res_ptr) { @@ -73,14 +96,10 @@ struct ArrayDifferenceImpl size_t pos = 0; for (auto offset : offsets) { - // skip empty arrays - if (pos < offset) - { - res_values[pos] = 0; - for (++pos; pos < offset; ++pos) - res_values[pos] = static_cast(data[pos]) - static_cast(data[pos - 1]); - } + impl(data.data(), res_values.data(), pos, offset); + pos = offset; } + res_ptr = ColumnArray::create(std::move(res_nested), array.getOffsetsPtr()); return true; } diff --git a/src/Functions/ignore.cpp b/src/Functions/ignore.cpp index 6b02c3a462d..1348144cb05 100644 --- a/src/Functions/ignore.cpp +++ b/src/Functions/ignore.cpp @@ -29,6 +29,7 @@ public: } bool useDefaultImplementationForNulls() const override { return false; } + bool isSuitableForConstantFolding() const override { return false; } /// We should never return LowCardinality result, cause we declare that result is always constant zero. /// (in getResultIfAlwaysReturnsConstantAndHasArguments) diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 653922bbced..3438145981b 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -45,6 +45,7 @@ void registerFunctionTimeZone(FunctionFactory &); void registerFunctionRunningAccumulate(FunctionFactory &); void registerFunctionRunningDifference(FunctionFactory &); void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &); +void registerFunctionRunningConcurrency(FunctionFactory &); void registerFunctionFinalizeAggregation(FunctionFactory &); void registerFunctionToLowCardinality(FunctionFactory &); void registerFunctionLowCardinalityIndices(FunctionFactory &); @@ -67,6 +68,7 @@ void registerFunctionInitializeAggregation(FunctionFactory &); void registerFunctionErrorCodeToName(FunctionFactory &); void registerFunctionTcpPort(FunctionFactory &); void registerFunctionByteSize(FunctionFactory &); +void registerFunctionFile(FunctionFactory & factory); #if USE_ICU void registerFunctionConvertCharset(FunctionFactory &); @@ -112,6 +114,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionRunningAccumulate(factory); registerFunctionRunningDifference(factory); registerFunctionRunningDifferenceStartingWithFirstValue(factory); + registerFunctionRunningConcurrency(factory); registerFunctionFinalizeAggregation(factory); registerFunctionToLowCardinality(factory); registerFunctionLowCardinalityIndices(factory); @@ -134,6 +137,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionErrorCodeToName(factory); registerFunctionTcpPort(factory); registerFunctionByteSize(factory); + registerFunctionFile(factory); #if USE_ICU registerFunctionConvertCharset(factory); diff --git a/src/Functions/runningConcurrency.cpp b/src/Functions/runningConcurrency.cpp new file mode 100644 index 00000000000..a225e3152e7 --- /dev/null +++ b/src/Functions/runningConcurrency.cpp @@ -0,0 +1,223 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + namespace ErrorCodes + { + extern const int ILLEGAL_COLUMN; + extern 
const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int INCORRECT_DATA; + } + + template + class ExecutableFunctionRunningConcurrency : public IExecutableFunctionImpl + { + public: + String getName() const override + { + return Name::name; + } + + ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + using ColVecArg = typename ArgDataType::ColumnType; + const ColVecArg * col_begin = checkAndGetColumn(arguments[0].column.get()); + const ColVecArg * col_end = checkAndGetColumn(arguments[1].column.get()); + if (!col_begin || !col_end) + throw Exception("Constant columns are not supported at the moment", + ErrorCodes::ILLEGAL_COLUMN); + const typename ColVecArg::Container & vec_begin = col_begin->getData(); + const typename ColVecArg::Container & vec_end = col_end->getData(); + + using ColVecConc = typename ConcurrencyDataType::ColumnType; + typename ColVecConc::MutablePtr col_concurrency = ColVecConc::create(input_rows_count); + typename ColVecConc::Container & vec_concurrency = col_concurrency->getData(); + + std::multiset ongoing_until; + for (size_t i = 0; i < input_rows_count; ++i) + { + const auto begin = vec_begin[i]; + const auto end = vec_end[i]; + + if (unlikely(begin > end)) + { + const FormatSettings default_format; + WriteBufferFromOwnString buf_begin, buf_end; + arguments[0].type->serializeAsTextQuoted(*(arguments[0].column), i, buf_begin, default_format); + arguments[1].type->serializeAsTextQuoted(*(arguments[1].column), i, buf_end, default_format); + throw Exception( + "Incorrect order of events: " + buf_begin.str() + " > " + buf_end.str(), + ErrorCodes::INCORRECT_DATA); + } + + ongoing_until.insert(end); + + // Erase all the elements from "ongoing_until" which + // are less than or equal to "begin", i.e. durations + // that have already ended. We consider "begin" to be + // inclusive, and "end" to be exclusive. + ongoing_until.erase( + ongoing_until.begin(), ongoing_until.upper_bound(begin)); + + vec_concurrency[i] = ongoing_until.size(); + } + + return col_concurrency; + } + + bool useDefaultImplementationForConstants() const override + { + return true; + } + }; + + template + class FunctionBaseRunningConcurrency : public IFunctionBaseImpl + { + public: + explicit FunctionBaseRunningConcurrency(DataTypes argument_types_, DataTypePtr return_type_) + : argument_types(std::move(argument_types_)) + , return_type(std::move(return_type_)) {} + + String getName() const override + { + return Name::name; + } + + const DataTypes & getArgumentTypes() const override + { + return argument_types; + } + + const DataTypePtr & getResultType() const override + { + return return_type; + } + + ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override + { + return std::make_unique>(); + } + + bool isStateful() const override + { + return true; + } + + private: + DataTypes argument_types; + DataTypePtr return_type; + }; + + template + class RunningConcurrencyOverloadResolver : public IFunctionOverloadResolverImpl + { + template + struct TypeTag + { + using Type = T; + }; + + /// Call a polymorphic lambda with a type tag of src_type. 
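+        /// (Usage note: the lambda receives e.g. TypeTag<DataTypeDate>{}; the callee
+        /// recovers the static type via `using T = typename decltype(tag)::Type;`,
+        /// which is how build() below instantiates the function for the concrete
+        /// argument type from a single runtime switch.)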
+ template + void dispatchForSourceType(const IDataType & src_type, F && f) const + { + WhichDataType which(src_type); + + switch (which.idx) + { + case TypeIndex::Date: f(TypeTag()); break; + case TypeIndex::DateTime: f(TypeTag()); break; + case TypeIndex::DateTime64: f(TypeTag()); break; + default: + throw Exception( + "Arguments for function " + getName() + " must be Date, DateTime, or DateTime64.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + } + + public: + static constexpr auto name = Name::name; + + static FunctionOverloadResolverImplPtr create(const Context &) + { + return std::make_unique>(); + } + + String getName() const override + { + return Name::name; + } + + FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + // The type of the second argument must match with that of the first one. + if (unlikely(!arguments[1].type->equals(*(arguments[0].type)))) + { + throw Exception( + "Function " + getName() + " must be called with two arguments having the same type.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + DataTypes argument_types = { arguments[0].type, arguments[1].type }; + FunctionBaseImplPtr base; + dispatchForSourceType(*(arguments[0].type), [&](auto arg_type_tag) // Throws when the type is inappropriate. + { + using Tag = decltype(arg_type_tag); + using ArgDataType = typename Tag::Type; + + base = std::make_unique>(argument_types, return_type); + }); + + return base; + } + + DataTypePtr getReturnType(const DataTypes &) const override + { + return std::make_shared(); + } + + size_t getNumberOfArguments() const override + { + return 2; + } + + bool isInjective(const ColumnsWithTypeAndName &) const override + { + return false; + } + + bool isStateful() const override + { + return true; + } + + bool useDefaultImplementationForNulls() const override + { + return false; + } + }; + + struct NameRunningConcurrency + { + static constexpr auto name = "runningConcurrency"; + }; + + void registerFunctionRunningConcurrency(FunctionFactory & factory) + { + factory.registerFunction>(); + } +} diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 7f9c7add0b8..ea975901077 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -39,6 +39,7 @@ SRCS( CRC.cpp FunctionFQDN.cpp FunctionFactory.cpp + FunctionFile.cpp FunctionHelpers.cpp FunctionJoinGet.cpp FunctionsAES.cpp @@ -423,6 +424,7 @@ SRCS( rowNumberInAllBlocks.cpp rowNumberInBlock.cpp runningAccumulate.cpp + runningConcurrency.cpp runningDifference.cpp runningDifferenceStartingWithFirstValue.cpp sigmoid.cpp diff --git a/src/IO/AsynchronousWriteBuffer.h b/src/IO/AsynchronousWriteBuffer.h index 74b5804691b..8c44f8c7d4a 100644 --- a/src/IO/AsynchronousWriteBuffer.h +++ b/src/IO/AsynchronousWriteBuffer.h @@ -1,10 +1,8 @@ #pragma once -#include - #include - #include +#include #include @@ -53,18 +51,14 @@ public: ~AsynchronousWriteBuffer() override { - try - { - if (started) - pool.wait(); + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; - swapBuffers(); - out.next(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + if (started) + pool.wait(); + + swapBuffers(); + out.next(); } /// That is executed in a separate thread diff --git a/src/IO/BrotliWriteBuffer.cpp b/src/IO/BrotliWriteBuffer.cpp index e4e3713d379..d14c94ca43d 100644 --- a/src/IO/BrotliWriteBuffer.cpp +++ b/src/IO/BrotliWriteBuffer.cpp @@ -6,6 +6,8 @@ # include # include +#include + namespace DB { @@ -47,14 +49,9 @@ BrotliWriteBuffer::BrotliWriteBuffer(std::unique_ptr out_, int comp BrotliWriteBuffer::~BrotliWriteBuffer() { - try - { - finish(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + finish(); } void BrotliWriteBuffer::nextImpl() diff --git a/src/IO/BufferWithOwnMemory.h b/src/IO/BufferWithOwnMemory.h index 782eea84ed7..f8cc8b7febb 100644 --- a/src/IO/BufferWithOwnMemory.h +++ b/src/IO/BufferWithOwnMemory.h @@ -35,10 +35,10 @@ struct Memory : boost::noncopyable, Allocator char * m_data = nullptr; size_t alignment = 0; - Memory() {} + Memory() = default; /// If alignment != 0, then allocate memory aligned to specified value. - Memory(size_t size_, size_t alignment_ = 0) : m_capacity(size_), m_size(m_capacity), alignment(alignment_) + explicit Memory(size_t size_, size_t alignment_ = 0) : m_capacity(size_), m_size(m_capacity), alignment(alignment_) { alloc(); } @@ -140,7 +140,7 @@ protected: Memory<> memory; public: /// If non-nullptr 'existing_memory' is passed, then buffer will not create its own memory and will use existing_memory without ownership. - BufferWithOwnMemory(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) + explicit BufferWithOwnMemory(size_t size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0) : Base(nullptr, 0), memory(existing_memory ? 0 : size, alignment) { Base::set(existing_memory ? existing_memory : memory.data(), size); diff --git a/src/IO/HexWriteBuffer.cpp b/src/IO/HexWriteBuffer.cpp index d7b8a993ce5..4e3403ba74b 100644 --- a/src/IO/HexWriteBuffer.cpp +++ b/src/IO/HexWriteBuffer.cpp @@ -1,6 +1,6 @@ #include #include -#include +#include #include @@ -22,14 +22,9 @@ void HexWriteBuffer::nextImpl() HexWriteBuffer::~HexWriteBuffer() { - try - { - nextImpl(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + nextImpl(); } } diff --git a/src/IO/IReadableWriteBuffer.h b/src/IO/IReadableWriteBuffer.h index a02dd4e23cb..539825e3a85 100644 --- a/src/IO/IReadableWriteBuffer.h +++ b/src/IO/IReadableWriteBuffer.h @@ -17,7 +17,7 @@ struct IReadableWriteBuffer return getReadBufferImpl(); } - virtual ~IReadableWriteBuffer() {} + virtual ~IReadableWriteBuffer() = default; protected: diff --git a/src/IO/LZMADeflatingWriteBuffer.cpp b/src/IO/LZMADeflatingWriteBuffer.cpp index e3051f1de65..5803bc1e9f1 100644 --- a/src/IO/LZMADeflatingWriteBuffer.cpp +++ b/src/IO/LZMADeflatingWriteBuffer.cpp @@ -1,4 +1,5 @@ #include +#include #if !defined(ARCADIA_BUILD) @@ -48,16 +49,11 @@ LZMADeflatingWriteBuffer::LZMADeflatingWriteBuffer( LZMADeflatingWriteBuffer::~LZMADeflatingWriteBuffer() { - try - { - finish(); + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; - lzma_end(&lstr); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + finish(); + lzma_end(&lstr); } void LZMADeflatingWriteBuffer::nextImpl() diff --git a/src/IO/ReadBufferFromFile.h b/src/IO/ReadBufferFromFile.h index cebda605b21..33365bc7ceb 100644 --- a/src/IO/ReadBufferFromFile.h +++ b/src/IO/ReadBufferFromFile.h @@ -25,11 +25,11 @@ protected: CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForRead}; public: - ReadBufferFromFile(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, + explicit ReadBufferFromFile(const std::string & file_name_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, int flags = -1, char * existing_memory = nullptr, size_t alignment = 0); /// Use pre-opened file descriptor. - ReadBufferFromFile( + explicit ReadBufferFromFile( int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object. const std::string & original_file_name = {}, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index 5a159defe06..baa12297718 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1104,9 +1104,9 @@ void saveUpToPosition(ReadBuffer & in, DB::Memory<> & memory, char * current) assert(current >= in.position()); assert(current <= in.buffer().end()); - const int old_bytes = memory.size(); - const int additional_bytes = current - in.position(); - const int new_bytes = old_bytes + additional_bytes; + const size_t old_bytes = memory.size(); + const size_t additional_bytes = current - in.position(); + const size_t new_bytes = old_bytes + additional_bytes; /// There are no new bytes to add to memory. /// No need to do extra stuff. if (new_bytes == 0) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index fbcd4ed97f1..f9962735ddc 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -13,6 +13,7 @@ # include # include # include +# include # include # include # include @@ -273,56 +274,12 @@ namespace S3 return ret; } - /// This method is not static because it requires ClientFactory to be initialized. - std::shared_ptr ClientFactory::create( // NOLINT - const String & endpoint, - bool is_virtual_hosted_style, - const String & access_key_id, - const String & secret_access_key, - bool use_environment_credentials, - const RemoteHostFilter & remote_host_filter, - unsigned int s3_max_redirects) - { - PocoHTTPClientConfiguration client_configuration(remote_host_filter, s3_max_redirects); - - if (!endpoint.empty()) - client_configuration.endpointOverride = endpoint; - - return create(client_configuration, - is_virtual_hosted_style, - access_key_id, - secret_access_key, - use_environment_credentials); - } - - std::shared_ptr ClientFactory::create( // NOLINT - const PocoHTTPClientConfiguration & cfg_, - bool is_virtual_hosted_style, - const String & access_key_id, - const String & secret_access_key, - bool use_environment_credentials) - { - Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key); - - PocoHTTPClientConfiguration client_configuration = cfg_; - client_configuration.updateSchemeAndRegion(); - - return std::make_shared( - std::make_shared( - client_configuration, - credentials, - use_environment_credentials), // AWS credentials provider. - std::move(client_configuration), // Client configuration. - Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, // Sign policy. 
- is_virtual_hosted_style || client_configuration.endpointOverride.empty() // Use virtual addressing if endpoint is not specified. - ); - } - std::shared_ptr ClientFactory::create( // NOLINT const PocoHTTPClientConfiguration & cfg_, bool is_virtual_hosted_style, const String & access_key_id, const String & secret_access_key, + const String & server_side_encryption_customer_key_base64, HeaderCollection headers, bool use_environment_credentials) { @@ -331,7 +288,28 @@ namespace S3 Aws::Auth::AWSCredentials credentials(access_key_id, secret_access_key); - auto auth_signer = std::make_shared(client_configuration, std::move(credentials), std::move(headers), use_environment_credentials); + if (!server_side_encryption_customer_key_base64.empty()) + { + /// See S3Client::GeneratePresignedUrlWithSSEC(). + + headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_ALGORITHM, + Aws::S3::Model::ServerSideEncryptionMapper::GetNameForServerSideEncryption(Aws::S3::Model::ServerSideEncryption::AES256)}); + + headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY, + server_side_encryption_customer_key_base64}); + + Aws::Utils::ByteBuffer buffer = Aws::Utils::HashingUtils::Base64Decode(server_side_encryption_customer_key_base64); + String str_buffer(reinterpret_cast(buffer.GetUnderlyingData()), buffer.GetLength()); + headers.push_back({Aws::S3::SSEHeaders::SERVER_SIDE_ENCRYPTION_CUSTOMER_KEY_MD5, + Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateMD5(str_buffer))}); + } + + auto auth_signer = std::make_shared( + client_configuration, + std::move(credentials), + std::move(headers), + use_environment_credentials); + return std::make_shared( std::move(auth_signer), std::move(client_configuration), // Client configuration. diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index c367444395d..b071daefee1 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -31,27 +31,12 @@ public: static ClientFactory & instance(); - std::shared_ptr create( - const String & endpoint, - bool is_virtual_hosted_style, - const String & access_key_id, - const String & secret_access_key, - bool use_environment_credentials, - const RemoteHostFilter & remote_host_filter, - unsigned int s3_max_redirects); - - std::shared_ptr create( - const PocoHTTPClientConfiguration & cfg, - bool is_virtual_hosted_style, - const String & access_key_id, - const String & secret_access_key, - bool use_environment_credentials); - std::shared_ptr create( const PocoHTTPClientConfiguration & cfg, bool is_virtual_hosted_style, const String & access_key_id, const String & secret_access_key, + const String & server_side_encryption_customer_key_base64, HeaderCollection headers, bool use_environment_credentials); diff --git a/src/IO/WriteBuffer.h b/src/IO/WriteBuffer.h index d425f813d7b..24529fad8c0 100644 --- a/src/IO/WriteBuffer.h +++ b/src/IO/WriteBuffer.h @@ -95,8 +95,15 @@ public: ++pos; } - virtual void sync() {} - virtual void finalize() {} + virtual void sync() + { + next(); + } + + virtual void finalize() + { + next(); + } private: /** Write the data in the buffer (from the beginning of the buffer to the current position). diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index aeed4862fba..b3a63842326 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -77,14 +78,10 @@ WriteBufferFromFile::~WriteBufferFromFile() if (fd < 0) return; - try - { - next(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + + next(); ::close(fd); } diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index a59ae20c588..bfd874ee396 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -90,17 +91,15 @@ WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor() { - try + if (fd < 0) { - if (fd >= 0) - next(); - else - assert(!offset() && "attempt to write after close"); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); + assert(!offset() && "attempt to write after close"); + return; } + + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + next(); } diff --git a/src/IO/WriteBufferFromHTTPServerResponse.cpp b/src/IO/WriteBufferFromHTTPServerResponse.cpp index 0f30f1352e3..ac2eeac1652 100644 --- a/src/IO/WriteBufferFromHTTPServerResponse.cpp +++ b/src/IO/WriteBufferFromHTTPServerResponse.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) # include @@ -187,14 +188,14 @@ void WriteBufferFromHTTPServerResponse::onProgress(const Progress & progress) void WriteBufferFromHTTPServerResponse::finalize() { - if (offset()) + next(); + if (out) { - next(); - - if (out) - out.reset(); + out->next(); + out.reset(); } - else + + if (!offset()) { /// If no remaining data, just send headers. std::lock_guard lock(mutex); @@ -206,14 +207,9 @@ void WriteBufferFromHTTPServerResponse::finalize() WriteBufferFromHTTPServerResponse::~WriteBufferFromHTTPServerResponse() { - try - { - finalize(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + finalize(); } } diff --git a/src/IO/WriteBufferFromOStream.cpp b/src/IO/WriteBufferFromOStream.cpp index 2c45a21a0a3..cf731934c93 100644 --- a/src/IO/WriteBufferFromOStream.cpp +++ b/src/IO/WriteBufferFromOStream.cpp @@ -1,5 +1,5 @@ #include -#include +#include namespace DB @@ -42,14 +42,9 @@ WriteBufferFromOStream::WriteBufferFromOStream( WriteBufferFromOStream::~WriteBufferFromOStream() { - try - { - next(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + next(); } } diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp index c05dc11e330..284fa5dbd97 100644 --- a/src/IO/WriteBufferFromPocoSocket.cpp +++ b/src/IO/WriteBufferFromPocoSocket.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace ProfileEvents @@ -70,14 +71,9 @@ WriteBufferFromPocoSocket::WriteBufferFromPocoSocket(Poco::Net::Socket & socket_ WriteBufferFromPocoSocket::~WriteBufferFromPocoSocket() { - try - { - next(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + next(); } } diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 09aabb1b21d..a6ec60b295f 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -4,6 +4,7 @@ # include # include +# include # include # include @@ -78,6 +79,8 @@ void WriteBufferFromS3::nextImpl() void WriteBufferFromS3::finalize() { + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; finalizeImpl(); } @@ -104,14 +107,7 @@ void WriteBufferFromS3::finalizeImpl() WriteBufferFromS3::~WriteBufferFromS3() { - try - { - finalizeImpl(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + finalizeImpl(); } void WriteBufferFromS3::createMultipartUpload() diff --git a/src/IO/WriteBufferFromVector.h b/src/IO/WriteBufferFromVector.h index 2a9810f3461..1dcf2c3f327 100644 --- a/src/IO/WriteBufferFromVector.h +++ b/src/IO/WriteBufferFromVector.h @@ -3,6 +3,7 @@ #include #include +#include namespace DB @@ -93,14 +94,9 @@ public: ~WriteBufferFromVector() override { - try - { - finalize(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + finalize(); } }; diff --git a/src/IO/WriteBufferValidUTF8.cpp b/src/IO/WriteBufferValidUTF8.cpp index f1f04e9805b..1071ac1078d 100644 --- a/src/IO/WriteBufferValidUTF8.cpp +++ b/src/IO/WriteBufferValidUTF8.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #ifdef __SSE2__ @@ -136,14 +137,9 @@ void WriteBufferValidUTF8::finish() WriteBufferValidUTF8::~WriteBufferValidUTF8() { - try - { - finish(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + finish(); } } diff --git a/src/IO/ZlibDeflatingWriteBuffer.cpp b/src/IO/ZlibDeflatingWriteBuffer.cpp index 8efe96877e4..4b838ac6d0a 100644 --- a/src/IO/ZlibDeflatingWriteBuffer.cpp +++ b/src/IO/ZlibDeflatingWriteBuffer.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include namespace DB @@ -46,16 +48,21 @@ ZlibDeflatingWriteBuffer::ZlibDeflatingWriteBuffer( ZlibDeflatingWriteBuffer::~ZlibDeflatingWriteBuffer() { + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + + finish(); + try { - finish(); - int rc = deflateEnd(&zstr); if (rc != Z_OK) throw Exception(std::string("deflateEnd failed: ") + zError(rc), ErrorCodes::ZLIB_DEFLATE_FAILED); } catch (...) { + /// It is OK not to terminate under an error from deflateEnd() + /// since all data already written to the stream. tryLogCurrentException(__PRETTY_FUNCTION__); } } diff --git a/src/IO/ZstdDeflatingWriteBuffer.cpp b/src/IO/ZstdDeflatingWriteBuffer.cpp index df28820e382..9b79d5ae513 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingWriteBuffer.cpp @@ -1,4 +1,6 @@ #include +#include +#include namespace DB { @@ -28,14 +30,22 @@ ZstdDeflatingWriteBuffer::ZstdDeflatingWriteBuffer( ZstdDeflatingWriteBuffer::~ZstdDeflatingWriteBuffer() { + /// FIXME move final flush into the caller + MemoryTracker::LockExceptionInThread lock; + + finish(); + try { - finish(); - - ZSTD_freeCCtx(cctx); + int err = ZSTD_freeCCtx(cctx); + /// This is just in case, since it is impossible to get an error by using this wrapper. 
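All of the WriteBufferFrom* destructor hunks in this patch apply the same pattern: the old try/catch around the final flush, which only logged errors via tryLogCurrentException(), is replaced by running the flush under MemoryTracker::LockExceptionInThread, a scoped guard that keeps memory-limit checks from throwing inside a destructor, so that real I/O errors now fail loudly instead of being silently logged. A minimal sketch of the pattern, assuming a WriteBuffer subclass (the class name is illustrative and not part of the patch; constructor and nextImpl() are omitted):

    struct ExampleWriteBuffer : WriteBuffer
    {
        ~ExampleWriteBuffer() override
        {
            /// FIXME move final flush into the caller
            MemoryTracker::LockExceptionInThread lock;   // no MEMORY_LIMIT_EXCEEDED from inside a dtor
            next();                                      // flush remaining data; other errors propagate
        }
    };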
+ if (unlikely(err)) + throw Exception(ErrorCodes::ZSTD_ENCODER_FAILED, "ZSTD_freeCCtx failed: error code: {}; zstd version: {}", err, ZSTD_VERSION_STRING); } catch (...) { + /// It is OK not to terminate under an error from ZSTD_freeCCtx() + /// since all data already written to the stream. tryLogCurrentException(__PRETTY_FUNCTION__); } } diff --git a/src/Interpreters/AggregationCommon.h b/src/Interpreters/AggregationCommon.h index 9b0872d3df1..aafec9a7929 100644 --- a/src/Interpreters/AggregationCommon.h +++ b/src/Interpreters/AggregationCommon.h @@ -15,6 +15,10 @@ #include #include +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) +#include +#endif + template <> struct DefaultHash : public StringRefHash {}; @@ -77,12 +81,8 @@ static inline T ALWAYS_INLINE packFixed( const ColumnRawPtrs * low_cardinality_positions [[maybe_unused]] = nullptr, const Sizes * low_cardinality_sizes [[maybe_unused]] = nullptr) { - union - { - T key; - char bytes[sizeof(key)] = {}; - }; - + T key{}; + char * bytes = reinterpret_cast(&key); size_t offset = 0; for (size_t j = 0; j < keys_size; ++j) @@ -259,4 +259,32 @@ static inline StringRef ALWAYS_INLINE serializeKeysToPoolContiguous( } +/** Pack elements with shuffle instruction. + * See the explanation in ColumnsHashing.h + */ +#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) +template +static T inline packFixedShuffle( + const char * __restrict * __restrict srcs, + size_t num_srcs, + const size_t * __restrict elem_sizes, + size_t idx, + const uint8_t * __restrict masks) +{ + __m128i res{}; + + for (size_t i = 0; i < num_srcs; ++i) + { + res = _mm_xor_si128(res, + _mm_shuffle_epi8( + _mm_loadu_si128(reinterpret_cast(srcs[i] + elem_sizes[i] * idx)), + _mm_loadu_si128(reinterpret_cast(&masks[i * sizeof(T)])))); + } + + T out; + __builtin_memcpy(&out, &res, sizeof(T)); + return out; +} +#endif + } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 2a1224b0b48..c5bcc1eb27f 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -365,7 +365,13 @@ struct AggregationMethodKeysFixed template AggregationMethodKeysFixed(const Other & other) : data(other.data) {} - using State = ColumnsHashing::HashMethodKeysFixed; + using State = ColumnsHashing::HashMethodKeysFixed< + typename Data::value_type, + Key, + Mapped, + has_nullable_keys, + has_low_cardinality, + use_cache>; static const bool low_cardinality_optimization = false; diff --git a/src/Interpreters/CollectJoinOnKeysVisitor.cpp b/src/Interpreters/CollectJoinOnKeysVisitor.cpp index 3b3fdaa65cb..a0ea27e9905 100644 --- a/src/Interpreters/CollectJoinOnKeysVisitor.cpp +++ b/src/Interpreters/CollectJoinOnKeysVisitor.cpp @@ -16,6 +16,26 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +namespace +{ + +void addAndTerm(ASTPtr & ast, const ASTPtr & term) +{ + if (!ast) + ast = term; + else + ast = makeASTFunction("and", ast, term); +} + +/// If this is an inner join and the expression related to less than 2 tables, then move it to WHERE +bool canMoveToWhere(std::pair table_numbers, ASTTableJoin::Kind kind) +{ + return kind == ASTTableJoin::Kind::Inner && + (table_numbers.first == table_numbers.second || table_numbers.first == 0 || table_numbers.second == 0); +} + +} + void CollectJoinOnKeysMatcher::Data::addJoinKeys(const ASTPtr & left_ast, const ASTPtr & right_ast, const std::pair & table_no) { @@ -29,7 +49,8 @@ void CollectJoinOnKeysMatcher::Data::addJoinKeys(const ASTPtr & left_ast, const else throw Exception("Cannot detect left and right JOIN keys. 
JOIN ON section is ambiguous.", ErrorCodes::AMBIGUOUS_COLUMN_NAME); - has_some = true; + if (table_no.first != table_no.second && table_no.first > 0 && table_no.second > 0) + has_some = true; } void CollectJoinOnKeysMatcher::Data::addAsofJoinKeys(const ASTPtr & left_ast, const ASTPtr & right_ast, @@ -78,22 +99,45 @@ void CollectJoinOnKeysMatcher::visit(const ASTFunction & func, const ASTPtr & as { ASTPtr left = func.arguments->children.at(0); ASTPtr right = func.arguments->children.at(1); - auto table_numbers = getTableNumbers(ast, left, right, data); - data.addJoinKeys(left, right, table_numbers); - } - else if (inequality != ASOF::Inequality::None) - { - if (!data.is_asof) - throw Exception("JOIN ON inequalities are not supported. Unexpected '" + queryToString(ast) + "'", - ErrorCodes::NOT_IMPLEMENTED); + auto table_numbers = getTableNumbers(left, right, data); + if (canMoveToWhere(table_numbers, data.kind)) + { + addAndTerm(data.new_where_conditions, ast); + } + else + { + if (data.kind == ASTTableJoin::Kind::Inner) + { + addAndTerm(data.new_on_expression, ast); + } + data.addJoinKeys(left, right, table_numbers); + } + } + else if (inequality != ASOF::Inequality::None && !data.is_asof) + { + ASTPtr left = func.arguments->children.at(0); + ASTPtr right = func.arguments->children.at(1); + auto table_numbers = getTableNumbers(left, right, data); + if (canMoveToWhere(table_numbers, data.kind)) + { + addAndTerm(data.new_where_conditions, ast); + } + else + { + throw Exception("JOIN ON inequalities are not supported. Unexpected '" + queryToString(ast) + "'", + ErrorCodes::NOT_IMPLEMENTED); + } + } + else if (inequality != ASOF::Inequality::None && data.is_asof) + { if (data.asof_left_key || data.asof_right_key) throw Exception("ASOF JOIN expects exactly one inequality in ON section. Unexpected '" + queryToString(ast) + "'", - ErrorCodes::INVALID_JOIN_ON_EXPRESSION); + ErrorCodes::INVALID_JOIN_ON_EXPRESSION); ASTPtr left = func.arguments->children.at(0); ASTPtr right = func.arguments->children.at(1); - auto table_numbers = getTableNumbers(ast, left, right, data); + auto table_numbers = getTableNumbers(left, right, data); data.addAsofJoinKeys(left, right, table_numbers, inequality); } @@ -118,7 +162,8 @@ void CollectJoinOnKeysMatcher::getIdentifiers(const ASTPtr & ast, std::vector CollectJoinOnKeysMatcher::getTableNumbers(const ASTPtr & expr, const ASTPtr & left_ast, const ASTPtr & right_ast, + +std::pair CollectJoinOnKeysMatcher::getTableNumbers(const ASTPtr & left_ast, const ASTPtr & right_ast, Data & data) { std::vector left_identifiers; @@ -127,23 +172,13 @@ std::pair CollectJoinOnKeysMatcher::getTableNumbers(const ASTPtr getIdentifiers(left_ast, left_identifiers); getIdentifiers(right_ast, right_identifiers); - if (left_identifiers.empty() || right_identifiers.empty()) - { - throw Exception("Not equi-join ON expression: " + queryToString(expr) + ". 
No columns in one of equality side.", - ErrorCodes::INVALID_JOIN_ON_EXPRESSION); - } + size_t left_idents_table = 0; + size_t right_idents_table = 0; - size_t left_idents_table = getTableForIdentifiers(left_identifiers, data); - size_t right_idents_table = getTableForIdentifiers(right_identifiers, data); - - if (left_idents_table && left_idents_table == right_idents_table) - { - auto left_name = queryToString(*left_identifiers[0]); - auto right_name = queryToString(*right_identifiers[0]); - - throw Exception("In expression " + queryToString(expr) + " columns " + left_name + " and " + right_name - + " are from the same table but from different arguments of equal function", ErrorCodes::INVALID_JOIN_ON_EXPRESSION); - } + if (!left_identifiers.empty()) + left_idents_table = getTableForIdentifiers(left_identifiers, data); + if (!right_identifiers.empty()) + right_idents_table = getTableForIdentifiers(right_identifiers, data); return std::make_pair(left_idents_table, right_idents_table); } diff --git a/src/Interpreters/CollectJoinOnKeysVisitor.h b/src/Interpreters/CollectJoinOnKeysVisitor.h index 54e008a114e..aa2fd80d07c 100644 --- a/src/Interpreters/CollectJoinOnKeysVisitor.h +++ b/src/Interpreters/CollectJoinOnKeysVisitor.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -30,8 +31,11 @@ public: const TableWithColumnNamesAndTypes & right_table; const Aliases & aliases; const bool is_asof{false}; + ASTTableJoin::Kind kind; ASTPtr asof_left_key{}; ASTPtr asof_right_key{}; + ASTPtr new_on_expression{}; + ASTPtr new_where_conditions{}; bool has_some{false}; void addJoinKeys(const ASTPtr & left_ast, const ASTPtr & right_ast, const std::pair & table_no); @@ -57,7 +61,7 @@ private: static void visit(const ASTFunction & func, const ASTPtr & ast, Data & data); static void getIdentifiers(const ASTPtr & ast, std::vector & out); - static std::pair getTableNumbers(const ASTPtr & expr, const ASTPtr & left_ast, const ASTPtr & right_ast, Data & data); + static std::pair getTableNumbers(const ASTPtr & left_ast, const ASTPtr & right_ast, Data & data); static const ASTIdentifier * unrollAliases(const ASTIdentifier * identifier, const Aliases & aliases); static size_t getTableForIdentifiers(std::vector & identifiers, const Data & data); }; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index ca4a313da62..5e8d80adee9 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -305,8 +305,10 @@ struct ContextShared mutable zkutil::ZooKeeperPtr zookeeper; /// Client for ZooKeeper. ConfigurationPtr zookeeper_config; /// Stores zookeeper configs - mutable std::mutex test_keeper_storage_dispatcher_mutex; - mutable std::shared_ptr test_keeper_storage_dispatcher; +#if USE_NURAFT + mutable std::mutex nu_keeper_storage_dispatcher_mutex; + mutable std::shared_ptr nu_keeper_storage_dispatcher; +#endif mutable std::mutex auxiliary_zookeepers_mutex; mutable std::map auxiliary_zookeepers; /// Map for auxiliary ZooKeeper clients. ConfigurationPtr auxiliary_zookeepers_config; /// Stores auxiliary zookeepers configs @@ -331,7 +333,8 @@ struct ContextShared mutable std::optional external_models_loader; String default_profile_name; /// Default profile name used for default values. 
String system_profile_name; /// Profile used by system processes - std::unique_ptr access_control_manager; + String buffer_profile_name; /// Profile used by Buffer engine for flushing to the underlying + AccessControlManager access_control_manager; mutable UncompressedCachePtr uncompressed_cache; /// The cache of decompressed blocks. mutable MarkCachePtr mark_cache; /// Cache of marks in compressed files. ProcessList process_list; /// Executing queries at the moment. @@ -388,8 +391,7 @@ struct ContextShared Context::ConfigReloadCallback config_reload_callback; ContextShared() - : access_control_manager(std::make_unique()) - , macros(std::make_unique()) + : macros(std::make_unique()) { /// TODO: make it singleton (?) static std::atomic num_calls{0}; @@ -435,7 +437,6 @@ struct ContextShared /// Preemptive destruction is important, because these objects may have a refcount to ContextShared (cyclic reference). /// TODO: Get rid of this. - access_control_manager.reset(); system_logs.reset(); embedded_dictionaries.reset(); external_dictionaries_loader.reset(); @@ -450,8 +451,7 @@ struct ContextShared trace_collector.reset(); /// Stop zookeeper connection zookeeper.reset(); - /// Stop test_keeper storage - test_keeper_storage_dispatcher.reset(); + } bool hasTraceCollector() const @@ -642,7 +642,7 @@ void Context::setConfig(const ConfigurationPtr & config) { auto lock = getLock(); shared->config = config; - shared->access_control_manager->setExternalAuthenticatorsConfig(*shared->config); + shared->access_control_manager.setExternalAuthenticatorsConfig(*shared->config); } const Poco::Util::AbstractConfiguration & Context::getConfigRef() const @@ -654,25 +654,25 @@ const Poco::Util::AbstractConfiguration & Context::getConfigRef() const AccessControlManager & Context::getAccessControlManager() { - return *shared->access_control_manager; + return shared->access_control_manager; } const AccessControlManager & Context::getAccessControlManager() const { - return *shared->access_control_manager; + return shared->access_control_manager; } void Context::setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config) { auto lock = getLock(); - shared->access_control_manager->setExternalAuthenticatorsConfig(config); + shared->access_control_manager.setExternalAuthenticatorsConfig(config); } void Context::setUsersConfig(const ConfigurationPtr & config) { auto lock = getLock(); shared->users_config = config; - shared->access_control_manager->setUsersConfig(*shared->users_config); + shared->access_control_manager.setUsersConfig(*shared->users_config); } ConfigurationPtr Context::getUsersConfig() @@ -1299,6 +1299,13 @@ Context & Context::getGlobalContext() return *global_context; } +const Context & Context::getBufferContext() const +{ + if (!buffer_context) + throw Exception("Logical error: there is no buffer context", ErrorCodes::LOGICAL_ERROR); + return *buffer_context; +} + const EmbeddedDictionaries & Context::getEmbeddedDictionaries() const { @@ -1577,15 +1584,48 @@ zkutil::ZooKeeperPtr Context::getZooKeeper() const return shared->zookeeper; } -std::shared_ptr & Context::getTestKeeperStorageDispatcher() const -{ - std::lock_guard lock(shared->test_keeper_storage_dispatcher_mutex); - if (!shared->test_keeper_storage_dispatcher) - shared->test_keeper_storage_dispatcher = std::make_shared(); - return shared->test_keeper_storage_dispatcher; +void Context::initializeNuKeeperStorageDispatcher() const +{ +#if USE_NURAFT + std::lock_guard lock(shared->nu_keeper_storage_dispatcher_mutex); + + if 
(shared->nu_keeper_storage_dispatcher) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to initialize NuKeeper multiple times"); + + const auto & config = getConfigRef(); + if (config.has("test_keeper_server")) + { + shared->nu_keeper_storage_dispatcher = std::make_shared(); + shared->nu_keeper_storage_dispatcher->initialize(config); + } +#endif } +#if USE_NURAFT +std::shared_ptr & Context::getNuKeeperStorageDispatcher() const +{ + std::lock_guard lock(shared->nu_keeper_storage_dispatcher_mutex); + if (!shared->nu_keeper_storage_dispatcher) + throw Exception(ErrorCodes::LOGICAL_ERROR, "NuKeeper must be initialized before requests"); + + return shared->nu_keeper_storage_dispatcher; +} +#endif + +void Context::shutdownNuKeeperStorageDispatcher() const +{ +#if USE_NURAFT + std::lock_guard lock(shared->nu_keeper_storage_dispatcher_mutex); + if (shared->nu_keeper_storage_dispatcher) + { + shared->nu_keeper_storage_dispatcher->shutdown(); + shared->nu_keeper_storage_dispatcher.reset(); + } +#endif +} + + zkutil::ZooKeeperPtr Context::getAuxiliaryZooKeeper(const String & name) const { std::lock_guard lock(shared->auxiliary_zookeepers_mutex); @@ -2221,6 +2261,10 @@ void Context::setDefaultProfiles(const Poco::Util::AbstractConfiguration & confi shared->system_profile_name = config.getString("system_profile", shared->default_profile_name); setProfile(shared->system_profile_name); + + shared->buffer_profile_name = config.getString("buffer_profile", shared->system_profile_name); + buffer_context = std::make_shared(*this); + buffer_context->setProfile(shared->buffer_profile_name); } String Context::getDefaultProfileName() const diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 98ca3909fea..be53a4364e0 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -40,7 +40,6 @@ namespace Poco namespace zkutil { class ZooKeeper; - class TestKeeperStorageDispatcher; } @@ -109,6 +108,7 @@ class StoragePolicySelector; using StoragePolicySelectorPtr = std::shared_ptr; struct PartUUIDs; using PartUUIDsPtr = std::shared_ptr; +class NuKeeperStorageDispatcher; class IOutputFormat; using OutputFormatPtr = std::shared_ptr; @@ -254,6 +254,7 @@ private: Context * query_context = nullptr; Context * session_context = nullptr; /// Session context or nullptr. Could be equal to this. Context * global_context = nullptr; /// Global context. Could be equal to this. + std::shared_ptr buffer_context;/// Buffer context. Could be equal to this. public: // Top-level OpenTelemetry trace context for the query. Makes sense only for @@ -542,6 +543,8 @@ public: Context & getGlobalContext(); bool hasGlobalContext() const { return global_context != nullptr; } + const Context & getBufferContext() const; + void setQueryContext(Context & context_) { query_context = &context_; } void setSessionContext(Context & context_) { session_context = &context_; } @@ -578,8 +581,11 @@ public: /// Same as above but return a zookeeper connection from auxiliary_zookeepers configuration entry. std::shared_ptr getAuxiliaryZooKeeper(const String & name) const; - - std::shared_ptr & getTestKeeperStorageDispatcher() const; +#if USE_NURAFT + std::shared_ptr & getNuKeeperStorageDispatcher() const; +#endif + void initializeNuKeeperStorageDispatcher() const; + void shutdownNuKeeperStorageDispatcher() const; /// Set auxiliary zookeepers configuration at server starting or configuration reloading. 
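Taken together, the Context changes replace the lazily created TestKeeper dispatcher with an explicitly managed NuKeeper one: the server initializes it once at startup, uses it while serving requests, and shuts it down explicitly; getNuKeeperStorageDispatcher() now throws instead of creating the dispatcher on demand. A rough sketch of the intended call order on the server side (global_context is an assumed Context reference, not code from this patch):

    global_context.initializeNuKeeperStorageDispatcher();  // creates the dispatcher only if
                                                           // test_keeper_server is configured
    #if USE_NURAFT
    auto & dispatcher = global_context.getNuKeeperStorageDispatcher();  // throws if not initialized
    #endif
    /// ... serve requests ...
    global_context.shutdownNuKeeperStorageDispatcher();    // safe to call even if never created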
void reloadAuxiliaryZooKeepersConfigIfChanged(const ConfigurationPtr & config); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 05370a6a3b7..fc460a5584c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -652,15 +652,10 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) { recoverZooKeeper(); } - else if (e.code == Coordination::Error::ZNONODE) - { - LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - // TODO: retry? - } else { LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); - return; + throw; } } catch (...) @@ -695,25 +690,44 @@ void DDLWorker::processTask(DDLTask & task) LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); - String dummy; String active_node_path = task.entry_path + "/active/" + task.host_id_str; String finished_node_path = task.entry_path + "/finished/" + task.host_id_str; - auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); + /// It will tryRemove(...) on exception + auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper); - if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS) + /// Try fast path + auto create_active_res = zookeeper->tryCreate(active_node_path, {}, zkutil::CreateMode::Ephemeral); + if (create_active_res != Coordination::Error::ZOK) { - // Ok + if (create_active_res != Coordination::Error::ZNONODE && create_active_res != Coordination::Error::ZNODEEXISTS) + { + assert(Coordination::isHardwareError(create_active_res)); + throw Coordination::Exception(create_active_res, active_node_path); + } + + /// Status dirs were not created in enqueueQuery(...) or someone is removing entry + if (create_active_res == Coordination::Error::ZNONODE) + createStatusDirs(task.entry_path, zookeeper); + + if (create_active_res == Coordination::Error::ZNODEEXISTS) + { + /// Connection has been lost and now we are retrying to write query status, + /// but our previous ephemeral node still exists. 
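+                /// The previous node is ephemeral, so it will disappear on its own once the
+                /// old session expires; the bounded wait below only guards against the node
+                /// actually being owned by some other host.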
+                assert(task.was_executed);
+                zkutil::EventPtr eph_node_disappeared = std::make_shared<Poco::Event>();
+                String dummy;
+                if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared))
+                {
+                    constexpr int timeout_ms = 5000;
+                    if (!eph_node_disappeared->tryWait(timeout_ms))
+                        throw Exception(ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists, "
+                                        "probably it's owned by someone else", active_node_path);
+                }
+            }
+
+            zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral);
         }
-        else if (code == Coordination::Error::ZNONODE)
-        {
-            /// There is no parent
-            createStatusDirs(task.entry_path, zookeeper);
-            if (Coordination::Error::ZOK != zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy))
-                throw Coordination::Exception(code, active_node_path);
-        }
-        else
-            throw Coordination::Exception(code, active_node_path);

     if (!task.was_executed)
     {
@@ -969,7 +983,6 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo
             String node_name = *it;
             String node_path = fs::path(queue_dir) / node_name;
-            String lock_path = fs::path(node_path) / "lock";

             Coordination::Stat stat;
             String dummy;
@@ -991,19 +1004,14 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo
             if (!node_lifetime_is_expired && !node_is_outside_max_window)
                 continue;

-            /// Skip if there are active nodes (it is weak guard)
-            if (zookeeper->exists(fs::path(node_path) / "active", &stat) && stat.numChildren > 0)
+            /// First we remove the entry/active node, to prevent stale hosts from executing the entry concurrently
+            auto rm_active_res = zookeeper->tryRemove(fs::path(node_path) / "active");
+            if (rm_active_res != Coordination::Error::ZOK && rm_active_res != Coordination::Error::ZNONODE)
             {
-                LOG_INFO(log, "Task {} should be deleted, but there are active workers. Skipping it.", node_name);
-                continue;
-            }
-
-            /// Usage of the lock is not necessary now (tryRemoveRecursive correctly removes node in a presence of concurrent cleaners)
-            /// But the lock will be required to implement system.distributed_ddl_queue table
-            auto lock = createSimpleZooKeeperLock(zookeeper, node_path, "lock", host_fqdn_id);
-            if (!lock->tryLock())
-            {
-                LOG_INFO(log, "Task {} should be deleted, but it is locked. Skipping it.", node_name);
+                if (rm_active_res == Coordination::Error::ZNOTEMPTY)
+                    LOG_DEBUG(log, "Task {} should be deleted, but there are active workers. Skipping it.", node_name);
+                else
+                    LOG_WARNING(log, "Unexpected status code {} on attempt to remove {}/active", rm_active_res, node_name);
                 continue;
             }

@@ -1012,21 +1020,33 @@
             else if (node_is_outside_max_window)
                 LOG_INFO(log, "Task {} is outdated, deleting it", node_name);

-            /// Deleting
-            {
-                Strings children = zookeeper->getChildren(node_path);
-                for (const String & child : children)
-                {
-                    if (child != "lock")
-                        zookeeper->tryRemoveRecursive(fs::path(node_path) / child);
-                }
+            /// We recursively delete all nodes except node_path/finished, to prevent stale hosts from
+            /// creating the node_path/active node (see createStatusDirs(...))
+            zookeeper->tryRemoveChildrenRecursive(node_path, "finished");

-                /// Remove the lock node and its parent atomically
-                Coordination::Requests ops;
-                ops.emplace_back(zkutil::makeRemoveRequest(lock_path, -1));
-                ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1));
-                zookeeper->multi(ops);
+            /// And then we remove node_path and node_path/finished in a single transaction
+            Coordination::Requests ops;
+            Coordination::Responses res;
+            ops.emplace_back(zkutil::makeCheckRequest(node_path, -1));  /// See a comment below
+            ops.emplace_back(zkutil::makeRemoveRequest(fs::path(node_path) / "finished", -1));
+            ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1));
+            auto rm_entry_res = zookeeper->tryMulti(ops, res);
+            if (rm_entry_res == Coordination::Error::ZNONODE)
+            {
+                /// Most likely both node_path/finished and node_path were removed concurrently.
+                bool entry_removed_concurrently = res[0]->error == Coordination::Error::ZNONODE;
+                if (entry_removed_concurrently)
+                    continue;
+
+                /// Possible rare case: the initiator node lost its connection after enqueueing the entry and failed to create the status dirs.
+                /// No one has started to process the entry, so the node_path/active and node_path/finished nodes were never created and node_path has no children.
+                /// The entry became outdated, but we cannot remove it in a transaction together with node_path/finished.
+                assert(res[0]->error == Coordination::Error::ZOK && res[1]->error == Coordination::Error::ZNONODE);
+                rm_entry_res = zookeeper->tryRemove(node_path);
+                assert(rm_entry_res != Coordination::Error::ZNOTEMPTY);
+                continue;
             }
+            zkutil::KeeperMultiException::check(rm_entry_res, ops, res);
         }
         catch (...)
{ @@ -1040,21 +1060,32 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) { Coordination::Requests ops; - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "active"; - ops.emplace_back(std::make_shared(std::move(request))); - } - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "finished"; - ops.emplace_back(std::make_shared(std::move(request))); - } + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "active", {}, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "finished", {}, zkutil::CreateMode::Persistent)); + Coordination::Responses responses; Coordination::Error code = zookeeper->tryMulti(ops, responses); - if (code != Coordination::Error::ZOK - && code != Coordination::Error::ZNODEEXISTS) - throw Coordination::Exception(code); + + bool both_created = code == Coordination::Error::ZOK; + + /// Failed on attempt to create node_path/active because it exists, so node_path/finished must exist too + bool both_already_exists = responses.size() == 2 && responses[0]->error == Coordination::Error::ZNODEEXISTS + && responses[1]->error == Coordination::Error::ZRUNTIMEINCONSISTENCY; + assert(!both_already_exists || (zookeeper->exists(fs::path(node_path) / "active") && zookeeper->exists(fs::path(node_path) / "finished"))); + + /// Failed on attempt to create node_path/finished, but node_path/active does not exist + bool is_currently_deleting = responses.size() == 2 && responses[0]->error == Coordination::Error::ZOK + && responses[1]->error == Coordination::Error::ZNODEEXISTS; + if (both_created || both_already_exists) + return; + + if (is_currently_deleting) + throw Exception(ErrorCodes::UNFINISHED, "Cannot create status dirs for {}, " + "most likely because someone is deleting it concurrently", node_path); + + /// Connection lost or entry was removed + assert(Coordination::isHardwareError(code) || code == Coordination::Error::ZNONODE); + zkutil::KeeperMultiException::check(code, ops, responses); } @@ -1114,7 +1145,7 @@ void DDLWorker::runMainThread() if (!Coordination::isHardwareError(e.code)) { /// A logical error. - LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.",getCurrentExceptionMessage(true)); + LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.", getCurrentExceptionMessage(true)); reset_state(false); assert(false); /// Catch such failures in tests with debug build } diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 3f65a6f3f58..660718549b3 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -89,8 +89,7 @@ bool allowEarlyConstantFolding(const ActionsDAG & actions, const Settings & sett { if (node.type == ActionsDAG::ActionType::FUNCTION && node.function_base) { - auto name = node.function_base->getName(); - if (name == "ignore") + if (!node.function_base->isSuitableForConstantFolding()) return false; } } @@ -540,7 +539,10 @@ void ExpressionAnalyzer::makeWindowDescriptions(ActionsDAGPtr actions) !context.getSettingsRef().allow_experimental_window_functions) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Window functions are not implemented (while processing '{}')", + "The support for window functions is experimental and will change" + " in backwards-incompatible ways in the future releases. 
Set" + " allow_experimental_window_functions = 1 to enable it." + " While processing '{}'", syntax->window_function_asts[0]->formatForErrorMessage()); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index f78ca478fb8..9f97160f77f 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -784,9 +784,22 @@ static bool hasWithTotalsInAnySubqueryInFromClause(const ASTSelectQuery & query) { if (const auto * ast_union = query_table->as()) { + /// NOTE: Child of subquery can be ASTSelectWithUnionQuery or ASTSelectQuery, + /// and after normalization, the height of the AST tree is at most 2 for (const auto & elem : ast_union->list_of_selects->children) - if (hasWithTotalsInAnySubqueryInFromClause(elem->as())) - return true; + { + if (const auto * child_union = elem->as()) + { + for (const auto & child_elem : child_union->list_of_selects->children) + if (hasWithTotalsInAnySubqueryInFromClause(child_elem->as())) + return true; + } + else + { + if (hasWithTotalsInAnySubqueryInFromClause(elem->as())) + return true; + } + } } } @@ -1847,26 +1860,36 @@ static bool windowDescriptionComparator(const WindowDescription * _left, { return true; } - - if (left[i].column_number < right[i].column_number) - { - return true; - } - - if (left[i].direction < right[i].direction) - { - return true; - } - - if (left[i].nulls_direction < right[i].nulls_direction) - { - return true; - } - - if (left[i] != right[i]) + else if (left[i].column_name > right[i].column_name) { return false; } + else if (left[i].column_number < right[i].column_number) + { + return true; + } + else if (left[i].column_number > right[i].column_number) + { + return false; + } + else if (left[i].direction < right[i].direction) + { + return true; + } + else if (left[i].direction > right[i].direction) + { + return false; + } + else if (left[i].nulls_direction < right[i].nulls_direction) + { + return true; + } + else if (left[i].nulls_direction > right[i].nulls_direction) + { + return false; + } + + assert(left[i] == right[i]); } // Note that we check the length last, because we want to put together the diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index e6610df43ff..59fcff61936 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -329,7 +329,7 @@ InterpreterSelectWithUnionQuery::buildCurrentChildInterpreter(const ASTPtr & ast InterpreterSelectWithUnionQuery::~InterpreterSelectWithUnionQuery() = default; -Block InterpreterSelectWithUnionQuery::getSampleBlock(const ASTPtr & query_ptr_, const Context & context_) +Block InterpreterSelectWithUnionQuery::getSampleBlock(const ASTPtr & query_ptr_, const Context & context_, bool is_subquery) { auto & cache = context_.getSampleBlockCache(); /// Using query string because query_ptr changes for every internal SELECT @@ -339,7 +339,11 @@ Block InterpreterSelectWithUnionQuery::getSampleBlock(const ASTPtr & query_ptr_, return cache[key]; } - return cache[key] = InterpreterSelectWithUnionQuery(query_ptr_, context_, SelectQueryOptions().analyze()).getSampleBlock(); + if (is_subquery) + return cache[key] + = InterpreterSelectWithUnionQuery(query_ptr_, context_, SelectQueryOptions().subquery().analyze()).getSampleBlock(); + else + return cache[key] = InterpreterSelectWithUnionQuery(query_ptr_, context_, SelectQueryOptions().analyze()).getSampleBlock(); } diff --git 
a/src/Interpreters/InterpreterSelectWithUnionQuery.h b/src/Interpreters/InterpreterSelectWithUnionQuery.h
index cd089a51970..f4062b2005e 100644
--- a/src/Interpreters/InterpreterSelectWithUnionQuery.h
+++ b/src/Interpreters/InterpreterSelectWithUnionQuery.h
@@ -35,7 +35,8 @@ public:

     static Block getSampleBlock(
         const ASTPtr & query_ptr_,
-        const Context & context_);
+        const Context & context_,
+        bool is_subquery = false);

     virtual void ignoreWithTotals() override;

diff --git a/src/Interpreters/InterpreterShowProcesslistQuery.h b/src/Interpreters/InterpreterShowProcesslistQuery.h
index 6b87fd7edc3..fa0bbf075bd 100644
--- a/src/Interpreters/InterpreterShowProcesslistQuery.h
+++ b/src/Interpreters/InterpreterShowProcesslistQuery.h
@@ -20,6 +20,11 @@ public:

     BlockIO execute() override;

+    /// We ignore the quota and limits here because execute() will rewrite a show query as a SELECT query and then
+    /// the SELECT query will check the quota and limits.
+    bool ignoreQuota() const override { return true; }
+    bool ignoreLimits() const override { return true; }
+
 private:
     ASTPtr query_ptr;
     Context & context;
diff --git a/src/Interpreters/InterpreterShowTablesQuery.h b/src/Interpreters/InterpreterShowTablesQuery.h
index fc5cb2b7505..4f720e68622 100644
--- a/src/Interpreters/InterpreterShowTablesQuery.h
+++ b/src/Interpreters/InterpreterShowTablesQuery.h
@@ -20,6 +20,11 @@ public:

     BlockIO execute() override;

+    /// We ignore the quota and limits here because execute() will rewrite a show query as a SELECT query and then
+    /// the SELECT query will check the quota and limits.
+    bool ignoreQuota() const override { return true; }
+    bool ignoreLimits() const override { return true; }
+
 private:
     ASTPtr query_ptr;
     Context & context;
diff --git a/src/Interpreters/InterpreterSystemQuery.h b/src/Interpreters/InterpreterSystemQuery.h
index 6fd96c15a2e..6fa0a432191 100644
--- a/src/Interpreters/InterpreterSystemQuery.h
+++ b/src/Interpreters/InterpreterSystemQuery.h
@@ -37,9 +37,6 @@ public:

     BlockIO execute() override;

-    bool ignoreQuota() const override { return true; }
-    bool ignoreLimits() const override { return true; }
-
 private:
     ASTPtr query_ptr;
     Context & context;
diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp
index 2d3bffa8234..c1777711d9e 100644
--- a/src/Interpreters/TableJoin.cpp
+++ b/src/Interpreters/TableJoin.cpp
@@ -230,8 +230,16 @@ void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column)
 void TableJoin::addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const
 {
     for (auto & col : columns)
+    {
         if (leftBecomeNullable(col.type))
-            col.type = makeNullable(col.type);
+        {
+            /// No need to nullify constants
+            if (!(col.column && isColumnConst(*col.column)))
+            {
+                col.type = makeNullable(col.type);
+            }
+        }
+    }

     for (const auto & col : columns_added_by_join)
     {
diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp
index c4fc40785f6..fd87d86bf97 100644
--- a/src/Interpreters/TreeRewriter.cpp
+++ b/src/Interpreters/TreeRewriter.cpp
@@ -404,13 +404,13 @@ void setJoinStrictness(ASTSelectQuery & select_query, JoinStrictness join_defaul

 /// Find the columns that are obtained by JOIN.
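The CollectJoinOnKeysVisitor changes above and the TreeRewriter changes below implement a single transformation: for an INNER JOIN, any conjunct of the JOIN ON section that references at most one of the two tables is moved out of ON and ANDed into WHERE, and single-table inequalities in ON are now accepted for inner joins (by the same move) instead of being rejected. For example (an illustrative query, not from the patch), SELECT * FROM t1 INNER JOIN t2 ON t1.key = t2.key AND t1.x > 0 is rewritten as SELECT * FROM t1 INNER JOIN t2 ON t1.key = t2.key WHERE t1.x > 0.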
void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & select_query, - const TablesWithColumns & tables, const Aliases & aliases) + const TablesWithColumns & tables, const Aliases & aliases, ASTPtr & new_where_conditions) { const ASTTablesInSelectQueryElement * node = select_query.join(); if (!node) return; - const auto & table_join = node->table_join->as(); + auto & table_join = node->table_join->as(); if (table_join.using_expression_list) { @@ -422,16 +422,33 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele { bool is_asof = (table_join.strictness == ASTTableJoin::Strictness::Asof); - CollectJoinOnKeysVisitor::Data data{analyzed_join, tables[0], tables[1], aliases, is_asof}; + CollectJoinOnKeysVisitor::Data data{analyzed_join, tables[0], tables[1], aliases, is_asof, table_join.kind}; CollectJoinOnKeysVisitor(data).visit(table_join.on_expression); if (!data.has_some) throw Exception("Cannot get JOIN keys from JOIN ON section: " + queryToString(table_join.on_expression), ErrorCodes::INVALID_JOIN_ON_EXPRESSION); if (is_asof) + { data.asofToJoinKeys(); + } + else if (data.new_on_expression) + { + table_join.on_expression = data.new_on_expression; + new_where_conditions = data.new_where_conditions; + } } } +/// Move joined key related to only one table to WHERE clause +void moveJoinedKeyToWhere(ASTSelectQuery * select_query, ASTPtr & new_where_conditions) +{ + if (select_query->where()) + select_query->setExpression(ASTSelectQuery::Expression::WHERE, + makeASTFunction("and", new_where_conditions, select_query->where())); + else + select_query->setExpression(ASTSelectQuery::Expression::WHERE, new_where_conditions->clone()); +} + std::vector getAggregates(ASTPtr & query, const ASTSelectQuery & select_query) { @@ -808,7 +825,11 @@ TreeRewriterResultPtr TreeRewriter::analyzeSelect( setJoinStrictness(*select_query, settings.join_default_strictness, settings.any_join_distinct_right_table_keys, result.analyzed_join->table_join); - collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases); + + ASTPtr new_where_condition = nullptr; + collectJoinedColumns(*result.analyzed_join, *select_query, tables_with_columns, result.aliases, new_where_condition); + if (new_where_condition) + moveJoinedKeyToWhere(select_query, new_where_condition); /// rewrite filters for select query, must go after getArrayJoinedColumns if (settings.optimize_respect_aliases && result.metadata_snapshot) diff --git a/src/Interpreters/getTableExpressions.cpp b/src/Interpreters/getTableExpressions.cpp index 766ce257530..a4e971c302c 100644 --- a/src/Interpreters/getTableExpressions.cpp +++ b/src/Interpreters/getTableExpressions.cpp @@ -84,7 +84,7 @@ static NamesAndTypesList getColumnsFromTableExpression( if (table_expression.subquery) { const auto & subquery = table_expression.subquery->children.at(0); - names_and_type_list = InterpreterSelectWithUnionQuery::getSampleBlock(subquery, context).getNamesAndTypesList(); + names_and_type_list = InterpreterSelectWithUnionQuery::getSampleBlock(subquery, context, true).getNamesAndTypesList(); } else if (table_expression.table_function) { diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index e7cd85798b9..3d868812304 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -266,7 +266,7 @@ bool ParserFunction::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) ParserIdentifier id_parser; ParserKeyword 
distinct("DISTINCT");
     ParserKeyword all("ALL");
-    ParserExpressionList contents(false);
+    ParserExpressionList contents(false, is_table_function);
     ParserSelectWithUnionQuery select;

     ParserKeyword over("OVER");
@@ -278,6 +278,12 @@
     ASTPtr expr_list_args;
     ASTPtr expr_list_params;

+    if (is_table_function)
+    {
+        if (ParserTableFunctionView().parse(pos, node, expected))
+            return true;
+    }
+
     if (!id_parser.parse(pos, identifier, expected))
         return false;

@@ -312,36 +318,6 @@
         }
     }

-    if (!has_distinct && !has_all)
-    {
-        auto old_pos = pos;
-        auto maybe_an_subquery = pos->type == TokenType::OpeningRoundBracket;
-
-        if (select.parse(pos, query, expected))
-        {
-            auto & select_ast = query->as<ASTSelectWithUnionQuery &>();
-            if (select_ast.list_of_selects->children.size() == 1 && maybe_an_subquery)
-            {
-                // It's an subquery. Bail out.
-                pos = old_pos;
-            }
-            else
-            {
-                if (pos->type != TokenType::ClosingRoundBracket)
-                    return false;
-                ++pos;
-                auto function_node = std::make_shared<ASTFunction>();
-                tryGetIdentifierNameInto(identifier, function_node->name);
-                auto expr_list_with_single_query = std::make_shared<ASTExpressionList>();
-                expr_list_with_single_query->children.push_back(query);
-                function_node->arguments = expr_list_with_single_query;
-                function_node->children.push_back(function_node->arguments);
-                node = function_node;
-                return true;
-            }
-        }
-    }
-
     const char * contents_begin = pos->begin;
     if (!contents.parse(pos, expr_list_args, expected))
         return false;
@@ -477,6 +453,49 @@
     return true;
 }

+bool ParserTableFunctionView::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
+{
+    ParserIdentifier id_parser;
+    ParserKeyword view("VIEW");
+    ParserSelectWithUnionQuery select;
+
+    ASTPtr identifier;
+    ASTPtr query;
+
+    if (!view.ignore(pos, expected))
+        return false;
+
+    if (pos->type != TokenType::OpeningRoundBracket)
+        return false;
+
+    ++pos;
+
+    bool maybe_an_subquery = pos->type == TokenType::OpeningRoundBracket;
+
+    if (!select.parse(pos, query, expected))
+        return false;
+
+    auto & select_ast = query->as<ASTSelectWithUnionQuery &>();
+    if (select_ast.list_of_selects->children.size() == 1 && maybe_an_subquery)
+    {
+        // It's a subquery. Bail out.
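+        // E.g. in "view((SELECT 1))" the inner "(SELECT 1)" is a parenthesized
+        // subquery rather than the argument of the view() table function, so this
+        // parser must not consume it; only "view(SELECT ...)" is accepted here.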
+        return false;
+    }
+
+    if (pos->type != TokenType::ClosingRoundBracket)
+        return false;
+    ++pos;
+
+    auto function_node = std::make_shared<ASTFunction>();
+    tryGetIdentifierNameInto(identifier, function_node->name);
+    auto expr_list_with_single_query = std::make_shared<ASTExpressionList>();
+    expr_list_with_single_query->children.push_back(query);
+    function_node->name = "view";
+    function_node->arguments = expr_list_with_single_query;
+    function_node->children.push_back(function_node->arguments);
+    node = function_node;
+    return true;
+}
+
 bool ParserWindowReference::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
 {
     ASTFunction * function = dynamic_cast<ASTFunction *>(node.get());
diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h
index ba18fc2cddd..b6194f981fe 100644
--- a/src/Parsers/ExpressionElementParsers.h
+++ b/src/Parsers/ExpressionElementParsers.h
@@ -149,11 +149,25 @@ protected:
 class ParserFunction : public IParserBase
 {
 public:
-    ParserFunction(bool allow_function_parameters_ = true) : allow_function_parameters(allow_function_parameters_) {}
+    ParserFunction(bool allow_function_parameters_ = true, bool is_table_function_ = false)
+        : allow_function_parameters(allow_function_parameters_), is_table_function(is_table_function_)
+    {
+    }
+
 protected:
     const char * getName() const override { return "function"; }
     bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
     bool allow_function_parameters;
+    bool is_table_function;
+};
+
+// A special function parser for the view table function.
+// It parses a SELECT query as its argument and doesn't support getColumnName().
+class ParserTableFunctionView : public IParserBase
+{
+protected:
+    const char * getName() const override { return "function"; }
+    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
 };

 // Window reference (the thing that goes after OVER) for window function.
diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp
index afe85f069c7..e9ad65af471 100644
--- a/src/Parsers/ExpressionListParsers.cpp
+++ b/src/Parsers/ExpressionListParsers.cpp
@@ -468,6 +468,14 @@
 }

+bool ParserTableFunctionExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
+{
+    if (ParserTableFunctionView().parse(pos, node, expected))
+        return true;
+    return elem_parser.parse(pos, node, expected);
+}
+
+
 bool ParserPrefixUnaryOperatorExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
 {
     /// try to find any of the valid operators
@@ -570,9 +578,10 @@
 }

-ParserExpressionWithOptionalAlias::ParserExpressionWithOptionalAlias(bool allow_alias_without_as_keyword)
-    : impl(std::make_unique<ParserWithOptionalAlias>(std::make_unique<ParserExpression>(),
-        allow_alias_without_as_keyword))
+ParserExpressionWithOptionalAlias::ParserExpressionWithOptionalAlias(bool allow_alias_without_as_keyword, bool is_table_function)
+    : impl(std::make_unique<ParserWithOptionalAlias>(
+        is_table_function ? ParserPtr(std::make_unique<ParserTableFunctionExpression>()) : ParserPtr(std::make_unique<ParserExpression>()),
+        allow_alias_without_as_keyword))
 {
 }

@@ -580,7 +589,7 @@
 bool ParserExpressionList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
 {
     return ParserList(
-        std::make_unique<ParserExpressionWithOptionalAlias>(allow_alias_without_as_keyword),
+        std::make_unique<ParserExpressionWithOptionalAlias>(allow_alias_without_as_keyword, is_table_function),
         std::make_unique<ParserToken>(TokenType::Comma))
         .parse(pos, node, expected);
 }
diff --git a/src/Parsers/ExpressionListParsers.h b/src/Parsers/ExpressionListParsers.h
index 90b27950873..2371e006c09 100644
--- a/src/Parsers/ExpressionListParsers.h
+++ b/src/Parsers/ExpressionListParsers.h
@@ -436,13 +436,26 @@
 };

+// It's used to parse expressions in table functions.
+class ParserTableFunctionExpression : public IParserBase
+{
+private:
+    ParserLambdaExpression elem_parser;
+
+protected:
+    const char * getName() const override { return "table function expression"; }
+
+    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
+};
+
+
 using ParserExpression = ParserLambdaExpression;

 class ParserExpressionWithOptionalAlias : public IParserBase
 {
 public:
-    ParserExpressionWithOptionalAlias(bool allow_alias_without_as_keyword);
+    explicit ParserExpressionWithOptionalAlias(bool allow_alias_without_as_keyword, bool is_table_function = false);
 protected:
     ParserPtr impl;

@@ -459,11 +472,12 @@
 class ParserExpressionList : public IParserBase
 {
 public:
-    ParserExpressionList(bool allow_alias_without_as_keyword_)
-        : allow_alias_without_as_keyword(allow_alias_without_as_keyword_) {}
+    explicit ParserExpressionList(bool allow_alias_without_as_keyword_, bool is_table_function_ = false)
+        : allow_alias_without_as_keyword(allow_alias_without_as_keyword_), is_table_function(is_table_function_) {}

 protected:
     bool allow_alias_without_as_keyword;
+    bool is_table_function; // This expression list is used by a table function

     const char * getName() const override { return "list of expressions"; }
     bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
@@ -473,7 +487,7 @@
 class ParserNotEmptyExpressionList : public IParserBase
 {
 public:
-    ParserNotEmptyExpressionList(bool allow_alias_without_as_keyword)
+    explicit ParserNotEmptyExpressionList(bool allow_alias_without_as_keyword)
         : nested_parser(allow_alias_without_as_keyword) {}
 private:
     ParserExpressionList nested_parser;
diff --git a/src/Parsers/ParserTablesInSelectQuery.cpp b/src/Parsers/ParserTablesInSelectQuery.cpp
index 1264acefe64..2e20279dbe1 100644
--- a/src/Parsers/ParserTablesInSelectQuery.cpp
+++ b/src/Parsers/ParserTablesInSelectQuery.cpp
@@ -22,7 +22,7 @@ bool ParserTableExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expec
     auto res = std::make_shared<ASTTableExpression>();

     if (!ParserWithOptionalAlias(std::make_unique<ParserSubquery>(), true).parse(pos, res->subquery, expected)
-        && !ParserWithOptionalAlias(std::make_unique<ParserFunction>(), true).parse(pos, res->table_function, expected)
+        && !ParserWithOptionalAlias(std::make_unique<ParserFunction>(true, true), true).parse(pos, res->table_function, expected)
         && !ParserWithOptionalAlias(std::make_unique<ParserCompoundIdentifier>(false, true), true).parse(pos, res->database_and_table_name, expected))
         return false;

diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
index 8422f09e364..f7f08411dfa 100644
--- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp
@@ -15,6 +15,7 @@ namespace
ErrorCodes { extern const int BAD_ARGUMENTS; extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; } @@ -436,9 +437,11 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB if (quotes) { pos = find_first_symbols<'"'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; - if (*pos == '"') + else if (*pos == '"') { ++pos; if (loadAtPosition(in, memory, pos) && *pos == '"') @@ -450,9 +453,11 @@ static std::pair fileSegmentationEngineCSVImpl(ReadBuffer & in, DB else { pos = find_first_symbols<'"', '\r', '\n'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; - if (*pos == '"') + else if (*pos == '"') { quotes = true; ++pos; diff --git a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp index 6e14a1dc3c8..108f4d9d321 100644 --- a/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/RegexpRowInputFormat.cpp @@ -11,6 +11,7 @@ namespace ErrorCodes { extern const int INCORRECT_DATA; extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } RegexpRowInputFormat::RegexpRowInputFormat( @@ -182,7 +183,9 @@ static std::pair fileSegmentationEngineRegexpImpl(ReadBuffer & in, while (loadAtPosition(in, memory, pos) && need_more_data) { pos = find_first_symbols<'\n', '\r'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; // Support DOS-style newline ("\r\n") diff --git a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp index 69a5e61caf2..96b01a5bd9b 100644 --- a/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TabSeparatedRowInputFormat.cpp @@ -15,6 +15,7 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_DATA; + extern const int LOGICAL_ERROR; } @@ -433,10 +434,11 @@ static std::pair fileSegmentationEngineTabSeparatedImpl(ReadBuffer { pos = find_first_symbols<'\\', '\r', '\n'>(pos, in.buffer().end()); - if (pos == in.buffer().end()) + if (pos > in.buffer().end()) + throw Exception("Position in buffer is out of bounds. There must be a bug.", ErrorCodes::LOGICAL_ERROR); + else if (pos == in.buffer().end()) continue; - - if (*pos == '\\') + else if (*pos == '\\') { ++pos; if (loadAtPosition(in, memory, pos)) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 914289bca2f..0013e0061e2 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1,17 +1,134 @@ #include -#include - +#include #include +#include +#include +#include + namespace DB { namespace ErrorCodes { + extern const int BAD_ARGUMENTS; extern const int NOT_IMPLEMENTED; } +// Interface for true window functions. It's not much of an interface, they just +// accept the guts of WindowTransform and do 'something'. 
Given a small number of +// true window functions, and the fact that the WindowTransform internals are +// pretty much well defined in domain terms (e.g. frame boundaries), this is +// somewhat acceptable. +class IWindowFunction +{ +public: + virtual ~IWindowFunction() = default; + + // Must insert the result for current_row. + virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0; +}; + +// Compares ORDER BY column values at given rows to find the boundaries of frame: +// [compared] with [reference] +/- offset. Return value is -1/0/+1, like in +// sorting predicates -- -1 means [compared] is less than [reference] +/- offset. +template +static int compareValuesWithOffset(const IColumn * _compared_column, + size_t compared_row, const IColumn * _reference_column, + size_t reference_row, + uint64_t _offset, + bool offset_is_preceding) +{ + // Casting the columns to the known type here makes it faster, probably + // because the getData call can be devirtualized. + const auto * compared_column = assert_cast( + _compared_column); + const auto * reference_column = assert_cast( + _reference_column); + const auto offset = static_cast(_offset); + + const auto compared_value_data = compared_column->getDataAt(compared_row); + assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); + auto compared_value = unalignedLoad( + compared_value_data.data); + + const auto reference_value_data = reference_column->getDataAt(reference_row); + assert(reference_value_data.size == sizeof(typename ColumnType::ValueType)); + auto reference_value = unalignedLoad( + reference_value_data.data); + + bool is_overflow; + bool overflow_to_negative; + if (offset_is_preceding) + { + is_overflow = __builtin_sub_overflow(reference_value, offset, + &reference_value); + overflow_to_negative = offset > 0; + } + else + { + is_overflow = __builtin_add_overflow(reference_value, offset, + &reference_value); + overflow_to_negative = offset < 0; + } + +// fmt::print(stderr, +// "compared [{}] = {}, ref [{}] = {}, offset {} preceding {} overflow {} to negative {}\n", +// compared_row, toString(compared_value), +// reference_row, toString(reference_value), +// toString(offset), offset_is_preceding, +// is_overflow, overflow_to_negative); + + if (is_overflow) + { + if (overflow_to_negative) + { + // Overflow to the negative, [compared] must be greater. + return 1; + } + else + { + // Overflow to the positive, [compared] must be less. + return -1; + } + } + else + { + // No overflow, compare normally. + return compared_value < reference_value ? -1 + : compared_value == reference_value ? 0 : 1; + } +} + +// Helper macros to dispatch on type of the ORDER BY column +#define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \ +else if (typeid_cast(column)) \ +{ \ + /* clang-tidy you're dumb, I can't put FUNCTION in braces here. */ \ + compare_values_with_offset = FUNCTION; /* NOLINT */ \ +} + +#define APPLY_FOR_TYPES(FUNCTION) \ +if (false) /* NOLINT */ \ +{ \ + /* Do nothing, a starter condition. 
*/ \
+} \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt8>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt16>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt32>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<UInt64>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int8>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int16>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int32>) \
+APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector<Int64>) \
+else \
+{ \
+    throw Exception(ErrorCodes::NOT_IMPLEMENTED, \
+        "The RANGE OFFSET frame for '{}' ORDER BY column is not implemented", \
+        demangle(typeid(*column).name())); \
+}
+
 WindowTransform::WindowTransform(const Block & input_header_,
     const Block & output_header_,
     const WindowDescription & window_description_,
@@ -26,26 +143,29 @@ WindowTransform::WindowTransform(const Block & input_header_,

     for (const auto & f : functions)
     {
         WindowFunctionWorkspace workspace;
-        workspace.window_function = f;
-
-        const auto & aggregate_function
-            = workspace.window_function.aggregate_function;
+        workspace.aggregate_function = f.aggregate_function;
+        const auto & aggregate_function = workspace.aggregate_function;
         if (!arena && aggregate_function->allocatesMemoryInArena())
         {
             arena = std::make_unique<Arena>();
         }

-        workspace.argument_column_indices.reserve(
-            workspace.window_function.argument_names.size());
-        for (const auto & argument_name : workspace.window_function.argument_names)
+        workspace.argument_column_indices.reserve(f.argument_names.size());
+        for (const auto & argument_name : f.argument_names)
         {
             workspace.argument_column_indices.push_back(
                 input_header.getPositionByName(argument_name));
         }
+        workspace.argument_columns.assign(f.argument_names.size(), nullptr);

-        workspace.aggregate_function_state.reset(aggregate_function->sizeOfData(),
-            aggregate_function->alignOfData());
-        aggregate_function->create(workspace.aggregate_function_state.data());
+        workspace.window_function_impl = aggregate_function->asWindowFunction();
+        if (!workspace.window_function_impl)
+        {
+            workspace.aggregate_function_state.reset(
+                aggregate_function->sizeOfData(),
+                aggregate_function->alignOfData());
+            aggregate_function->create(workspace.aggregate_function_state.data());
+        }

         workspaces.push_back(std::move(workspace));
     }
@@ -63,6 +183,20 @@ WindowTransform::WindowTransform(const Block & input_header_,
         order_by_indices.push_back(
             input_header.getPositionByName(column.column_name));
     }
+
+    // Choose a row comparison function for RANGE OFFSET frame based on the
+    // type of the ORDER BY column.
+    if (window_description.frame.type == WindowFrame::FrameType::Range
+        && (window_description.frame.begin_type
+                == WindowFrame::BoundaryType::Offset
+            || window_description.frame.end_type
+                == WindowFrame::BoundaryType::Offset))
+    {
+        assert(order_by_indices.size() == 1);
+        const IColumn * column = input_header.getByPosition(
+            order_by_indices[0]).column.get();
+        APPLY_FOR_TYPES(compareValuesWithOffset)
+    }
 }

 WindowTransform::~WindowTransform()
@@ -70,8 +204,11 @@ WindowTransform::~WindowTransform()
     // Some states may be not created yet if the creation failed.
     for (auto & ws : workspaces)
     {
-        ws.window_function.aggregate_function->destroy(
-            ws.aggregate_function_state.data());
+        if (!ws.window_function_impl)
+        {
+            ws.aggregate_function->destroy(
+                ws.aggregate_function_state.data());
+        }
     }
 }

@@ -290,85 +427,22 @@ void WindowTransform::advanceFrameStartRowsOffset()
     assert(offset_left >= 0);
 }

-// Compares ORDER BY column values at given rows to find the boundaries of frame:
-// [compared] with [reference] +/- offset. Return value is -1/0/+1, like in
Return value is -1/0/+1, like in -// sorting predicates -- -1 means [compared] is less than [reference] +/- offset. -template -static int compareValuesWithOffset(const ColumnType * compared_column, - size_t compared_row, const ColumnType * reference_column, - size_t reference_row, - typename ColumnType::ValueType offset, - bool offset_is_preceding) -{ - const auto compared_value_data = compared_column->getDataAt(compared_row); - assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); - auto compared_value = unalignedLoad( - compared_value_data.data); - const auto reference_value_data = reference_column->getDataAt(reference_row); - assert(reference_value_data.size == sizeof(typename ColumnType::ValueType)); - auto reference_value = unalignedLoad( - reference_value_data.data); - - bool is_overflow; - bool overflow_to_negative; - if (offset_is_preceding) - { - is_overflow = __builtin_sub_overflow(reference_value, offset, - &reference_value); - overflow_to_negative = offset > 0; - } - else - { - is_overflow = __builtin_add_overflow(reference_value, offset, - &reference_value); - overflow_to_negative = offset < 0; - } - -// fmt::print(stderr, -// "compared [{}] = {}, ref [{}] = {}, offset {} preceding {} overflow {} to negative {}\n", -// compared_row, toString(compared_value), -// reference_row, toString(reference_value), -// toString(offset), offset_is_preceding, -// is_overflow, overflow_to_negative); - - if (is_overflow) - { - if (overflow_to_negative) - { - // Overflow to the negative, [compared] must be greater. - return 1; - } - else - { - // Overflow to the positive, [compared] must be less. - return -1; - } - } - else - { - // No overflow, compare normally. - return compared_value < reference_value ? -1 - : compared_value == reference_value ? 0 : 1; - } -} - -template void WindowTransform::advanceFrameStartRangeOffset() { // See the comment for advanceFrameEndRangeOffset(). const int direction = window_description.order_by[0].direction; const bool preceding = window_description.frame.begin_preceding == (direction > 0); - const auto * reference_column = assert_cast( - inputAt(current_row)[order_by_indices[0]].get()); + const auto * reference_column + = inputAt(current_row)[order_by_indices[0]].get(); for (; frame_start < partition_end; advanceRowNumber(frame_start)) { // The first frame value is [current_row] with offset, so we advance // while [frames_start] < [current_row] with offset. - const auto * compared_column = assert_cast( - inputAt(frame_start)[order_by_indices[0]].get()); - if (compareValuesWithOffset(compared_column, frame_start.row, + const auto * compared_column + = inputAt(frame_start)[order_by_indices[0]].get(); + if (compare_values_with_offset(compared_column, frame_start.row, reference_column, current_row.row, window_description.frame.begin_offset, preceding) @@ -382,43 +456,6 @@ void WindowTransform::advanceFrameStartRangeOffset() frame_started = partition_ended; } -// Helper macros to dispatch on type of the ORDER BY column -#define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \ -else if (typeid_cast(column)) \ -{ \ - /* clang-tidy you're dumb, I can't put FUNCTION in braces here. */ \ - FUNCTION(); /* NOLINT */ \ -} - -#define APPLY_FOR_TYPES(FUNCTION) \ -if (false) /* NOLINT */ \ -{ \ - /* Do nothing, a starter condition. 
*/ \ -} \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -else \ -{ \ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, \ - "The RANGE OFFSET frame for '{}' ORDER BY column is not implemented", \ - demangle(typeid(*column).name())); \ -} - -void WindowTransform::advanceFrameStartRangeOffsetDispatch() -{ - // Dispatch on the type of the ORDER BY column. - assert(order_by_indices.size() == 1); - const IColumn * column = inputAt(current_row)[order_by_indices[0]].get(); - - APPLY_FOR_TYPES(advanceFrameStartRangeOffset) -} - void WindowTransform::advanceFrameStart() { if (frame_started) @@ -451,7 +488,7 @@ void WindowTransform::advanceFrameStart() advanceFrameStartRowsOffset(); break; case WindowFrame::FrameType::Range: - advanceFrameStartRangeOffsetDispatch(); + advanceFrameStartRangeOffset(); break; default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, @@ -631,7 +668,6 @@ void WindowTransform::advanceFrameEndRowsOffset() assert(offset_left >= 0); } -template void WindowTransform::advanceFrameEndRangeOffset() { // PRECEDING/FOLLOWING change direction for DESC order. @@ -639,16 +675,16 @@ void WindowTransform::advanceFrameEndRangeOffset() const int direction = window_description.order_by[0].direction; const bool preceding = window_description.frame.end_preceding == (direction > 0); - const auto * reference_column = assert_cast( - inputAt(current_row)[order_by_indices[0]].get()); + const auto * reference_column + = inputAt(current_row)[order_by_indices[0]].get(); for (; frame_end < partition_end; advanceRowNumber(frame_end)) { // The last frame value is current_row with offset, and we need a // past-the-end pointer, so we advance while // [frame_end] <= [current_row] with offset. - const auto * compared_column = assert_cast( - inputAt(frame_end)[order_by_indices[0]].get()); - if (compareValuesWithOffset(compared_column, frame_end.row, + const auto * compared_column + = inputAt(frame_end)[order_by_indices[0]].get(); + if (compare_values_with_offset(compared_column, frame_end.row, reference_column, current_row.row, window_description.frame.end_offset, preceding) @@ -662,15 +698,6 @@ void WindowTransform::advanceFrameEndRangeOffset() frame_ended = partition_ended; } -void WindowTransform::advanceFrameEndRangeOffsetDispatch() -{ - // Dispatch on the type of the ORDER BY column. - assert(order_by_indices.size() == 1); - const IColumn * column = inputAt(current_row)[order_by_indices[0]].get(); - - APPLY_FOR_TYPES(advanceFrameEndRangeOffset) -} - void WindowTransform::advanceFrameEnd() { // No reason for this function to be called again after it succeeded. @@ -693,7 +720,7 @@ void WindowTransform::advanceFrameEnd() advanceFrameEndRowsOffset(); break; case WindowFrame::FrameType::Range: - advanceFrameEndRangeOffsetDispatch(); + advanceFrameEndRangeOffset(); break; default: throw Exception(ErrorCodes::NOT_IMPLEMENTED, @@ -753,7 +780,13 @@ void WindowTransform::updateAggregationState() for (auto & ws : workspaces) { - const auto * a = ws.window_function.aggregate_function.get(); + if (ws.window_function_impl) + { + // No need to do anything for true window functions. 
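+            // Their result is produced directly by windowInsertResultInto() in writeOutCurrentRow().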
+ continue; + } + + const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); if (reset_aggregation) @@ -763,24 +796,46 @@ void WindowTransform::updateAggregationState() a->create(buf); } - for (auto row = rows_to_add_start; row < rows_to_add_end; - advanceRowNumber(row)) + // To achieve better performance, we will have to loop over blocks and + // rows manually, instead of using advanceRowNumber(). + // For this purpose, the past-the-end block can be different than the + // block of the past-the-end row (it's usually the next block). + const auto past_the_end_block = rows_to_add_end.row == 0 + ? rows_to_add_end.block + : rows_to_add_end.block + 1; + + for (auto block_number = rows_to_add_start.block; + block_number < past_the_end_block; + ++block_number) { - if (row.block != ws.cached_block_number) + auto & block = blockAt(block_number); + + if (ws.cached_block_number != block_number) { - const auto & block - = blocks[row.block - first_block_number]; - ws.argument_columns.clear(); - for (const auto i : ws.argument_column_indices) + for (size_t i = 0; i < ws.argument_column_indices.size(); ++i) { - ws.argument_columns.push_back(block.input_columns[i].get()); + ws.argument_columns[i] = block.input_columns[ + ws.argument_column_indices[i]].get(); } - ws.cached_block_number = row.block; + ws.cached_block_number = block_number; } -// fmt::print(stderr, "(2) add row {}\n", row); + // First and last blocks may be processed partially, and other blocks + // are processed in full. + const auto first_row = block_number == rows_to_add_start.block + ? rows_to_add_start.row : 0; + const auto past_the_end_row = block_number == rows_to_add_end.block + ? rows_to_add_end.row : block.rows; + + // We should add an addBatch analog that can accept a starting offset. + // For now, add the values one by one. auto * columns = ws.argument_columns.data(); - a->add(buf, columns, row.row, arena.get()); + // Removing arena.get() from the loop makes it faster somehow... + auto * arena_ptr = arena.get(); + for (auto row = first_row; row < past_the_end_row; ++row) + { + a->add(buf, columns, row, arena_ptr); + } } } @@ -793,17 +848,24 @@ void WindowTransform::writeOutCurrentRow() assert(current_row < partition_end); assert(current_row.block >= first_block_number); + const auto & block = blockAt(current_row); for (size_t wi = 0; wi < workspaces.size(); ++wi) { auto & ws = workspaces[wi]; - const auto & f = ws.window_function; - const auto * a = f.aggregate_function.get(); - auto * buf = ws.aggregate_function_state.data(); + IColumn * result_column = block.output_columns[wi].get(); - IColumn * result_column = outputAt(current_row)[wi].get(); - // FIXME does it also allocate the result on the arena? - // We'll have to pass it out with blocks then... - a->insertResultInto(buf, *result_column, arena.get()); + if (ws.window_function_impl) + { + ws.window_function_impl->windowInsertResultInto(*result_column, this); + } + else + { + const auto * a = ws.aggregate_function.get(); + auto * buf = ws.aggregate_function_state.data(); + // FIXME does it also allocate the result on the arena? + // We'll have to pass it out with blocks then... + a->insertResultInto(buf, *result_column, arena.get()); + } } } @@ -821,6 +883,10 @@ void WindowTransform::appendChunk(Chunk & chunk) auto & block = blocks.back(); block.input_columns = chunk.detachColumns(); + // Even in case of `count() over ()` we should have a dummy input column. + // Not sure how reliable this is... 
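+    // If this assumption ever breaks, the row count will have to be passed in the Chunk explicitly.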
+ block.rows = block.input_columns[0]->size(); + for (auto & ws : workspaces) { // Aggregate functions can't work with constant columns, so we have to @@ -832,13 +898,10 @@ void WindowTransform::appendChunk(Chunk & chunk) ->convertToFullColumnIfConst(); } - block.output_columns.push_back(ws.window_function.aggregate_function - ->getReturnType()->createColumn()); + block.output_columns.push_back(ws.aggregate_function->getReturnType() + ->createColumn()); + block.output_columns.back()->reserve(block.rows); } - - // Even in case of `count() over ()` we should have a dummy input column. - // Not sure how reliable this is... - block.rows = block.input_columns[0]->size(); } // Start the calculations. First, advance the partition end. @@ -870,6 +933,8 @@ void WindowTransform::appendChunk(Chunk & chunk) if (!arePeers(peer_group_start, current_row)) { peer_group_start = current_row; + peer_group_start_row_number = current_row_number; + ++peer_group_number; } // Advance the frame start. @@ -927,6 +992,7 @@ void WindowTransform::appendChunk(Chunk & chunk) // The peer group start is updated at the beginning of the loop, // because current_row might now be past-the-end. advanceRowNumber(current_row); + ++current_row_number; first_not_ready_row = current_row; frame_ended = false; frame_started = false; @@ -960,7 +1026,10 @@ void WindowTransform::appendChunk(Chunk & chunk) prev_frame_start = partition_start; prev_frame_end = partition_start; assert(current_row == partition_start); + current_row_number = 1; peer_group_start = partition_start; + peer_group_start_row_number = 1; + peer_group_number = 1; // fmt::print(stderr, "reinitialize agg data at start of {}\n", // new_partition_start); @@ -968,8 +1037,12 @@ void WindowTransform::appendChunk(Chunk & chunk) // has started. for (auto & ws : workspaces) { - const auto & f = ws.window_function; - const auto * a = f.aggregate_function.get(); + if (ws.window_function_impl) + { + continue; + } + + const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); a->destroy(buf); @@ -985,8 +1058,12 @@ void WindowTransform::appendChunk(Chunk & chunk) for (auto & ws : workspaces) { - const auto & f = ws.window_function; - const auto * a = f.aggregate_function.get(); + if (ws.window_function_impl) + { + continue; + } + + const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); a->create(buf); @@ -1152,5 +1229,132 @@ void WindowTransform::work() } } +// A basic implementation for a true window function. It pretends to be an +// aggregate function, but refuses to work as such. 
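+// Descendants only need to define getReturnType() and windowInsertResultInto().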
+struct WindowFunction
+    : public IAggregateFunctionHelper<WindowFunction>
+    , public IWindowFunction
+{
+    std::string name;
+
+    WindowFunction(const std::string & name_, const DataTypes & argument_types_,
+        const Array & parameters_)
+        : IAggregateFunctionHelper<WindowFunction>(argument_types_, parameters_)
+        , name(name_)
+    {}
+
+    IWindowFunction * asWindowFunction() override { return this; }
+
+    [[noreturn]] void fail() const
+    {
+        throw Exception(ErrorCodes::BAD_ARGUMENTS,
+            "The function '{}' can only be used as a window function, not as an aggregate function",
+            getName());
+    }
+
+    String getName() const override { return name; }
+    void create(AggregateDataPtr __restrict) const override { fail(); }
+    void destroy(AggregateDataPtr __restrict) const noexcept override {}
+    bool hasTrivialDestructor() const override { return true; }
+    size_t sizeOfData() const override { return 0; }
+    size_t alignOfData() const override { return 1; }
+    void add(AggregateDataPtr __restrict, const IColumn **, size_t, Arena *) const override { fail(); }
+    void merge(AggregateDataPtr __restrict, ConstAggregateDataPtr, Arena *) const override { fail(); }
+    void serialize(ConstAggregateDataPtr __restrict, WriteBuffer &) const override { fail(); }
+    void deserialize(AggregateDataPtr __restrict, ReadBuffer &, Arena *) const override { fail(); }
+    void insertResultInto(AggregateDataPtr __restrict, IColumn &, Arena *) const override { fail(); }
+};
+
+struct WindowFunctionRank final : public WindowFunction
+{
+    WindowFunctionRank(const std::string & name_,
+        const DataTypes & argument_types_, const Array & parameters_)
+        : WindowFunction(name_, argument_types_, parameters_)
+    {}
+
+    DataTypePtr getReturnType() const override
+    { return std::make_shared<DataTypeUInt64>(); }
+
+    void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
+    {
+        assert_cast<ColumnUInt64 &>(to).getData().push_back(
+            transform->peer_group_start_row_number);
+    }
+};
+
+struct WindowFunctionDenseRank final : public WindowFunction
+{
+    WindowFunctionDenseRank(const std::string & name_,
+        const DataTypes & argument_types_, const Array & parameters_)
+        : WindowFunction(name_, argument_types_, parameters_)
+    {}
+
+    DataTypePtr getReturnType() const override
+    { return std::make_shared<DataTypeUInt64>(); }
+
+    void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
+    {
+        assert_cast<ColumnUInt64 &>(to).getData().push_back(
+            transform->peer_group_number);
+    }
+};
+
+struct WindowFunctionRowNumber final : public WindowFunction
+{
+    WindowFunctionRowNumber(const std::string & name_,
+        const DataTypes & argument_types_, const Array & parameters_)
+        : WindowFunction(name_, argument_types_, parameters_)
+    {}
+
+    DataTypePtr getReturnType() const override
+    { return std::make_shared<DataTypeUInt64>(); }
+
+    void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override
+    {
+        assert_cast<ColumnUInt64 &>(to).getData().push_back(
+            transform->current_row_number);
+    }
+};
+
+void registerWindowFunctions(AggregateFunctionFactory & factory)
+{
+    // Why didn't I implement lag/lead yet? Because they are a mess. I imagine
+    // they are from the older generation of window functions, when the concept
+    // of frame was not yet invented, so they ignore the frame and use the
+    // partition instead. This means we have to track a separate frame for
+    // these functions, which would make the window transform completely
+    // impenetrable to the human mind.
We can't just get away with materializing
+    // the whole partition like Postgres does, because using a linear amount
+    // of additional memory is not an option when we have a lot of data. We must
+    // be able to process at least the lag/lead in streaming fashion.
+    // Our best bet is probably rewriting, say `lag(value, offset)` to
+    // `any(value) over (rows between offset preceding and offset preceding)`,
+    // at the query planning stage.
+    // Functions like cume_dist() do require materializing the entire
+    // partition, but it's probably also simpler to implement them by rewriting
+    // to a (rows between unbounded preceding and unbounded following) frame,
+    // instead of adding separate logic for them.
+
+    factory.registerFunction("rank", [](const std::string & name,
+        const DataTypes & argument_types, const Array & parameters)
+    {
+        return std::make_shared<WindowFunctionRank>(name, argument_types,
+            parameters);
+    });
+
+    factory.registerFunction("dense_rank", [](const std::string & name,
+        const DataTypes & argument_types, const Array & parameters)
+    {
+        return std::make_shared<WindowFunctionDenseRank>(name, argument_types,
+            parameters);
+    });
+
+    factory.registerFunction("row_number", [](const std::string & name,
+        const DataTypes & argument_types, const Array & parameters)
+    {
+        return std::make_shared<WindowFunctionRowNumber>(name, argument_types,
+            parameters);
+    });
+}

 }
diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h
index 541d4eb87c8..5001b984e9a 100644
--- a/src/Processors/Transforms/WindowTransform.h
+++ b/src/Processors/Transforms/WindowTransform.h
@@ -19,10 +19,18 @@ class Arena;

 // Runtime data for computing one window function.
 struct WindowFunctionWorkspace
 {
-    WindowFunctionDescription window_function;
-    AlignedBuffer aggregate_function_state;
+    AggregateFunctionPtr aggregate_function;
+
+    // This field is set for pure window functions. When set, we ignore the
+    // aggregate_function above and work through this interface instead.
+    IWindowFunction * window_function_impl = nullptr;
+
     std::vector<size_t> argument_column_indices;

+    // Will not be initialized for a pure window function.
+    AlignedBuffer aggregate_function_state;
+
+    // Argument columns. Be careful, this is a per-block cache.
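+    // It is refreshed in updateAggregationState() whenever cached_block_number changes.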
 std::vector<const IColumn *> argument_columns;
 uint64_t cached_block_number = std::numeric_limits<uint64_t>::max();
@@ -108,8 +116,6 @@ private:
     bool arePeers(const RowNumber & x, const RowNumber & y) const;

     void advanceFrameStartRowsOffset();
-    void advanceFrameStartRangeOffsetDispatch();
-    template <typename ColumnType>
     void advanceFrameStartRangeOffset();
     void advanceFrameStart();
@@ -117,8 +123,6 @@ private:
     void advanceFrameEndCurrentRow();
     void advanceFrameEndUnbounded();
     void advanceFrameEnd();
-    void advanceFrameEndRangeOffsetDispatch();
-    template <typename ColumnType>
     void advanceFrameEndRangeOffset();

     void updateAggregationState();
@@ -134,13 +138,19 @@ private:
     const Columns & inputAt(const RowNumber & x) const
     { return const_cast<WindowTransform *>(this)->inputAt(x); }

-    auto & blockAt(const RowNumber & x)
+    auto & blockAt(const uint64_t block_number)
     {
-        assert(x.block >= first_block_number);
-        assert(x.block - first_block_number < blocks.size());
-        return blocks[x.block - first_block_number];
+        assert(block_number >= first_block_number);
+        assert(block_number - first_block_number < blocks.size());
+        return blocks[block_number - first_block_number];
     }

+    const auto & blockAt(const uint64_t block_number) const
+    { return const_cast<WindowTransform *>(this)->blockAt(block_number); }
+
+    auto & blockAt(const RowNumber & x)
+    { return blockAt(x.block); }
+
     const auto & blockAt(const RowNumber & x) const
     { return const_cast<WindowTransform *>(this)->blockAt(x); }

@@ -280,6 +290,11 @@ public:
     // frames may be earlier.
     RowNumber peer_group_start;

+    // Row and group numbers in partition for calculating rank() and friends.
+    uint64_t current_row_number = 1;
+    uint64_t peer_group_start_row_number = 1;
+    uint64_t peer_group_number = 1;
+
     // The frame is [frame_start, frame_end) if frame_ended && frame_started,
     // and unknown otherwise. Note that when we move to the next row, both the
     // frame_start and the frame_end may jump forward by an unknown amount of
@@ -299,6 +314,18 @@ public:
     // state after we find the new frame.
     RowNumber prev_frame_start;
     RowNumber prev_frame_end;
+
+    // Comparison function for RANGE OFFSET frames. We choose the appropriate
+    // overload once, based on the type of the ORDER BY column. Choosing it for
+    // each row would be slow.
+    int (* compare_values_with_offset) (
+        const IColumn * compared_column, size_t compared_row,
+        const IColumn * reference_column, size_t reference_row,
+        // We can make it a Field later if we need the Decimals. Now we only
+        // have ints and datetime, and the underlying Field type for them is
+        // uint64_t anyway.
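+        // The direction of the offset is carried separately in offset_is_preceding.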
+ uint64_t offset, + bool offset_is_preceding); }; } diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index eb4d6119c6f..e9a77c3b433 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -715,7 +715,6 @@ void HTTPHandler::trySendExceptionToClient(const std::string & s, int exception_ writeChar('\n', *used_output.out_maybe_compressed); used_output.out_maybe_compressed->next(); - used_output.out->next(); used_output.out->finalize(); } } @@ -775,6 +774,9 @@ void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne trySendExceptionToClient(exception_message, exception_code, request, response, used_output); } + + if (used_output.out) + used_output.out->finalize(); } DynamicQueryHandler::DynamicQueryHandler(IServer & server_, const std::string & param_name_) diff --git a/src/Server/TestKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp similarity index 81% rename from src/Server/TestKeeperTCPHandler.cpp rename to src/Server/NuKeeperTCPHandler.cpp index 97999c2b1c1..e855e2c68f7 100644 --- a/src/Server/TestKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -1,4 +1,7 @@ -#include +#include + +#if USE_NURAFT + #include #include #include @@ -22,14 +25,17 @@ #include #endif + namespace DB { + namespace ErrorCodes { extern const int SYSTEM_ERROR; extern const int LOGICAL_ERROR; extern const int UNEXPECTED_PACKET_FROM_CLIENT; + extern const int TIMEOUT_EXCEEDED; } struct PollResult @@ -39,36 +45,6 @@ struct PollResult bool error{false}; }; -/// Queue with mutex. As simple as possible. -class ThreadSafeResponseQueue -{ -private: - mutable std::mutex queue_mutex; - std::queue queue; -public: - void push(const Coordination::ZooKeeperResponsePtr & response) - { - std::lock_guard lock(queue_mutex); - queue.push(response); - } - bool tryPop(Coordination::ZooKeeperResponsePtr & response) - { - std::lock_guard lock(queue_mutex); - if (!queue.empty()) - { - response = queue.front(); - queue.pop(); - return true; - } - return false; - } - size_t size() const - { - std::lock_guard lock(queue_mutex); - return queue.size(); - } -}; - struct SocketInterruptablePollWrapper { int sockfd; @@ -218,45 +194,47 @@ struct SocketInterruptablePollWrapper #endif }; -TestKeeperTCPHandler::TestKeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_) +NuKeeperTCPHandler::NuKeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_) : Poco::Net::TCPServerConnection(socket_) , server(server_) - , log(&Poco::Logger::get("TestKeeperTCPHandler")) + , log(&Poco::Logger::get("NuKeeperTCPHandler")) , global_context(server.context()) - , test_keeper_storage_dispatcher(global_context.getTestKeeperStorageDispatcher()) + , nu_keeper_storage_dispatcher(global_context.getNuKeeperStorageDispatcher()) , operation_timeout(0, global_context.getConfigRef().getUInt("test_keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000) , session_timeout(0, global_context.getConfigRef().getUInt("test_keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000) - , session_id(test_keeper_storage_dispatcher->getSessionID()) , poll_wrapper(std::make_unique(socket_)) , responses(std::make_unique()) { } -void TestKeeperTCPHandler::sendHandshake() +void NuKeeperTCPHandler::sendHandshake(bool has_leader) { Coordination::write(Coordination::SERVER_HANDSHAKE_LENGTH, *out); - Coordination::write(Coordination::ZOOKEEPER_PROTOCOL_VERSION, *out); - 
Coordination::write(Coordination::DEFAULT_SESSION_TIMEOUT_MS, *out); + if (has_leader) + Coordination::write(Coordination::ZOOKEEPER_PROTOCOL_VERSION, *out); + else /// Specially ignore connections if we are not leader, client will throw exception + Coordination::write(42, *out); + + Coordination::write(static_cast(session_timeout.totalMilliseconds()), *out); Coordination::write(session_id, *out); std::array passwd{}; Coordination::write(passwd, *out); out->next(); } -void TestKeeperTCPHandler::run() +void NuKeeperTCPHandler::run() { runImpl(); } -void TestKeeperTCPHandler::receiveHandshake() +Poco::Timespan NuKeeperTCPHandler::receiveHandshake() { int32_t handshake_length; int32_t protocol_version; int64_t last_zxid_seen; - int32_t timeout; + int32_t timeout_ms; int64_t previous_session_id = 0; /// We don't support session restore. So previous session_id is always zero. std::array passwd {}; - Coordination::read(handshake_length, *in); if (handshake_length != Coordination::CLIENT_HANDSHAKE_LENGTH && handshake_length != Coordination::CLIENT_HANDSHAKE_LENGTH_WITH_READONLY) throw Exception("Unexpected handshake length received: " + toString(handshake_length), ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); @@ -271,7 +249,7 @@ void TestKeeperTCPHandler::receiveHandshake() if (last_zxid_seen != 0) throw Exception("Non zero last_zxid_seen is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(timeout, *in); + Coordination::read(timeout_ms, *in); Coordination::read(previous_session_id, *in); if (previous_session_id != 0) @@ -282,10 +260,12 @@ void TestKeeperTCPHandler::receiveHandshake() int8_t readonly; if (handshake_length == Coordination::CLIENT_HANDSHAKE_LENGTH_WITH_READONLY) Coordination::read(readonly, *in); + + return Poco::Timespan(0, timeout_ms * 1000); } -void TestKeeperTCPHandler::runImpl() +void NuKeeperTCPHandler::runImpl() { setThreadName("TstKprHandler"); ThreadStatus thread_status; @@ -307,7 +287,9 @@ void TestKeeperTCPHandler::runImpl() try { - receiveHandshake(); + auto client_timeout = receiveHandshake(); + if (client_timeout != 0) + session_timeout = std::min(client_timeout, session_timeout); } catch (const Exception & e) /// Typical for an incorrect username, password, or address. 
{
@@ -315,7 +297,30 @@
         return;
     }

-    sendHandshake();
+    if (nu_keeper_storage_dispatcher->hasLeader())
+    {
+        try
+        {
+            LOG_INFO(log, "Requesting session ID for the new client");
+            session_id = nu_keeper_storage_dispatcher->getSessionID(session_timeout.totalMilliseconds());
+            LOG_INFO(log, "Received session ID {}", session_id);
+        }
+        catch (const Exception & e)
+        {
+            LOG_WARNING(log, "Cannot receive session ID: {}", e.displayText());
+            sendHandshake(false);
+            return;
+        }
+
+        sendHandshake(true);
+    }
+    else
+    {
+        LOG_WARNING(log, "Ignoring user request, because no alive leader exists");
+        sendHandshake(false);
+        return;
+    }

     auto response_fd = poll_wrapper->getResponseFD();
     auto response_callback = [this, response_fd] (const Coordination::ZooKeeperResponsePtr & response)
     {
@@ -324,7 +329,7 @@
         UInt8 single_byte = 1;
         [[maybe_unused]] int result = write(response_fd, &single_byte, sizeof(single_byte));
     };
-    test_keeper_storage_dispatcher->registerSession(session_id, response_callback);
+    nu_keeper_storage_dispatcher->registerSession(session_id, response_callback);

     session_stopwatch.start();
     bool close_received = false;
@@ -371,12 +376,13 @@
                     LOG_DEBUG(log, "Session #{} successfully closed", session_id);
                     return;
                 }
-
-                if (response->error == Coordination::Error::ZOK)
-                    response->write(*out);
-                else if (response->xid != Coordination::WATCH_XID)
-                    response->write(*out);
-                /// skipping bad response for watch
+                response->write(*out);
+                if (response->error == Coordination::Error::ZSESSIONEXPIRED)
+                {
+                    LOG_DEBUG(log, "Session #{} expired because the server is shutting down or the quorum is not alive", session_id);
+                    nu_keeper_storage_dispatcher->finishSession(session_id);
+                    return;
+                }
                 result.ready_responses_count--;
             }

@@ -386,7 +392,7 @@
             if (session_stopwatch.elapsedMicroseconds() > static_cast<UInt64>(session_timeout.totalMicroseconds()))
             {
                 LOG_DEBUG(log, "Session #{} expired", session_id);
-                finish();
+                nu_keeper_storage_dispatcher->finishSession(session_id);
                 break;
             }
         }
     }
     catch (const Exception & ex)
     {
         LOG_INFO(log, "Got exception processing session #{}: {}", session_id, getExceptionMessage(ex, true));
-        finish();
+        nu_keeper_storage_dispatcher->finishSession(session_id);
     }
 }

-void TestKeeperTCPHandler::finish()
-{
-    Coordination::ZooKeeperRequestPtr request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close);
-    request->xid = close_xid;
-    /// Put close request (so storage will remove all info about session)
-    test_keeper_storage_dispatcher->putRequest(request, session_id);
-    /// We don't need any callbacks because session can be already dead and
-    /// nobody wait for response
-    test_keeper_storage_dispatcher->finishSession(session_id);
-}
-
-std::pair<Coordination::OpNum, Coordination::XID> TestKeeperTCPHandler::receiveRequest()
+std::pair<Coordination::OpNum, Coordination::XID> NuKeeperTCPHandler::receiveRequest()
 {
     int32_t length;
     Coordination::read(length, *in);
@@ -423,8 +418,11 @@
     request->xid = xid;
     request->readImpl(*in);

-    test_keeper_storage_dispatcher->putRequest(request, session_id);
+    if (!nu_keeper_storage_dispatcher->putRequest(request, session_id))
+        throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Session {} already disconnected", session_id);
     return std::make_pair(opnum, xid);
 }

 }

+
+#endif
diff --git a/src/Server/TestKeeperTCPHandler.h b/src/Server/NuKeeperTCPHandler.h
similarity index 65%
rename from
src/Server/TestKeeperTCPHandler.h rename to src/Server/NuKeeperTCPHandler.h index 46b4454b319..03a857ad1d7 100644 --- a/src/Server/TestKeeperTCPHandler.h +++ b/src/Server/NuKeeperTCPHandler.h @@ -1,14 +1,22 @@ #pragma once +#if !defined(ARCADIA_BUILD) +# include +# include "config_core.h" +#endif + +#if USE_NURAFT + #include #include "IServer.h" #include #include #include #include -#include +#include #include #include +#include #include namespace DB @@ -16,22 +24,24 @@ namespace DB struct SocketInterruptablePollWrapper; using SocketInterruptablePollWrapperPtr = std::unique_ptr; -class ThreadSafeResponseQueue; + +using ThreadSafeResponseQueue = ThreadSafeQueue; + using ThreadSafeResponseQueuePtr = std::unique_ptr; -class TestKeeperTCPHandler : public Poco::Net::TCPServerConnection +class NuKeeperTCPHandler : public Poco::Net::TCPServerConnection { public: - TestKeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_); + NuKeeperTCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_); void run() override; private: IServer & server; Poco::Logger * log; Context global_context; - std::shared_ptr test_keeper_storage_dispatcher; + std::shared_ptr nu_keeper_storage_dispatcher; Poco::Timespan operation_timeout; Poco::Timespan session_timeout; - int64_t session_id; + int64_t session_id{-1}; Stopwatch session_stopwatch; SocketInterruptablePollWrapperPtr poll_wrapper; @@ -45,11 +55,11 @@ private: void runImpl(); - void sendHandshake(); - void receiveHandshake(); + void sendHandshake(bool has_leader); + Poco::Timespan receiveHandshake(); std::pair receiveRequest(); - void finish(); }; } +#endif diff --git a/src/Server/TestKeeperTCPHandlerFactory.h b/src/Server/NuKeeperTCPHandlerFactory.h similarity index 68% rename from src/Server/TestKeeperTCPHandlerFactory.h rename to src/Server/NuKeeperTCPHandlerFactory.h index ebf91aa31d4..0fd86ebc21f 100644 --- a/src/Server/TestKeeperTCPHandlerFactory.h +++ b/src/Server/NuKeeperTCPHandlerFactory.h @@ -1,5 +1,6 @@ #pragma once -#include + +#include #include #include #include @@ -8,7 +9,7 @@ namespace DB { -class TestKeeperTCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory +class NuKeeperTCPHandlerFactory : public Poco::Net::TCPServerConnectionFactory { private: IServer & server; @@ -20,9 +21,9 @@ private: void run() override {} }; public: - TestKeeperTCPHandlerFactory(IServer & server_) + NuKeeperTCPHandlerFactory(IServer & server_) : server(server_) - , log(&Poco::Logger::get("TestKeeperTCPHandlerFactory")) + , log(&Poco::Logger::get("NuKeeperTCPHandlerFactory")) { } @@ -30,8 +31,8 @@ public: { try { - LOG_TRACE(log, "Test keeper request. Address: {}", socket.peerAddress().toString()); - return new TestKeeperTCPHandler(server, socket); + LOG_TRACE(log, "NuKeeper request. 
Address: {}", socket.peerAddress().toString()); + return new NuKeeperTCPHandler(server, socket); } catch (const Poco::Net::NetException &) { diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index d66639ef111..c207d188a85 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +57,28 @@ namespace ErrorCodes extern const int SUPPORT_IS_DISABLED; } +TCPHandler::TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, std::string server_display_name_) + : Poco::Net::TCPServerConnection(socket_) + , server(server_) + , parse_proxy_protocol(parse_proxy_protocol_) + , log(&Poco::Logger::get("TCPHandler")) + , connection_context(server.context()) + , query_context(server.context()) + , server_display_name(std::move(server_display_name_)) +{ +} +TCPHandler::~TCPHandler() +{ + try + { + state.reset(); + out->next(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} void TCPHandler::runImpl() { diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 41539bef1e1..ee2f7c96b5a 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include "IServer.h" @@ -113,16 +114,8 @@ public: * Proxy-forwarded (original client) IP address is used for quota accounting if quota is keyed by forwarded IP. */ TCPHandler(IServer & server_, const Poco::Net::StreamSocket & socket_, bool parse_proxy_protocol_, - std::string server_display_name_) - : Poco::Net::TCPServerConnection(socket_) - , server(server_) - , parse_proxy_protocol(parse_proxy_protocol_) - , log(&Poco::Logger::get("TCPHandler")) - , connection_context(server.context()) - , query_context(server.context()) - , server_display_name(std::move(server_display_name_)) - { - } + std::string server_display_name_); + ~TCPHandler() override; void run() override; diff --git a/src/Server/ya.make b/src/Server/ya.make index 1e44577aea9..a0269e9ac84 100644 --- a/src/Server/ya.make +++ b/src/Server/ya.make @@ -17,6 +17,7 @@ SRCS( MySQLHandler.cpp MySQLHandlerFactory.cpp NotFoundHandler.cpp + NuKeeperTCPHandler.cpp PostgreSQLHandler.cpp PostgreSQLHandlerFactory.cpp PrometheusMetricsWriter.cpp @@ -25,7 +26,6 @@ SRCS( ReplicasStatusHandler.cpp StaticRequestHandler.cpp TCPHandler.cpp - TestKeeperTCPHandler.cpp WebUIRequestHandler.cpp ) diff --git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index bf15ca22ca9..6fe98c53b3e 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -48,6 +48,7 @@ namespace ErrorCodes extern const int TOO_LARGE_SIZE_COMPRESSED; extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int EMPTY_DATA_PASSED; + extern const int INCORRECT_FILE_NAME; } @@ -56,14 +57,26 @@ namespace constexpr const std::chrono::minutes decrease_error_count_period{5}; template - ConnectionPoolPtrs createPoolsForAddresses(const std::string & name, PoolFactory && factory) + ConnectionPoolPtrs createPoolsForAddresses(const std::string & name, PoolFactory && factory, Poco::Logger * log) { ConnectionPoolPtrs pools; for (auto it = boost::make_split_iterator(name, boost::first_finder(",")); it != decltype(it){}; ++it) { Cluster::Address address = Cluster::Address::fromFullString(boost::copy_range(*it)); - pools.emplace_back(factory(address)); + try + { + pools.emplace_back(factory(address)); + } + 
catch (const Exception & e)
+            {
+                if (e.code() == ErrorCodes::INCORRECT_FILE_NAME)
+                {
+                    tryLogCurrentException(log);
+                    continue;
+                }
+                throw;
+            }
         }

         return pools;
@@ -351,16 +364,30 @@ void StorageDistributedDirectoryMonitor::run()

 ConnectionPoolPtr StorageDistributedDirectoryMonitor::createPool(const std::string & name, const StorageDistributed & storage)
 {
-    const auto pool_factory = [&storage] (const Cluster::Address & address) -> ConnectionPoolPtr
+    const auto pool_factory = [&storage, &name] (const Cluster::Address & address) -> ConnectionPoolPtr
     {
         const auto & cluster = storage.getCluster();
         const auto & shards_info = cluster->getShardsInfo();
         const auto & shards_addresses = cluster->getShardsAddresses();

-        /// check new format shard{shard_index}_number{number_index}
+        /// check new format shard{shard_index}_number{replica_index}
+        /// (shard_index and replica_index start from 1)
         if (address.shard_index != 0)
         {
-            return shards_info[address.shard_index - 1].per_replica_pools[address.replica_index - 1];
+            if (!address.replica_index)
+                throw Exception(ErrorCodes::INCORRECT_FILE_NAME,
+                    "Wrong replica_index={} ({})", address.replica_index, name);
+
+            if (address.shard_index > shards_info.size())
+                throw Exception(ErrorCodes::INCORRECT_FILE_NAME,
+                    "No shard with shard_index={} ({})", address.shard_index, name);
+
+            const auto & shard_info = shards_info[address.shard_index - 1];
+            if (address.replica_index > shard_info.per_replica_pools.size())
+                throw Exception(ErrorCodes::INCORRECT_FILE_NAME,
+                    "No replica with replica_index={} ({})", address.replica_index, name);
+
+            return shard_info.per_replica_pools[address.replica_index - 1];
         }

         /// existing connection pools have a higher priority
@@ -398,7 +425,7 @@
             address.secure);
     };

-    auto pools = createPoolsForAddresses(name, pool_factory);
+    auto pools = createPoolsForAddresses(name, pool_factory, storage.log);

     const auto settings = storage.global_context.getSettings();
     return pools.size() == 1 ?
pools.front() : std::make_shared<ConnectionPoolWithFailover>(pools,
diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp
index cd96ab4ad40..bfec7bffc8c 100644
--- a/src/Storages/LiveView/StorageLiveView.cpp
+++ b/src/Storages/LiveView/StorageLiveView.cpp
@@ -512,8 +512,8 @@ Pipe StorageLiveView::read(
     else if (is_periodically_refreshed)
     {
-        Seconds current_time = std::chrono::duration_cast<Seconds> (std::chrono::system_clock::now().time_since_epoch());
-        Seconds blocks_time = std::chrono::duration_cast<Seconds> (getBlocksTime().time_since_epoch());
+        Seconds current_time = std::chrono::duration_cast<Seconds>(std::chrono::system_clock::now().time_since_epoch());
+        Seconds blocks_time = std::chrono::duration_cast<Seconds>(getBlocksTime().time_since_epoch());

         if ((current_time - periodic_live_view_refresh) >= blocks_time)
             refresh(false);
diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp
index 3e3f693addd..8e5a0e8a3b8 100644
--- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp
+++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp
@@ -98,11 +98,21 @@ try
 {
     try /// We don't want exceptions in background pool
     {
-        job();
+        bool job_success = job();
         /// Job done, decrement metric and reset no_work counter
         CurrentMetrics::values[pool_config.tasks_metric]--;
+
+        if (job_success)
+        {
+            /// Job done, new empty space in pool, schedule background task
+            runTaskWithoutDelay();
+        }
+        else
+        {
+            /// Job done, but failed, schedule with backoff
+            scheduleTask(/* with_backoff = */ true);
+        }
-        /// Job done, new empty space in pool, schedule background task
-        runTaskWithoutDelay();
+
     }
     catch (...)
diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsExecutor.h
index 85067188f09..da22c752e1b 100644
--- a/src/Storages/MergeTree/BackgroundJobsExecutor.h
+++ b/src/Storages/MergeTree/BackgroundJobsExecutor.h
@@ -36,10 +36,12 @@ enum class PoolType
     FETCH,
 };

+using BackgroundJobFunc = std::function<bool()>;
+
 /// Result from background job providers: the function to execute in a pool, and the type of pool it should run in.
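 /// The job returns true on success; if it returns false, the executor reschedules it with a backoff.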
struct JobAndPool
 {
-    ThreadPool::Job job;
+    BackgroundJobFunc job;
     PoolType pool_type;
 };

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index c6e77a56db6..a0d23b8ab22 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -2362,7 +2362,7 @@ size_t MergeTreeData::getPartsCount() const
 }

-size_t MergeTreeData::getMaxPartsCountForPartition() const
+size_t MergeTreeData::getMaxPartsCountForPartitionWithState(DataPartState state) const
 {
     auto lock = lockParts();

@@ -2370,7 +2370,7 @@
     size_t cur_count = 0;
     const String * cur_partition_id = nullptr;

-    for (const auto & part : getDataPartsStateRange(DataPartState::Committed))
+    for (const auto & part : getDataPartsStateRange(state))
     {
         if (cur_partition_id && part->info.partition_id == *cur_partition_id)
         {
@@ -2389,6 +2389,18 @@
 }

+size_t MergeTreeData::getMaxPartsCountForPartition() const
+{
+    return getMaxPartsCountForPartitionWithState(DataPartState::Committed);
+}
+
+
+size_t MergeTreeData::getMaxInactivePartsCountForPartition() const
+{
+    return getMaxPartsCountForPartitionWithState(DataPartState::Outdated);
+}
+
+
 std::optional<Int64> MergeTreeData::getMinPartDataVersion() const
 {
     auto lock = lockParts();

@@ -2414,19 +2426,47 @@ void MergeTreeData::delayInsertOrThrowIfNeeded(Poco::Event * until) const
         throw Exception("Too many parts (" + toString(parts_count_in_total) + ") in all partitions in total. This indicates wrong choice of partition key. The threshold can be modified with 'max_parts_in_total' setting in <merge_tree> element in config.xml or with per-table setting.", ErrorCodes::TOO_MANY_PARTS);
     }

-    const size_t parts_count_in_partition = getMaxPartsCountForPartition();
+    size_t parts_count_in_partition = getMaxPartsCountForPartition();
+    ssize_t k_inactive = -1;
+    if (settings->inactive_parts_to_throw_insert > 0 || settings->inactive_parts_to_delay_insert > 0)
+    {
+        size_t inactive_parts_count_in_partition = getMaxInactivePartsCountForPartition();
+        if (inactive_parts_count_in_partition >= settings->inactive_parts_to_throw_insert)
+        {
+            ProfileEvents::increment(ProfileEvents::RejectedInserts);
+            throw Exception(
+                ErrorCodes::TOO_MANY_PARTS,
+                "Too many inactive parts ({}). Cleaning of inactive parts is significantly slower than inserts",
+                inactive_parts_count_in_partition);
+        }
+        k_inactive = ssize_t(inactive_parts_count_in_partition) - ssize_t(settings->inactive_parts_to_delay_insert);
+    }

     if (parts_count_in_partition >= settings->parts_to_throw_insert)
     {
         ProfileEvents::increment(ProfileEvents::RejectedInserts);
-        throw Exception("Too many parts (" + toString(parts_count_in_partition) + "). Merges are processing significantly slower than inserts.", ErrorCodes::TOO_MANY_PARTS);
+        throw Exception(
+            ErrorCodes::TOO_MANY_PARTS,
+            "Too many parts ({}). Merges are processing significantly slower than inserts",
+            parts_count_in_partition);
     }

-    if (parts_count_in_partition < settings->parts_to_delay_insert)
+    if (k_inactive < 0 && parts_count_in_partition < settings->parts_to_delay_insert)
         return;

-    const size_t max_k = settings->parts_to_throw_insert - settings->parts_to_delay_insert; /// always > 0
-    const size_t k = 1 + parts_count_in_partition - settings->parts_to_delay_insert; /// from 1 to max_k
+    const ssize_t k_active = ssize_t(parts_count_in_partition) - ssize_t(settings->parts_to_delay_insert);
+    size_t max_k;
+    size_t k;
+    if (k_active > k_inactive)
+    {
+        max_k = settings->parts_to_throw_insert - settings->parts_to_delay_insert;
+        k = k_active + 1;
+    }
+    else
+    {
+        max_k = settings->inactive_parts_to_throw_insert - settings->inactive_parts_to_delay_insert;
+        k = k_inactive + 1;
+    }

     const double delay_milliseconds = ::pow(settings->max_delay_to_insert * 1000, static_cast<double>(k) / max_k);

     ProfileEvents::increment(ProfileEvents::DelayedInserts);
@@ -3796,7 +3836,7 @@ std::optional<JobAndPool> MergeTreeData::getDataMovingJob()

     return JobAndPool{[this, moving_tagger] () mutable
     {
-        moveParts(moving_tagger);
+        return moveParts(moving_tagger);
     }, PoolType::MOVE};
 }

diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index efa6919dc9b..2aefa66ac58 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -415,7 +415,9 @@ public:
     size_t getTotalActiveSizeInRows() const;

     size_t getPartsCount() const;
+    size_t getMaxPartsCountForPartitionWithState(DataPartState state) const;
     size_t getMaxPartsCountForPartition() const;
+    size_t getMaxInactivePartsCountForPartition() const;

     /// Get min value of part->info.getDataVersion() for all active parts.
     /// Makes sense only for ordinary MergeTree engines because for them block numbering doesn't depend on partition.
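The throttling curve introduced above is easier to see in isolation. Below is a minimal self-contained sketch of the delay computation; the function name and flattened parameter list are illustrative only, not part of the patch (the real logic lives in MergeTreeData::delayInsertOrThrowIfNeeded() and reads the MergeTreeSettings added in this patch):

    #include <cmath>
    #include <cstdint>

    // Illustrative model (hypothetical helper, not in the patch) of the insert
    // delay computed above. Returns the artificial delay in milliseconds.
    double insertDelayMilliseconds(
        int64_t active_parts, int64_t parts_to_delay, int64_t parts_to_throw,
        int64_t inactive_parts, int64_t inactive_to_delay, int64_t inactive_to_throw,
        double max_delay_seconds)
    {
        // k counts how far past the 'delay' threshold each class of parts is.
        const int64_t k_active = active_parts - parts_to_delay;
        // Inactive-parts throttling is off by default (its thresholds are 0).
        const int64_t k_inactive
            = inactive_to_delay > 0 ? inactive_parts - inactive_to_delay : -1;

        if (k_active < 0 && k_inactive < 0)
            return 0; // Below both thresholds, no throttling.

        // The class of parts closer to its 'throw' limit drives the delay.
        int64_t k;
        int64_t max_k;
        if (k_active > k_inactive)
        {
            max_k = parts_to_throw - parts_to_delay;
            k = k_active + 1;
        }
        else
        {
            max_k = inactive_to_throw - inactive_to_delay;
            k = k_inactive + 1;
        }

        // Exponential ramp: ~1 ms just past the 'delay' threshold, rising to
        // the full max_delay just before the 'throw' threshold rejects inserts.
        return std::pow(max_delay_seconds * 1000.0,
            static_cast<double>(k) / static_cast<double>(max_k));
    }

With the default thresholds (parts_to_delay_insert = 150, parts_to_throw_insert = 300, max_delay_to_insert = 1), 151 active parts delay an insert by about 1 ms, and 299 parts by roughly a full second.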
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index d41faa1ed46..d23413f4a84 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -175,6 +175,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( Names virt_column_names; Names real_column_names; + size_t total_parts = parts.size(); bool part_column_queried = false; bool part_uuid_column_queried = false; @@ -550,7 +551,21 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( if (select.prewhere()) prewhere_column = select.prewhere()->getColumnName(); - std::vector> useful_indices; + struct DataSkippingIndexAndCondition + { + MergeTreeIndexPtr index; + MergeTreeIndexConditionPtr condition; + std::atomic total_granules; + std::atomic granules_dropped; + + DataSkippingIndexAndCondition(MergeTreeIndexPtr index_, MergeTreeIndexConditionPtr condition_) + : index(index_) + , condition(condition_) + , total_granules(0) + , granules_dropped(0) + {} + }; + std::list useful_indices; for (const auto & index : metadata_snapshot->getSecondaryIndices()) { @@ -579,7 +594,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( std::unordered_set useful_indices_names; for (const auto & useful_index : useful_indices) - useful_indices_names.insert(useful_index.first->index.name); + useful_indices_names.insert(useful_index.index->index.name); for (const auto & index_name : forced_indices) { @@ -595,6 +610,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( RangesInDataParts parts_with_ranges(parts.size()); size_t sum_marks = 0; std::atomic sum_marks_pk = 0; + std::atomic total_marks_pk = 0; + size_t sum_ranges = 0; /// Let's find what range to read from each part. 
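    /// (Several parts are examined concurrently, hence the atomic counters in DataSkippingIndexAndCondition.)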
@@ -615,6 +632,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( RangesInDataPart ranges(part, part_index); + total_marks_pk.fetch_add(part->index_granularity.getMarksCount(), std::memory_order_relaxed); + if (metadata_snapshot->hasPrimaryKey()) ranges.ranges = markRangesFromPKRange(part, metadata_snapshot, key_condition, settings, log); else @@ -630,9 +649,20 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( sum_marks_pk.fetch_add(ranges.getMarksCount(), std::memory_order_relaxed); - for (const auto & index_and_condition : useful_indices) + for (auto & index_and_condition : useful_indices) + { + size_t total_granules = 0; + size_t granules_dropped = 0; ranges.ranges = filterMarksUsingIndex( - index_and_condition.first, index_and_condition.second, part, ranges.ranges, settings, reader_settings, log); + index_and_condition.index, index_and_condition.condition, + part, ranges.ranges, + settings, reader_settings, + total_granules, granules_dropped, + log); + + index_and_condition.total_granules.fetch_add(total_granules, std::memory_order_relaxed); + index_and_condition.granules_dropped.fetch_add(granules_dropped, std::memory_order_relaxed); + } if (!ranges.ranges.empty()) { @@ -697,7 +727,19 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( parts_with_ranges.resize(next_part); } - LOG_DEBUG(log, "Selected {} parts by partition key, {} parts by primary key, {} marks by primary key, {} marks to read from {} ranges", parts.size(), parts_with_ranges.size(), sum_marks_pk.load(std::memory_order_relaxed), sum_marks, sum_ranges); + for (const auto & index_and_condition : useful_indices) + { + const auto & index_name = index_and_condition.index->index.name; + LOG_DEBUG(log, "Index {} has dropped {}/{} granules.", + backQuote(index_name), + index_and_condition.granules_dropped, index_and_condition.total_granules); + } + + LOG_DEBUG(log, "Selected {}/{} parts by partition key, {} parts by primary key, {}/{} marks by primary key, {} marks to read from {} ranges", + parts.size(), total_parts, parts_with_ranges.size(), + sum_marks_pk.load(std::memory_order_relaxed), + total_marks_pk.load(std::memory_order_relaxed), + sum_marks, sum_ranges); if (parts_with_ranges.empty()) return std::make_unique(); @@ -1595,8 +1637,6 @@ MarkRanges MergeTreeDataSelectExecutor::markRangesFromPKRange( /// If index is not used. 
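    /// In that case the whole part is returned as a single range of marks.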
if (key_condition.alwaysUnknownOrTrue()) { - LOG_TRACE(log, "Not using primary index on part {}", part->name); - if (has_final_mark) res.push_back(MarkRange(0, marks_count - 1)); else @@ -1769,6 +1809,8 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( const MarkRanges & ranges, const Settings & settings, const MergeTreeReaderSettings & reader_settings, + size_t & total_granules, + size_t & granules_dropped, Poco::Logger * log) { if (!part->volume->getDisk()->exists(part->getFullRelativePath() + index_helper->getFileName() + ".idx")) @@ -1785,9 +1827,6 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( part->index_granularity_info.fixed_index_granularity, part->index_granularity_info.index_granularity_bytes); - size_t granules_dropped = 0; - size_t total_granules = 0; - size_t marks_count = part->getMarksCount(); size_t final_mark = part->index_granularity.hasFinalMark(); size_t index_marks_count = (marks_count - final_mark + index_granularity - 1) / index_granularity; @@ -1839,8 +1878,6 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex( last_index_mark = index_range.end - 1; } - LOG_DEBUG(log, "Index {} has dropped {} / {} granules.", backQuote(index_helper->index.name), granules_dropped, total_granules); - return res; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 04a3be3d3f0..7692424dfb5 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -113,6 +113,8 @@ private: const MarkRanges & ranges, const Settings & settings, const MergeTreeReaderSettings & reader_settings, + size_t & total_granules, + size_t & granules_dropped, Poco::Logger * log); /// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`, diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 53388617a07..16657b4083d 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -57,7 +57,9 @@ struct Settings; \ /** Inserts settings. */ \ M(UInt64, parts_to_delay_insert, 150, "If table contains at least that many active parts in single partition, artificially slow down insert into table.", 0) \ + M(UInt64, inactive_parts_to_delay_insert, 0, "If table contains at least that many inactive parts in single partition, artificially slow down insert into table.", 0) \ M(UInt64, parts_to_throw_insert, 300, "If more than this number active parts in single partition, throw 'Too many parts ...' exception.", 0) \ + M(UInt64, inactive_parts_to_throw_insert, 0, "If more than this number inactive parts in single partition, throw 'Too many inactive parts ...' exception.", 0) \ M(UInt64, max_delay_to_insert, 1, "Max delay of inserting data into MergeTree table in seconds, if there are a lot of unmerged parts in single partition.", 0) \ M(UInt64, max_parts_in_total, 100000, "If more than this number active parts in all partitions in total, throw 'Too many parts ...' 
exception.", 0) \ \ diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 3ee9dda2bf3..d14f11c4a29 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -94,6 +94,7 @@ StorageRabbitMQ::StorageRabbitMQ( , login_password(std::make_pair( global_context.getConfigRef().getString("rabbitmq.username"), global_context.getConfigRef().getString("rabbitmq.password"))) + , vhost(global_context.getConfigRef().getString("rabbitmq.vhost", "/")) , semaphore(0, num_consumers) , unique_strbase(getRandomName()) , queue_size(std::max(QUEUE_SIZE, static_cast(getMaxBlockSize()))) @@ -483,7 +484,9 @@ bool StorageRabbitMQ::restoreConnection(bool reconnecting) } connection = std::make_unique(event_handler.get(), - AMQP::Address(parsed_address.first, parsed_address.second, AMQP::Login(login_password.first, login_password.second), "/")); + AMQP::Address( + parsed_address.first, parsed_address.second, + AMQP::Login(login_password.first, login_password.second), vhost)); cnt_retries = 0; while (!connection->ready() && !stream_cancelled && ++cnt_retries != RETRIES_MAX) @@ -702,7 +705,7 @@ ConsumerBufferPtr StorageRabbitMQ::createReadBuffer() ProducerBufferPtr StorageRabbitMQ::createWriteBuffer() { return std::make_shared( - parsed_address, global_context, login_password, routing_keys, exchange_name, exchange_type, + parsed_address, global_context, login_password, vhost, routing_keys, exchange_name, exchange_type, producer_id.fetch_add(1), persistent, wait_confirm, log, row_delimiter ? std::optional{row_delimiter} : std::nullopt, 1, 1024); } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.h b/src/Storages/RabbitMQ/StorageRabbitMQ.h index 893c5167a97..aa316e7a842 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.h +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.h @@ -94,6 +94,7 @@ private: String address; std::pair parsed_address; std::pair login_password; + String vhost; std::unique_ptr loop; std::shared_ptr event_handler; diff --git a/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.cpp b/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.cpp index 08b95d46115..ac1b253b4bb 100644 --- a/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.cpp +++ b/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.cpp @@ -29,6 +29,7 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer( std::pair & parsed_address_, const Context & global_context, const std::pair & login_password_, + const String & vhost_, const Names & routing_keys_, const String & exchange_name_, const AMQP::ExchangeType exchange_type_, @@ -42,6 +43,7 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer( : WriteBuffer(nullptr, 0) , parsed_address(parsed_address_) , login_password(login_password_) + , vhost(vhost_) , routing_keys(routing_keys_) , exchange_name(exchange_name_) , exchange_type(exchange_type_) @@ -149,7 +151,9 @@ bool WriteBufferToRabbitMQProducer::setupConnection(bool reconnecting) } connection = std::make_unique(event_handler.get(), - AMQP::Address(parsed_address.first, parsed_address.second, AMQP::Login(login_password.first, login_password.second), "/")); + AMQP::Address( + parsed_address.first, parsed_address.second, + AMQP::Login(login_password.first, login_password.second), vhost)); cnt_retries = 0; while (!connection->ready() && ++cnt_retries != RETRIES_MAX) diff --git a/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.h b/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.h index 2897e20b21d..e88f92239ca 100644 --- 
a/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.h +++ b/src/Storages/RabbitMQ/WriteBufferToRabbitMQProducer.h @@ -21,6 +21,7 @@ public: std::pair & parsed_address_, const Context & global_context, const std::pair & login_password_, + const String & vhost_, const Names & routing_keys_, const String & exchange_name_, const AMQP::ExchangeType exchange_type_, @@ -53,6 +54,7 @@ private: std::pair parsed_address; const std::pair login_password; + const String vhost; const Names routing_keys; const String exchange_name; AMQP::ExchangeType exchange_type; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index bf02a04c704..e28d5f4d6d1 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -72,14 +72,14 @@ StorageBuffer::StorageBuffer( const StorageID & destination_id_, bool allow_materialized_) : IStorage(table_id_) - , global_context(context_.getGlobalContext()) + , buffer_context(context_.getBufferContext()) , num_shards(num_shards_), buffers(num_shards_) , min_thresholds(min_thresholds_) , max_thresholds(max_thresholds_) , destination_id(destination_id_) , allow_materialized(allow_materialized_) , log(&Poco::Logger::get("StorageBuffer (" + table_id_.getFullTableName() + ")")) - , bg_pool(global_context.getBufferFlushSchedulePool()) + , bg_pool(buffer_context.getBufferFlushSchedulePool()) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); @@ -475,7 +475,7 @@ public: StoragePtr destination; if (storage.destination_id) { - destination = DatabaseCatalog::instance().tryGetTable(storage.destination_id, storage.global_context); + destination = DatabaseCatalog::instance().tryGetTable(storage.destination_id, storage.buffer_context); if (destination.get() == &storage) throw Exception("Destination table is myself. Write will cause infinite loop.", ErrorCodes::INFINITE_LOOP); } @@ -591,9 +591,9 @@ bool StorageBuffer::mayBenefitFromIndexForIn( void StorageBuffer::startup() { - if (global_context.getSettingsRef().readonly) + if (buffer_context.getSettingsRef().readonly) { - LOG_WARNING(log, "Storage {} is run with readonly settings, it will not be able to insert data. Set appropriate system_profile to fix this.", getName()); + LOG_WARNING(log, "Storage {} is run with readonly settings, it will not be able to insert data. Set appropriate buffer_profile to fix this.", getName()); } flush_handle = bg_pool.createTask(log->name() + "/Bg", [this]{ backgroundFlush(); }); @@ -610,7 +610,7 @@ void StorageBuffer::shutdown() try { - optimize(nullptr /*query*/, getInMemoryMetadataPtr(), {} /*partition*/, false /*final*/, false /*deduplicate*/, {}, global_context); + optimize(nullptr /*query*/, getInMemoryMetadataPtr(), {} /*partition*/, false /*final*/, false /*deduplicate*/, {}, buffer_context); } catch (...) 
{ @@ -651,6 +651,15 @@ bool StorageBuffer::optimize( return true; } +bool StorageBuffer::supportsPrewhere() const +{ + if (!destination_id) + return false; + auto dest = DatabaseCatalog::instance().tryGetTable(destination_id, buffer_context); + if (dest && dest.get() != this) + return dest->supportsPrewhere(); + return false; +} bool StorageBuffer::checkThresholds(const Buffer & buffer, time_t current_time, size_t additional_rows, size_t additional_bytes) const { @@ -757,7 +766,7 @@ void StorageBuffer::flushBuffer(Buffer & buffer, bool check_thresholds, bool loc Stopwatch watch; try { - writeBlockToDestination(block_to_write, DatabaseCatalog::instance().tryGetTable(destination_id, global_context)); + writeBlockToDestination(block_to_write, DatabaseCatalog::instance().tryGetTable(destination_id, buffer_context)); if (reset_block_structure) buffer.data.clear(); } @@ -839,7 +848,7 @@ void StorageBuffer::writeBlockToDestination(const Block & block, StoragePtr tabl for (const auto & column : block_to_write) list_of_columns->children.push_back(std::make_shared(column.name)); - auto insert_context = Context(global_context); + auto insert_context = Context(buffer_context); insert_context.makeQueryContext(); InterpreterInsertQuery interpreter{insert, insert_context, allow_materialized}; @@ -916,7 +925,7 @@ void StorageBuffer::checkAlterIsPossible(const AlterCommands & commands, const S std::optional StorageBuffer::totalRows(const Settings & settings) const { std::optional underlying_rows; - auto underlying = DatabaseCatalog::instance().tryGetTable(destination_id, global_context); + auto underlying = DatabaseCatalog::instance().tryGetTable(destination_id, buffer_context); if (underlying) underlying_rows = underlying->totalRows(settings); diff --git a/src/Storages/StorageBuffer.h b/src/Storages/StorageBuffer.h index 9656c78637b..46907ca196b 100644 --- a/src/Storages/StorageBuffer.h +++ b/src/Storages/StorageBuffer.h @@ -93,15 +93,7 @@ public: const Context & context) override; bool supportsSampling() const override { return true; } - bool supportsPrewhere() const override - { - if (!destination_id) - return false; - auto dest = DatabaseCatalog::instance().tryGetTable(destination_id, global_context); - if (dest && dest.get() != this) - return dest->supportsPrewhere(); - return false; - } + bool supportsPrewhere() const override; bool supportsFinal() const override { return true; } bool supportsIndexForIn() const override { return true; } @@ -120,7 +112,7 @@ public: private: - const Context & global_context; + const Context & buffer_context; struct Buffer { diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index dc695427156..79ced856231 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -45,6 +45,8 @@ public: /// Smaller blocks (e.g. 64K rows) are better for CPU cache. 
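The StorageBuffer changes above do two related things: background flushes now run under a dedicated buffer context (so an operator can point them at a separate buffer_profile instead of the server-wide system_profile), and supportsPrewhere() moves out of the header and simply asks the destination table. A minimal sketch of what that delegation enables, assuming a local server reachable through the Python clickhouse-driver package (client library and all values are illustrative):

    from clickhouse_driver import Client  # assumed Python client, illustration only

    client = Client("localhost")
    client.execute("CREATE TABLE dst (key Int32, value String) ENGINE = MergeTree() ORDER BY key")
    client.execute("CREATE TABLE buf AS dst ENGINE = Buffer(currentDatabase(), dst, 1, 10, 100, 10000, 1000000, 10000000, 100000000)")
    client.execute("INSERT INTO buf VALUES", [(1, "x")])
    # Accepted because the Buffer table now forwards supportsPrewhere()
    # to its MergeTree destination, which does support PREWHERE.
    print(client.execute("SELECT value FROM buf PREWHERE key = 1"))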
bool prefersLargeBlocks() const override { return false; } + bool hasEvenlyDistributedRead() const override { return true; } + BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, const Context & context) override; void drop() override; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 11a159d4a6c..202e909af0f 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -962,9 +962,11 @@ std::optional StorageMergeTree::getDataProcessingJob() return JobAndPool{[this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable { if (merge_entry) - mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); + return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); else if (mutate_entry) - mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock); + return mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock); + + __builtin_unreachable(); }, PoolType::MERGE_MUTATE}; } else if (auto lock = time_after_previous_cleanup.compareAndRestartDeferred(1)) @@ -978,6 +980,7 @@ std::optional StorageMergeTree::getDataProcessingJob() clearOldWriteAheadLogs(); clearOldMutations(); clearEmptyParts(); + return true; }, PoolType::MERGE_MUTATE}; } return {}; diff --git a/src/Storages/StorageMongoDB.cpp b/src/Storages/StorageMongoDB.cpp index be1159b1a63..09fd413af75 100644 --- a/src/Storages/StorageMongoDB.cpp +++ b/src/Storages/StorageMongoDB.cpp @@ -42,7 +42,6 @@ StorageMongoDB::StorageMongoDB( , collection_name(collection_name_) , username(username_) , password(password_) - , connection{std::make_shared(host, port)} { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); @@ -51,6 +50,26 @@ StorageMongoDB::StorageMongoDB( } +void StorageMongoDB::connectIfNotConnected() +{ + std::lock_guard lock{connection_mutex}; + if (!connection) + connection = std::make_shared(host, port); + + if (!authentified) + { +# if POCO_VERSION >= 0x01070800 + Poco::MongoDB::Database poco_db(database_name); + if (!poco_db.authenticate(*connection, username, password, Poco::MongoDB::Database::AUTH_SCRAM_SHA1)) + throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE); +# else + authenticate(*connection, database_name, username, password); +# endif + authentified = true; + } +} + + Pipe StorageMongoDB::read( const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -60,15 +79,9 @@ Pipe StorageMongoDB::read( size_t max_block_size, unsigned) { - metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); + connectIfNotConnected(); -#if POCO_VERSION >= 0x01070800 - Poco::MongoDB::Database poco_db(database_name); - if (!poco_db.authenticate(*connection, username, password, Poco::MongoDB::Database::AUTH_SCRAM_SHA1)) - throw Exception("Cannot authenticate in MongoDB, incorrect user or password", ErrorCodes::MONGODB_CANNOT_AUTHENTICATE); -#else - authenticate(*connection, database_name, username, password); -#endif + metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); Block sample_block; for (const String & column_name : column_names) diff --git a/src/Storages/StorageMongoDB.h b/src/Storages/StorageMongoDB.h index d7b71495574..589ab276539 100644 --- a/src/Storages/StorageMongoDB.h +++ b/src/Storages/StorageMongoDB.h @@ -40,16 +40,19 @@ public: size_t max_block_size, unsigned num_streams) override; - private: - std::string host; - 
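StorageMongoDB above stops opening the connection in the constructor; instead, connectIfNotConnected() connects and authenticates on the first read, with a mutex protecting both the connection and the authentified flag. A minimal, pure-Python sketch of the same connect-on-first-use pattern; the class, its stand-in internals, and the parameter values are illustrative, not ClickHouse code.

    import threading

    class LazyMongoSource:
        """Connect-on-first-use with a lock, mirroring connectIfNotConnected()."""

        def __init__(self, host, port, database, user, password):
            self._params = (host, port, database, user, password)
            self._lock = threading.Lock()
            self._connection = None
            self._authenticated = False

        def _connect_if_not_connected(self):
            # Serialize connection + authentication so concurrent first
            # readers perform the handshake exactly once.
            with self._lock:
                if self._connection is None:
                    # Stand-in for opening the real driver connection.
                    self._connection = {"host": self._params[0], "port": self._params[1]}
                if not self._authenticated:
                    # Stand-in for SCRAM-SHA-1 authentication.
                    self._authenticated = True

        def read(self):
            self._connect_if_not_connected()  # the constructor never touches the network
            return self._connection

    src = LazyMongoSource("localhost", 27017, "db", "user", "secret")
    print(src.read())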
short unsigned int port; - std::string database_name; - std::string collection_name; - std::string username; - std::string password; + void connectIfNotConnected(); + + const std::string host; + const short unsigned int port; + const std::string database_name; + const std::string collection_name; + const std::string username; + const std::string password; std::shared_ptr connection; + bool authentified = false; + std::mutex connection_mutex; /// Protects the variables `connection` and `authentified`. }; } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 53104efeb43..518577c473c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -751,7 +751,7 @@ void StorageReplicatedMergeTree::drop() auto zookeeper = global_context.getZooKeeper(); /// If probably there is metadata in ZooKeeper, we don't allow to drop the table. - if (is_readonly || !zookeeper) + if (!zookeeper) throw Exception("Can't drop readonly replicated table (need to drop data in ZooKeeper as well)", ErrorCodes::TABLE_IS_READ_ONLY); shutdown(); @@ -2682,7 +2682,7 @@ std::optional StorageReplicatedMergeTree::getDataProcessingJob() return JobAndPool{[this, selected_entry] () mutable { - processQueueEntry(selected_entry); + return processQueueEntry(selected_entry); }, pool_type}; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index cafd552978e..d0e3b70d900 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -234,6 +234,7 @@ StorageS3::StorageS3( uri_.is_virtual_hosted_style, credentials.GetAWSAccessKeyId(), credentials.GetAWSSecretKey(), + settings.server_side_encryption_customer_key_base64, std::move(settings.headers), settings.use_environment_credentials.value_or(global_context.getConfigRef().getBool("s3.use_environment_credentials", false)) ); diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 54384ac8253..6d97e6fae95 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -30,6 +30,7 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U auto endpoint = config.getString(config_elem + "." + key + ".endpoint"); auto access_key_id = config.getString(config_elem + "." + key + ".access_key_id", ""); auto secret_access_key = config.getString(config_elem + "." + key + ".secret_access_key", ""); + auto server_side_encryption_customer_key_base64 = config.getString(config_elem + "." + key + ".server_side_encryption_customer_key_base64", ""); std::optional use_environment_credentials; if (config.has(config_elem + "." 
+ key + ".use_environment_credentials")) { @@ -51,7 +52,7 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U } } - settings.emplace(endpoint, S3AuthSettings{std::move(access_key_id), std::move(secret_access_key), std::move(headers), use_environment_credentials}); + settings.emplace(endpoint, S3AuthSettings{std::move(access_key_id), std::move(secret_access_key), std::move(server_side_encryption_customer_key_base64), std::move(headers), use_environment_credentials}); } } } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 88f964774c6..59b98ebdfdd 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -27,6 +27,7 @@ struct S3AuthSettings { const String access_key_id; const String secret_access_key; + const String server_side_encryption_customer_key_base64; const HeaderCollection headers; diff --git a/src/Storages/System/StorageSystemQuotaUsage.cpp b/src/Storages/System/StorageSystemQuotaUsage.cpp index 002ab081bcf..6d6e22e7be6 100644 --- a/src/Storages/System/StorageSystemQuotaUsage.cpp +++ b/src/Storages/System/StorageSystemQuotaUsage.cpp @@ -137,6 +137,9 @@ void StorageSystemQuotaUsage::fillDataImpl( column_quota_name.insertData(quota_name.data(), quota_name.length()); column_quota_key.insertData(quota_key.data(), quota_key.length()); + if (add_column_is_current) + column_is_current->push_back(quota_id == current_quota_id); + if (!interval) { column_start_time.insertDefault(); @@ -171,9 +174,6 @@ void StorageSystemQuotaUsage::fillDataImpl( addValue(*column_max[resource_type], *column_max_null_map[resource_type], interval->max[resource_type], type_info); addValue(*column_usage[resource_type], *column_usage_null_map[resource_type], interval->used[resource_type], type_info); } - - if (add_column_is_current) - column_is_current->push_back(quota_id == current_quota_id); }; auto add_rows = [&](const String & quota_name, const UUID & quota_id, const String & quota_key, const std::vector & intervals) diff --git a/src/Storages/tests/gtest_background_executor.cpp b/src/Storages/tests/gtest_background_executor.cpp index bf9a305ccc9..0ddf2d9ea2a 100644 --- a/src/Storages/tests/gtest_background_executor.cpp +++ b/src/Storages/tests/gtest_background_executor.cpp @@ -32,7 +32,7 @@ protected: std::optional getBackgroundJob() override { - return JobAndPool{[] { std::this_thread::sleep_for(1s); counter++; }, PoolType::MERGE_MUTATE}; + return JobAndPool{[] { std::this_thread::sleep_for(1s); counter++; return true; }, PoolType::MERGE_MUTATE}; } }; diff --git a/src/TableFunctions/TableFunctionNumbers.cpp b/src/TableFunctions/TableFunctionNumbers.cpp index 4658165735a..594075b1c82 100644 --- a/src/TableFunctions/TableFunctionNumbers.cpp +++ b/src/TableFunctions/TableFunctionNumbers.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include "registerTableFunctions.h" @@ -17,6 +18,7 @@ namespace DB namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; } @@ -56,7 +58,16 @@ void registerTableFunctionNumbers(TableFunctionFactory & factory) template UInt64 TableFunctionNumbers::evaluateArgument(const Context & context, ASTPtr & argument) const { - return evaluateConstantExpressionOrIdentifierAsLiteral(argument, context)->as().value.safeGet(); + const auto & [field, type] = evaluateConstantExpression(argument, context); + + if (!isNativeNumber(type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} 
expression, must be numeric type", type->getName()); + + Field converted = convertFieldToType(field, DataTypeUInt64()); + if (converted.isNull()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value {} is not representable as UInt64", applyVisitor(FieldVisitorToString(), field)); + + return converted.safeGet(); } } diff --git a/src/ya.make b/src/ya.make index c3e6b41b9b9..5361c8a5695 100644 --- a/src/ya.make +++ b/src/ya.make @@ -9,6 +9,7 @@ PEERDIR( clickhouse/src/Columns clickhouse/src/Common clickhouse/src/Compression + clickhouse/src/Coordination clickhouse/src/Core clickhouse/src/Databases clickhouse/src/DataStreams diff --git a/tests/config/config.d/test_keeper_port.xml b/tests/config/config.d/test_keeper_port.xml index 79e993b41f7..97c6d7c2e33 100644 --- a/tests/config/config.d/test_keeper_port.xml +++ b/tests/config/config.d/test_keeper_port.xml @@ -1,7 +1,21 @@ 9181 - 10000 - 30000 + 1 + + + 10000 + 30000 + 0 + 0 + + + + + 1 + localhost + 44444 + + diff --git a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.reference b/tests/integration/test_buffer_profile/__init__.py similarity index 100% rename from tests/queries/0_stateless/01641_memory_tracking_insert_optimize.reference rename to tests/integration/test_buffer_profile/__init__.py diff --git a/tests/integration/test_buffer_profile/configs/buffer_profile.xml b/tests/integration/test_buffer_profile/configs/buffer_profile.xml new file mode 100644 index 00000000000..6ce6de70e63 --- /dev/null +++ b/tests/integration/test_buffer_profile/configs/buffer_profile.xml @@ -0,0 +1,3 @@ + + buffer_profile + diff --git a/tests/integration/test_buffer_profile/configs/users.d/buffer_profile.xml b/tests/integration/test_buffer_profile/configs/users.d/buffer_profile.xml new file mode 100644 index 00000000000..2edd2b63dc6 --- /dev/null +++ b/tests/integration/test_buffer_profile/configs/users.d/buffer_profile.xml @@ -0,0 +1,8 @@ + + + + 1 + + + + diff --git a/tests/integration/test_buffer_profile/test.py b/tests/integration/test_buffer_profile/test.py new file mode 100644 index 00000000000..ae9220898ab --- /dev/null +++ b/tests/integration/test_buffer_profile/test.py @@ -0,0 +1,54 @@ +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name +# pylint: disable=line-too-long + +import pytest + +from helpers.cluster import ClickHouseCluster +from helpers.client import QueryRuntimeException + +cluster = ClickHouseCluster(__file__) + +node_default = cluster.add_instance('node_default') +node_buffer_profile = cluster.add_instance('node_buffer_profile', + main_configs=['configs/buffer_profile.xml'], + user_configs=['configs/users.d/buffer_profile.xml']) + +@pytest.fixture(scope='module', autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def bootstrap(node): + node.query(""" + CREATE TABLE data (key Int) Engine=MergeTree() + ORDER BY key + PARTITION BY key % 2; + + CREATE TABLE buffer AS data Engine=Buffer(currentDatabase(), data, + /* settings for manual flush only */ + 1, /* num_layers */ + 10e6, /* min_time, placeholder */ + 10e6, /* max_time, placeholder */ + 0, /* min_rows */ + 10e6, /* max_rows */ + 0, /* min_bytes */ + 80e6 /* max_bytes */ + ); + + INSERT INTO buffer SELECT * FROM numbers(100); + """) + +def test_default_profile(): + bootstrap(node_default) + # flush the buffer + node_default.query('OPTIMIZE TABLE buffer') + +def test_buffer_profile(): + bootstrap(node_buffer_profile) + with pytest.raises(QueryRuntimeException, match='Too 
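TableFunctionNumbers above now evaluates its argument as a general constant expression, rejects non-numeric types, and converts the result to UInt64, so numbers(1e7) is accepted while a string argument raises ILLEGAL_TYPE_OF_ARGUMENT. A small pure-Python sketch of the same validate-then-convert rule (illustrative semantics only, not the C++ implementation):

    def evaluate_numbers_argument(value):
        # Reject anything that is not a native number (bool is excluded
        # explicitly because it is an int subclass in Python).
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            raise TypeError(f"Illegal type {type(value).__name__}, must be numeric type")
        converted = int(value)
        # Refuse lossy or out-of-range conversions instead of truncating.
        if converted != value or not 0 <= converted < 2 ** 64:
            raise ValueError(f"The value {value!r} is not representable as UInt64")
        return converted

    print(evaluate_numbers_argument(1e7))        # 10000000
    print(evaluate_numbers_argument(8192 * 100))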
many partitions for single INSERT block'): + # flush the buffer + node_buffer_profile.query('OPTIMIZE TABLE buffer') diff --git a/tests/integration/test_distributed_ddl/cluster.py b/tests/integration/test_distributed_ddl/cluster.py index 811eb94bad4..24f11fec547 100644 --- a/tests/integration/test_distributed_ddl/cluster.py +++ b/tests/integration/test_distributed_ddl/cluster.py @@ -10,8 +10,8 @@ from helpers.test_tools import TSV class ClickHouseClusterWithDDLHelpers(ClickHouseCluster): - def __init__(self, base_path, config_dir): - ClickHouseCluster.__init__(self, base_path) + def __init__(self, base_path, config_dir, testcase_name): + ClickHouseCluster.__init__(self, base_path, name=testcase_name) self.test_config_dir = config_dir @@ -104,8 +104,8 @@ class ClickHouseClusterWithDDLHelpers(ClickHouseCluster): def ddl_check_there_are_no_dublicates(instance): query = "SELECT max(c), argMax(q, c) FROM (SELECT lower(query) AS q, count() AS c FROM system.query_log WHERE type=2 AND q LIKE '/* ddl_entry=query-%' GROUP BY query)" rows = instance.query(query) - assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}, query {}".format(instance.name, - instance.ip_address, query) + assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}: {}".format(instance.name, + instance.ip_address, rows) @staticmethod def insert_reliable(instance, query_insert): diff --git a/tests/integration/test_distributed_ddl/test.py b/tests/integration/test_distributed_ddl/test.py index f0e78dfec41..58e1d0d06f7 100755 --- a/tests/integration/test_distributed_ddl/test.py +++ b/tests/integration/test_distributed_ddl/test.py @@ -14,7 +14,7 @@ from .cluster import ClickHouseClusterWithDDLHelpers @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def test_cluster(request): - cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param) + cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param, request.param) try: cluster.prepare() diff --git a/tests/integration/test_distributed_ddl/test_replicated_alter.py b/tests/integration/test_distributed_ddl/test_replicated_alter.py index bd95f5660b7..148ad5fca5e 100644 --- a/tests/integration/test_distributed_ddl/test_replicated_alter.py +++ b/tests/integration/test_distributed_ddl/test_replicated_alter.py @@ -12,7 +12,7 @@ from .cluster import ClickHouseClusterWithDDLHelpers @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def test_cluster(request): - cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param) + cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param, "alters_" + request.param) try: # TODO: Fix ON CLUSTER alters when nodes have different configs. Need to canonicalize node identity. 
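The ClickHouseClusterWithDDLHelpers change above passes a per-testcase name down to ClickHouseCluster so the parametrized "configs" and "configs_secure" runs no longer collide on shared Docker state. The same pattern as a standalone fixture, assuming the helpers module from the integration-test harness used throughout this patch:

    import pytest

    from helpers.cluster import ClickHouseCluster  # integration-test harness

    @pytest.fixture(scope="module", params=["configs", "configs_secure"])
    def test_cluster(request):
        # A unique name per parameter keeps the two clusters isolated.
        cluster = ClickHouseCluster(__file__, name="demo_" + request.param)
        try:
            cluster.start()
            yield cluster
        finally:
            cluster.shutdown()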
diff --git a/tests/integration/test_insert_distributed_async_extra_dirs/__init__.py b/tests/integration/test_insert_distributed_async_extra_dirs/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_insert_distributed_async_extra_dirs/configs/remote_servers.xml b/tests/integration/test_insert_distributed_async_extra_dirs/configs/remote_servers.xml new file mode 100644 index 00000000000..1df72377ce6 --- /dev/null +++ b/tests/integration/test_insert_distributed_async_extra_dirs/configs/remote_servers.xml @@ -0,0 +1,13 @@ + + + + + + node + 9000 + + + + + + diff --git a/tests/integration/test_insert_distributed_async_extra_dirs/test.py b/tests/integration/test_insert_distributed_async_extra_dirs/test.py new file mode 100644 index 00000000000..8365fce298d --- /dev/null +++ b/tests/integration/test_insert_distributed_async_extra_dirs/test.py @@ -0,0 +1,43 @@ +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name +# pylint: disable=line-too-long + +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance('node', main_configs=['configs/remote_servers.xml'], stay_alive=True) + +@pytest.fixture(scope='module', autouse=True) +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def test_insert_distributed_async_send_success(): + node.query('CREATE TABLE data (key Int, value String) Engine=Null()') + node.query(""" + CREATE TABLE dist AS data + Engine=Distributed( + test_cluster, + currentDatabase(), + data, + key + ) + """) + + node.exec_in_container(['bash', '-c', 'mkdir /var/lib/clickhouse/data/default/dist/shard10000_replica10000']) + node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/data/default/dist/shard10000_replica10000/1.bin']) + + node.exec_in_container(['bash', '-c', 'mkdir /var/lib/clickhouse/data/default/dist/shard1_replica10000']) + node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/data/default/dist/shard1_replica10000/1.bin']) + + node.exec_in_container(['bash', '-c', 'mkdir /var/lib/clickhouse/data/default/dist/shard10000_replica1']) + node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/data/default/dist/shard10000_replica1/1.bin']) + + # will check that clickhouse-server is alive + node.restart_clickhouse() diff --git a/tests/integration/test_quota/configs/users.d/assign_myquota.xml b/tests/integration/test_quota/configs/users.d/assign_myquota_to_default_user.xml similarity index 100% rename from tests/integration/test_quota/configs/users.d/assign_myquota.xml rename to tests/integration/test_quota/configs/users.d/assign_myquota_to_default_user.xml diff --git a/tests/integration/test_quota/configs/users.d/quota.xml b/tests/integration/test_quota/configs/users.d/myquota.xml similarity index 100% rename from tests/integration/test_quota/configs/users.d/quota.xml rename to tests/integration/test_quota/configs/users.d/myquota.xml diff --git a/tests/integration/test_quota/configs/users.d/user_with_no_quota.xml b/tests/integration/test_quota/configs/users.d/user_with_no_quota.xml new file mode 100644 index 00000000000..70f51cfff43 --- /dev/null +++ b/tests/integration/test_quota/configs/users.d/user_with_no_quota.xml @@ -0,0 +1,10 @@ + + + + + + ::/0 + + + + diff --git a/tests/integration/test_quota/test.py b/tests/integration/test_quota/test.py index 84454159a58..353d776c0f3 100644 --- a/tests/integration/test_quota/test.py +++ 
b/tests/integration/test_quota/test.py @@ -7,9 +7,10 @@ from helpers.cluster import ClickHouseCluster from helpers.test_tools import assert_eq_with_retry, TSV cluster = ClickHouseCluster(__file__) -instance = cluster.add_instance('instance', user_configs=["configs/users.d/assign_myquota.xml", +instance = cluster.add_instance('instance', user_configs=["configs/users.d/assign_myquota_to_default_user.xml", "configs/users.d/drop_default_quota.xml", - "configs/users.d/quota.xml"]) + "configs/users.d/myquota.xml", + "configs/users.d/user_with_no_quota.xml"]) def check_system_quotas(canonical): @@ -49,9 +50,11 @@ def system_quotas_usage(canonical): def copy_quota_xml(local_file_name, reload_immediately=True): script_dir = os.path.dirname(os.path.realpath(__file__)) instance.copy_file_to_container(os.path.join(script_dir, local_file_name), - '/etc/clickhouse-server/users.d/quota.xml') + '/etc/clickhouse-server/users.d/myquota.xml') if reload_immediately: - instance.query("SYSTEM RELOAD CONFIG") + # We use the special user 'user_with_no_quota' here because + # we don't want SYSTEM RELOAD CONFIG to mess our quota consuming checks. + instance.query("SYSTEM RELOAD CONFIG", user='user_with_no_quota') @pytest.fixture(scope="module", autouse=True) @@ -71,12 +74,12 @@ def started_cluster(): @pytest.fixture(autouse=True) def reset_quotas_and_usage_info(): try: - yield - finally: - copy_quota_xml('simpliest.xml') # To reset usage info. instance.query("DROP QUOTA IF EXISTS qA, qB") copy_quota_xml('simpliest.xml') # To reset usage info. copy_quota_xml('normal_limits.xml') + yield + finally: + pass def test_quota_from_users_xml(): @@ -379,4 +382,11 @@ def test_query_inserts(): instance.query("INSERT INTO test_table values(1)") system_quota_usage( - [["myQuota", "default", 31556952, 1, 1000, 0, 500, 1, 500, 0, "\\N", 0, "\\N", 0, "\\N", 0, 1000, 0, "\\N", "\\N"]]) \ No newline at end of file + [["myQuota", "default", 31556952, 1, 1000, 0, 500, 1, 500, 0, "\\N", 0, "\\N", 0, "\\N", 0, 1000, 0, "\\N", "\\N"]]) + +def test_consumption_show_tables_quota(): + instance.query("SHOW TABLES") + + assert re.match( + "myQuota\\tdefault\\t.*\\t31556952\\t1\\t1000\\t1\\t500\\t0\\t500\\t0\\t\\\\N\\t1\\t\\\\N\\t19\\t\\\\N\\t1\\t1000\\t35\\t\\\\N\\t.*\\t\\\\N\n", + instance.query("SHOW QUOTA")) diff --git a/tests/integration/test_send_crash_reports/test.py b/tests/integration/test_send_crash_reports/test.py index a3c35ca1537..65d49637b13 100644 --- a/tests/integration/test_send_crash_reports/test.py +++ b/tests/integration/test_send_crash_reports/test.py @@ -24,14 +24,17 @@ def started_node(): def test_send_segfault(started_node, ): + if started_node.is_built_with_thread_sanitizer(): + pytest.skip("doesn't fit in timeouts for stacktrace generation") + started_node.copy_file_to_container(os.path.join(SCRIPT_DIR, "fake_sentry_server.py"), "/fake_sentry_server.py") started_node.exec_in_container(["bash", "-c", "python3 /fake_sentry_server.py > /fake_sentry_server.log 2>&1"], detach=True, user="root") - time.sleep(0.5) + time.sleep(1) started_node.exec_in_container(["bash", "-c", "pkill -11 clickhouse"], user="root") result = None for attempt in range(1, 6): - time.sleep(0.25 * attempt) + time.sleep(attempt) result = started_node.exec_in_container(['cat', fake_sentry_server.RESULT_PATH], user='root') if result == 'OK': break diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index 1f2da606cd1..672b637f783 100644 --- a/tests/integration/test_system_merges/test.py +++ 
b/tests/integration/test_system_merges/test.py @@ -134,7 +134,9 @@ def test_mutation_simple(started_cluster, replicated): result_part = "all_{}_{}_0_{}".format(starting_block, starting_block, starting_block + 1) def alter(): - node1.query("ALTER TABLE {name} UPDATE a = 42 WHERE sleep(2) OR 1".format(name=name)) + node1.query("ALTER TABLE {name} UPDATE a = 42 WHERE sleep(2) OR 1".format(name=name), settings={ + 'mutations_sync': 1, + }) t = threading.Thread(target=alter) t.start() @@ -159,8 +161,6 @@ def test_mutation_simple(started_cluster, replicated): ] t.join() - time.sleep(1.5) - assert node_check.query("SELECT * FROM system.merges WHERE table = '{name}'".format(name=table_name)) == "" finally: diff --git a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml index 79e993b41f7..1a441909998 100644 --- a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml +++ b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml @@ -1,7 +1,20 @@ 9181 - 10000 - 30000 + 1 + + + 5000 + 10000 + trace + + + + + 1 + localhost + 44444 + + diff --git a/tests/integration/test_testkeeper_back_to_back/test.py b/tests/integration/test_testkeeper_back_to_back/test.py index d3a9b742cdd..8ec54f1a883 100644 --- a/tests/integration/test_testkeeper_back_to_back/test.py +++ b/tests/integration/test_testkeeper_back_to_back/test.py @@ -25,12 +25,12 @@ def get_fake_zk(): global _fake_zk_instance if not _fake_zk_instance: print("node", cluster.get_instance_ip("node")) - _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181") + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0) def reset_last_zxid_listener(state): print("Fake zk callback called for state", state) global _fake_zk_instance - # reset last_zxid -- fake server doesn't support it - _fake_zk_instance.last_zxid = 0 + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() _fake_zk_instance.add_listener(reset_last_zxid_listener) _fake_zk_instance.start() diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/__init__.py b/tests/integration/test_testkeeper_multinode_blocade_leader/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml new file mode 100644 index 00000000000..4ad76889d1e --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml @@ -0,0 +1,38 @@ + + + 9181 + 1 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml new file mode 100644 index 00000000000..a1954a1e639 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml @@ -0,0 +1,38 @@ + + + 9181 + 2 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + 
true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml new file mode 100644 index 00000000000..88d2358138f --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml @@ -0,0 +1,38 @@ + + + 9181 + 3 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/log_conf.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/log_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/log_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/use_test_keeper.xml new file mode 100644 index 00000000000..b6139005d2f --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/use_test_keeper.xml @@ -0,0 +1,16 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py new file mode 100644 index 00000000000..3b2867ef3c7 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py @@ -0,0 +1,321 @@ +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time +from multiprocessing.dummy import Pool +from helpers.network import PartitionManager + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + +from kazoo.client import KazooClient, KazooState + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def smaller_exception(ex): + return '\n'.join(str(ex).split('\n')[0:2]) + +def wait_node(node): + for _ in range(100): + zk = None + try: + node.query("SELECT * FROM system.zookeeper WHERE path = '/'") + zk = get_fake_zk(node.name, timeout=30.0) + zk.create("/test", sequence=True) + print("node", node.name, "ready") + break + except Exception as ex: + time.sleep(0.2) + print("Waiting until", node.name, "will be ready, exception", ex) + finally: + if zk: + zk.stop() + zk.close() + else: + raise Exception("Can't wait node", node.name, "to become ready") + +def wait_nodes(): + for node in [node1, node2, node3]: + wait_node(node) + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) + def 
reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + + +# in extremely rare case it can take more than 5 minutes in debug build with sanitizer +@pytest.mark.timeout(600) +def test_blocade_leader(started_cluster): + wait_nodes() + for i, node in enumerate([node1, node2, node3]): + node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary") + node.query("CREATE TABLE ordinary.t1 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t1', '{}') ORDER BY tuple()".format(i + 1)) + + node2.query("INSERT INTO ordinary.t1 SELECT number FROM numbers(10)") + + node1.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) + node3.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) + + assert node1.query("SELECT COUNT() FROM ordinary.t1") == "10\n" + assert node2.query("SELECT COUNT() FROM ordinary.t1") == "10\n" + assert node3.query("SELECT COUNT() FROM ordinary.t1") == "10\n" + + with PartitionManager() as pm: + pm.partition_instances(node2, node1) + pm.partition_instances(node3, node1) + + for i in range(100): + try: + node2.query("SYSTEM RESTART REPLICA ordinary.t1") + node2.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + try: + node2.query("ATTACH TABLE ordinary.t1") + except Exception as attach_ex: + print("Got exception node2", smaller_exception(attach_ex)) + print("Got exception node2", smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1)) + assert False, "Cannot insert anything node2" + + for i in range(100): + try: + node3.query("SYSTEM RESTART REPLICA ordinary.t1") + node3.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + try: + node3.query("ATTACH TABLE ordinary.t1") + except Exception as attach_ex: + print("Got exception node3", smaller_exception(attach_ex)) + print("Got exception node3", smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1)) + assert False, "Cannot insert anything node3" + + for n, node in enumerate([node1, node2, node3]): + for i in range(100): + try: + node.query("SYSTEM RESTART REPLICA ordinary.t1") + break + except Exception as ex: + try: + node.query("ATTACH TABLE ordinary.t1") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + else: + assert False, "Cannot reconnect for node{}".format(n + 1) + + for i in range(100): + try: + node1.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + print("Got exception node1", smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1)) + assert False, "Cannot insert anything node1" + + for n, node in enumerate([node1, node2, node3]): + for i in range(100): + try: + node.query("SYSTEM RESTART REPLICA ordinary.t1") + node.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) + break + except Exception as ex: + try: + node.query("ATTACH 
TABLE ordinary.t1") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1)) + assert False, "Cannot sync replica node{}".format(n+1) + + if node1.query("SELECT COUNT() FROM ordinary.t1") != "310\n": + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t1', '/clickhouse/t1/replicas/{}'.format(num + 1)) + + assert node1.query("SELECT COUNT() FROM ordinary.t1") == "310\n" + assert node2.query("SELECT COUNT() FROM ordinary.t1") == "310\n" + assert node3.query("SELECT COUNT() FROM ordinary.t1") == "310\n" + + +def dump_zk(node, zk_path, replica_path): + print(node.query("SELECT * FROM system.replication_queue FORMAT Vertical")) + print("Replicas") + print(node.query("SELECT * FROM system.replicas FORMAT Vertical")) + print("Replica 2 info") + print(node.query("SELECT * FROM system.zookeeper WHERE path = '{}' FORMAT Vertical".format(zk_path))) + print("Queue") + print(node.query("SELECT * FROM system.zookeeper WHERE path = '{}/queue' FORMAT Vertical".format(replica_path))) + print("Log") + print(node.query("SELECT * FROM system.zookeeper WHERE path = '{}/log' FORMAT Vertical".format(zk_path))) + print("Parts") + print(node.query("SELECT name FROM system.zookeeper WHERE path = '{}/parts' FORMAT Vertical".format(replica_path))) + +# in extremely rare case it can take more than 5 minutes in debug build with sanitizer +@pytest.mark.timeout(600) +def test_blocade_leader_twice(started_cluster): + wait_nodes() + for i, node in enumerate([node1, node2, node3]): + node.query("CREATE DATABASE IF NOT EXISTS ordinary ENGINE=Ordinary") + node.query("CREATE TABLE ordinary.t2 (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t2', '{}') ORDER BY tuple()".format(i + 1)) + + node2.query("INSERT INTO ordinary.t2 SELECT number FROM numbers(10)") + + node1.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + + assert node1.query("SELECT COUNT() FROM ordinary.t2") == "10\n" + assert node2.query("SELECT COUNT() FROM ordinary.t2") == "10\n" + assert node3.query("SELECT COUNT() FROM ordinary.t2") == "10\n" + + with PartitionManager() as pm: + pm.partition_instances(node2, node1) + pm.partition_instances(node3, node1) + + for i in range(100): + try: + node2.query("SYSTEM RESTART REPLICA ordinary.t2") + node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + try: + node2.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node2", smaller_exception(attach_ex)) + print("Got exception node2", smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + assert False, "Cannot reconnect for node2" + + for i in range(100): + try: + node3.query("SYSTEM RESTART REPLICA ordinary.t2") + node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + try: + node3.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node3", smaller_exception(attach_ex)) + print("Got exception node3", smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, 
node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + assert False, "Cannot reconnect for node3" + + + # Total network partition + pm.partition_instances(node3, node2) + + for i in range(10): + try: + node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") + assert False, "Node3 became leader?" + except Exception as ex: + time.sleep(0.5) + + for i in range(10): + try: + node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") + assert False, "Node2 became leader?" + except Exception as ex: + time.sleep(0.5) + + + for n, node in enumerate([node1, node2, node3]): + for i in range(100): + try: + node.query("SYSTEM RESTART REPLICA ordinary.t2") + break + except Exception as ex: + try: + node.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + assert False, "Cannot reconnect for node{}".format(n + 1) + + for n, node in enumerate([node1, node2, node3]): + for i in range(100): + try: + node.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") + break + except Exception as ex: + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + assert False, "Cannot reconnect for node{}".format(n + 1) + + for n, node in enumerate([node1, node2, node3]): + for i in range(100): + try: + node.query("SYSTEM RESTART REPLICA ordinary.t2") + node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + break + except Exception as ex: + try: + node.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + else: + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + assert False, "Cannot reconnect for node{}".format(n + 1) + + assert node1.query("SELECT COUNT() FROM ordinary.t2") == "510\n" + if node2.query("SELECT COUNT() FROM ordinary.t2") != "510\n": + for num, node in enumerate([node1, node2, node3]): + dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) + + assert node2.query("SELECT COUNT() FROM ordinary.t2") == "510\n" + assert node3.query("SELECT COUNT() FROM ordinary.t2") == "510\n" diff --git a/tests/integration/test_testkeeper_multinode_simple/__init__.py b/tests/integration/test_testkeeper_multinode_simple/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml new file mode 100644 index 00000000000..4ad76889d1e --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml @@ -0,0 +1,38 @@ + + + 9181 + 1 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 
+ node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml new file mode 100644 index 00000000000..a1954a1e639 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml @@ -0,0 +1,38 @@ + + + 9181 + 2 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml new file mode 100644 index 00000000000..88d2358138f --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml @@ -0,0 +1,38 @@ + + + 9181 + 3 + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/log_conf.xml b/tests/integration/test_testkeeper_multinode_simple/configs/log_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/configs/log_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_multinode_simple/configs/use_test_keeper.xml new file mode 100644 index 00000000000..b6139005d2f --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/configs/use_test_keeper.xml @@ -0,0 +1,16 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_testkeeper_multinode_simple/test.py b/tests/integration/test_testkeeper_multinode_simple/test.py new file mode 100644 index 00000000000..a7ece4bbd56 --- /dev/null +++ b/tests/integration/test_testkeeper_multinode_simple/test.py @@ -0,0 +1,239 @@ +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time +from multiprocessing.dummy import Pool +from helpers.network import PartitionManager + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + +from kazoo.client import KazooClient, KazooState + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def smaller_exception(ex): + return '\n'.join(str(ex).split('\n')[0:2]) + +def wait_node(node): + for _ in range(100): + zk = None + try: + node.query("SELECT * FROM system.zookeeper WHERE path = '/'") + zk = get_fake_zk(node.name, timeout=30.0) + zk.create("/test", sequence=True) + print("node", node.name, "ready") + break + except Exception 
as ex: + time.sleep(0.2) + print("Waiting until", node.name, "will be ready, exception", ex) + finally: + if zk: + zk.stop() + zk.close() + else: + raise Exception("Can't wait node", node.name, "to become ready") + +def wait_nodes(): + for node in [node1, node2, node3]: + wait_node(node) + + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) + def reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + +def test_read_write_multinode(started_cluster): + try: + wait_nodes() + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + node1_zk.create("/test_read_write_multinode_node1", b"somedata1") + node2_zk.create("/test_read_write_multinode_node2", b"somedata2") + node3_zk.create("/test_read_write_multinode_node3", b"somedata3") + + # stale reads are allowed + while node1_zk.exists("/test_read_write_multinode_node2") is None: + time.sleep(0.1) + + while node1_zk.exists("/test_read_write_multinode_node3") is None: + time.sleep(0.1) + + while node2_zk.exists("/test_read_write_multinode_node3") is None: + time.sleep(0.1) + + assert node3_zk.get("/test_read_write_multinode_node1")[0] == b"somedata1" + assert node2_zk.get("/test_read_write_multinode_node1")[0] == b"somedata1" + assert node1_zk.get("/test_read_write_multinode_node1")[0] == b"somedata1" + + assert node3_zk.get("/test_read_write_multinode_node2")[0] == b"somedata2" + assert node2_zk.get("/test_read_write_multinode_node2")[0] == b"somedata2" + assert node1_zk.get("/test_read_write_multinode_node2")[0] == b"somedata2" + + assert node3_zk.get("/test_read_write_multinode_node3")[0] == b"somedata3" + assert node2_zk.get("/test_read_write_multinode_node3")[0] == b"somedata3" + assert node1_zk.get("/test_read_write_multinode_node3")[0] == b"somedata3" + + finally: + try: + for zk_conn in [node1_zk, node2_zk, node3_zk]: + zk_conn.stop() + zk_conn.close() + except: + pass + + +def test_watch_on_follower(started_cluster): + try: + wait_nodes() + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + node1_zk.create("/test_data_watches") + node2_zk.set("/test_data_watches", b"hello") + node3_zk.set("/test_data_watches", b"world") + + node1_data = None + def node1_callback(event): + print("node1 data watch called") + nonlocal node1_data + node1_data = event + + node1_zk.get("/test_data_watches", watch=node1_callback) + + node2_data = None + def node2_callback(event): + print("node2 data watch called") + nonlocal node2_data + node2_data = event + + node2_zk.get("/test_data_watches", watch=node2_callback) + + node3_data = None + def node3_callback(event): + print("node3 data watch called") + nonlocal node3_data + node3_data = event + + node3_zk.get("/test_data_watches", watch=node3_callback) + + node1_zk.set("/test_data_watches", b"somevalue") + time.sleep(3) + + print(node1_data) + print(node2_data) + print(node3_data) + + assert node1_data == node2_data + assert node3_data == node2_data + + finally: + try: + for zk_conn in [node1_zk, node2_zk, node3_zk]: + zk_conn.stop() + zk_conn.close() + except: + pass + + +def test_session_expiration(started_cluster): + try: + wait_nodes() + node1_zk = get_fake_zk("node1") + node2_zk = 
get_fake_zk("node2") + node3_zk = get_fake_zk("node3", timeout=3.0) + print("Node3 session id", node3_zk._session_id) + + node3_zk.create("/test_ephemeral_node", b"world", ephemeral=True) + + with PartitionManager() as pm: + pm.partition_instances(node3, node2) + pm.partition_instances(node3, node1) + node3_zk.stop() + node3_zk.close() + for _ in range(100): + if node1_zk.exists("/test_ephemeral_node") is None and node2_zk.exists("/test_ephemeral_node") is None: + break + print("Node1 exists", node1_zk.exists("/test_ephemeral_node")) + print("Node2 exists", node2_zk.exists("/test_ephemeral_node")) + time.sleep(0.1) + node1_zk.sync("/") + node2_zk.sync("/") + + assert node1_zk.exists("/test_ephemeral_node") is None + assert node2_zk.exists("/test_ephemeral_node") is None + + finally: + try: + for zk_conn in [node1_zk, node2_zk, node3_zk]: + try: + zk_conn.stop() + zk_conn.close() + except: + pass + except: + pass + + +def test_follower_restart(started_cluster): + try: + wait_nodes() + node1_zk = get_fake_zk("node1") + + node1_zk.create("/test_restart_node", b"hello") + + node3.restart_clickhouse(kill=True) + + node3_zk = get_fake_zk("node3") + + # got data from log + assert node3_zk.get("/test_restart_node")[0] == b"hello" + + finally: + try: + for zk_conn in [node1_zk, node3_zk]: + try: + zk_conn.stop() + zk_conn.close() + except: + pass + except: + pass + + +def test_simple_replicated_table(started_cluster): + wait_nodes() + for i, node in enumerate([node1, node2, node3]): + node.query("CREATE TABLE t (value UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/t', '{}') ORDER BY tuple()".format(i + 1)) + + node2.query("INSERT INTO t SELECT number FROM numbers(10)") + + node1.query("SYSTEM SYNC REPLICA t", timeout=10) + node3.query("SYSTEM SYNC REPLICA t", timeout=10) + + assert node1.query("SELECT COUNT() FROM t") == "10\n" + assert node2.query("SELECT COUNT() FROM t") == "10\n" + assert node3.query("SELECT COUNT() FROM t") == "10\n" diff --git a/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml b/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml index c12f26ad595..0c93b4745cf 100644 --- a/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml +++ b/tests/performance/aggregating_merge_tree_simple_aggregate_function_string.xml @@ -6,7 +6,7 @@ SETTINGS index_granularity = 8192 AS SELECT CAST(reinterpretAsString(number), 'SimpleAggregateFunction(any, String)') AS key - FROM numbers_mt(toUInt64(5e6)) + FROM numbers_mt(5e6) SETTINGS max_insert_threads = 16 diff --git a/tests/performance/group_by_fixed_keys.xml b/tests/performance/group_by_fixed_keys.xml new file mode 100644 index 00000000000..0be29ff11ac --- /dev/null +++ b/tests/performance/group_by_fixed_keys.xml @@ -0,0 +1,7 @@ + + WITH toUInt8(number) AS k, toUInt64(k) AS k1, k AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 + WITH toUInt8(number) AS k, toUInt16(k) AS k1, toUInt32(k) AS k2, k AS k3 SELECT k1, k2, k3, count() FROM numbers(100000000) GROUP BY k1, k2, k3 + WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 + WITH toUInt8(number) AS k, k AS k1, k + 1 AS k2, k + 2 AS k3, k + 3 AS k4 SELECT k1, k2, k3, k4, count() FROM numbers(100000000) GROUP BY k1, k2, k3, k4 + WITH toUInt8(number) AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 + diff --git a/tests/performance/window_functions.xml 
b/tests/performance/window_functions.xml index 74df2b64a3b..622e349d060 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -86,4 +86,28 @@ format Null + + + select + min(number) over w, + count(*) over w, + max(number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o) + format Null + + + + select + first_value(number) over w, + dense_rank() over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o) + format Null + + diff --git a/tests/queries/0_stateless/00597_push_down_predicate.reference b/tests/queries/0_stateless/00597_push_down_predicate.reference index cea533d6ccb..794d9e7af5f 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate.reference @@ -115,6 +115,7 @@ FROM SELECT 1 AS id, identity(cast(1, \'UInt8\')) AS subquery + WHERE subquery = 1 ) WHERE subquery = 1 1 1 diff --git a/tests/queries/0_stateless/00738_lock_for_inner_table.sh b/tests/queries/0_stateless/00738_lock_for_inner_table.sh index 9540d566ac3..d19288f65d8 100755 --- a/tests/queries/0_stateless/00738_lock_for_inner_table.sh +++ b/tests/queries/0_stateless/00738_lock_for_inner_table.sh @@ -5,21 +5,37 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -echo "DROP TABLE IF EXISTS tab_00738; -DROP TABLE IF EXISTS mv; -CREATE TABLE tab_00738(a Int) ENGINE = Log; -CREATE MATERIALIZED VIEW mv UUID '00000738-1000-4000-8000-000000000001' ENGINE = Log AS SELECT a FROM tab_00738;" | ${CLICKHOUSE_CLIENT} -n +# there are some issues with Atomic database, let's generate it uniq +# otherwise flaky check will not pass. +uuid=$(${CLICKHOUSE_CLIENT} --query "SELECT reinterpretAsUUID(currentDatabase())") -${CLICKHOUSE_CLIENT} --query_id test_00738 --query "INSERT INTO tab_00738 SELECT number FROM numbers(10000000)" & +echo "DROP TABLE IF EXISTS tab_00738 SYNC; +DROP TABLE IF EXISTS mv SYNC; +-- create table with fsync and 20 partitions for slower INSERT +-- (since increasing number of records will make it significantly slower in debug build, but not in release) +CREATE TABLE tab_00738(a Int) ENGINE = MergeTree() ORDER BY a PARTITION BY a%20 SETTINGS fsync_after_insert=1; +CREATE MATERIALIZED VIEW mv UUID '$uuid' ENGINE = Log AS SELECT a FROM tab_00738;" | ${CLICKHOUSE_CLIENT} -n + +${CLICKHOUSE_CLIENT} --query_id insert_$CLICKHOUSE_DATABASE --query "INSERT INTO tab_00738 SELECT number FROM numbers(10000000)" & function drop() { - ${CLICKHOUSE_CLIENT} --query "DROP TABLE \`.inner_id.00000738-1000-4000-8000-000000000001\`" -n + ${CLICKHOUSE_CLIENT} --query "DROP TABLE \`.inner_id.$uuid\`" -n } function wait_for_query_to_start() { - while [[ $(${CLICKHOUSE_CLIENT} --query "SELECT count() FROM system.processes WHERE query_id = 'test_00738'") == 0 ]]; do sleep 0.001; done + while [[ $(${CLICKHOUSE_CLIENT} --query "SELECT count() FROM system.processes WHERE query_id = 'insert_$CLICKHOUSE_DATABASE'") == 0 ]]; do sleep 0.001; done + + # The query is already started, but there is no guarantee that it locks the underlying table already. + # Wait until PushingToViewsBlockOutputStream will acquire the lock of the underlying table for the INSERT query. 
+    # (assume that 0.5 seconds is enough for this, but this is not 100% reliable)
+    sleep 0.5
+
+    # if the query has already finished, fail
+    if [[ $(${CLICKHOUSE_CLIENT} --query "SELECT count() FROM system.processes WHERE query_id = 'insert_$CLICKHOUSE_DATABASE'") == 0 ]]; then
+        exit 2
+    fi
 }

 export -f wait_for_query_to_start
diff --git a/tests/queries/0_stateless/00826_cross_to_inner_join.reference b/tests/queries/0_stateless/00826_cross_to_inner_join.reference
index e7c8d6b1ea9..84867de2849 100644
--- a/tests/queries/0_stateless/00826_cross_to_inner_join.reference
+++ b/tests/queries/0_stateless/00826_cross_to_inner_join.reference
@@ -95,7 +95,7 @@ SELECT
     t2_00826.a,
     t2_00826.b
 FROM t1_00826
-ALL INNER JOIN t2_00826 ON (a = t2_00826.a) AND (a = t2_00826.a) AND (a = t2_00826.a) AND (b = t2_00826.b)
+ALL INNER JOIN t2_00826 ON (((a = t2_00826.a) AND (a = t2_00826.a)) AND (a = t2_00826.a)) AND (b = t2_00826.b)
 WHERE (a = t2_00826.a) AND ((a = t2_00826.a) AND ((a = t2_00826.a) AND (b = t2_00826.b)))
 cross split conjunction
 SELECT
diff --git a/tests/queries/0_stateless/00849_multiple_comma_join_2.reference b/tests/queries/0_stateless/00849_multiple_comma_join_2.reference
index fc39ef13935..4db65b0b795 100644
--- a/tests/queries/0_stateless/00849_multiple_comma_join_2.reference
+++ b/tests/queries/0_stateless/00849_multiple_comma_join_2.reference
@@ -127,7 +127,7 @@ FROM
 ) AS `--.s`
 CROSS JOIN t3
 ) AS `--.s`
-ALL INNER JOIN t4 ON (a = `--t1.a`) AND (a = `--t2.a`) AND (a = `--t3.a`)
+ALL INNER JOIN t4 ON ((a = `--t1.a`) AND (a = `--t2.a`)) AND (a = `--t3.a`)
 WHERE (a = `--t1.a`) AND (a = `--t2.a`) AND (a = `--t3.a`)
 SELECT `--t1.a` AS `t1.a`
 FROM
diff --git a/tests/queries/0_stateless/00878_join_unexpected_results.reference b/tests/queries/0_stateless/00878_join_unexpected_results.reference
index a389cb47a96..65fcbc257ca 100644
--- a/tests/queries/0_stateless/00878_join_unexpected_results.reference
+++ b/tests/queries/0_stateless/00878_join_unexpected_results.reference
@@ -23,6 +23,8 @@ join_use_nulls = 1
 -
 \N \N
 -
+1 1 \N \N
+2 2 \N \N
 -
 1 1 1 1
 2 2 \N \N
@@ -49,6 +51,8 @@ join_use_nulls = 0
 -
 -
 -
+1 1 0 0
+2 2 0 0
 -
 1 1 1 1
 2 2 0 0
diff --git a/tests/queries/0_stateless/00878_join_unexpected_results.sql b/tests/queries/0_stateless/00878_join_unexpected_results.sql
index 0aef5208b26..6f6cd6e6479 100644
--- a/tests/queries/0_stateless/00878_join_unexpected_results.sql
+++ b/tests/queries/0_stateless/00878_join_unexpected_results.sql
@@ -30,11 +30,11 @@ select * from t left outer join s on (t.a=s.a and t.b=s.b) where s.a is null;
 select '-';
 select s.* from t left outer join s on (t.a=s.a and t.b=s.b) where s.a is null;
 select '-';
-select t.*, s.* from t left join s on (s.a=t.a and t.b=s.b and t.a=toInt64(2)) order by t.a; -- {serverError 403 }
+select t.*, s.* from t left join s on (s.a=t.a and t.b=s.b and t.a=toInt64(2)) order by t.a;
 select '-';
 select t.*, s.* from t left join s on (s.a=t.a) order by t.a;
 select '-';
-select t.*, s.* from t left join s on (t.b=toInt64(2) and s.a=t.a) where s.b=2; -- {serverError 403 }
+select t.*, s.* from t left join s on (t.b=toInt64(2) and s.a=t.a) where s.b=2;

 select 'join_use_nulls = 0';
 set join_use_nulls = 0;
@@ -58,11 +58,11 @@ select '-';
 select '-';
 -- select s.* from t left outer join s on (t.a=s.a and t.b=s.b) where s.a is null; -- TODO
 select '-';
-select t.*, s.* from t left join s on (s.a=t.a and t.b=s.b and t.a=toInt64(2)) order by t.a; -- {serverError 403 }
+select t.*, s.* from t left join s on (s.a=t.a and t.b=s.b and t.a=toInt64(2)) order by t.a;
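+-- (Before this change the statements above were rejected with serverError 403,
+-- INVALID_JOIN_ON_EXPRESSION. A minimal sketch of the new behaviour for the
+-- INNER JOIN case, modelled on the 01653_move_conditions_from_join_on_to_where
+-- test added below: a filter conjunct of the ON section is moved into WHERE,
+-- so that, e.g.,
+--   EXPLAIN SYNTAX select * from t inner join s on (s.a = t.a) and (s.b = toInt64(2));
+-- should print an ALL INNER JOIN ON s.a = t.a followed by WHERE s.b = toInt64(2).)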
select '-'; select t.*, s.* from t left join s on (s.a=t.a) order by t.a; select '-'; -select t.*, s.* from t left join s on (t.b=toInt64(2) and s.a=t.a) where s.b=2; -- {serverError 403 } +select t.*, s.* from t left join s on (t.b=toInt64(2) and s.a=t.a) where s.b=2; drop table t; drop table s; diff --git a/tests/queries/0_stateless/01016_uniqCombined64.sql b/tests/queries/0_stateless/01016_uniqCombined64.sql index 4720b53d15e..acf8135760a 100644 --- a/tests/queries/0_stateless/01016_uniqCombined64.sql +++ b/tests/queries/0_stateless/01016_uniqCombined64.sql @@ -5,5 +5,5 @@ -- test is just to ensure that the result is different (and to document the -- outcome). -SELECT uniqCombined(number) FROM numbers(toUInt64(1e7)); -SELECT uniqCombined64(number) FROM numbers(toUInt64(1e7)); +SELECT uniqCombined(number) FROM numbers(1e7); +SELECT uniqCombined64(number) FROM numbers(1e7); diff --git a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql index bfcfec2b8ba..2ad1edae733 100644 --- a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql +++ b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql @@ -5,45 +5,45 @@ -- HashTable for UInt32 (used until (1<<13) elements), hence 8192 elements SELECT 'UInt32'; SET max_memory_usage = 4000000; -SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(8192 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 9830400; -SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(number % 8192) u FROM numbers(8192 * 100) GROUP BY k); -- HashTable for UInt64 (used until (1<<12) elements), hence 4096 elements SELECT 'UInt64'; SET max_memory_usage = 4000000; -SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(4096 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 9830400; -SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(reinterpretAsString(number % 4096)) u FROM numbers(4096 * 100) GROUP BY k); SELECT 'K=16'; -- HashTable for UInt32 (used until (1<<12) elements), hence 4096 elements SELECT 'UInt32'; SET max_memory_usage = 2000000; -SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(4096 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 4915200; -SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(toUInt64(4096 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 4096) AS k, uniqCombined(16)(number % 4096) u FROM numbers(4096 * 100) GROUP BY k); -- HashTable for UInt64 (used until (1<<11) elements), hence 2048 elements 
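+-- Taken together, the thresholds in this file suggest that the switch from the
+-- hash table to the HyperLogLog happens near (1 << (K - 4)) elements for 4-byte
+-- values and (1 << (K - 5)) elements for 8-byte values, with K = 17 by default.
+-- An illustrative probe at the K=16 boundary (a sketch, not part of the test):
+-- SELECT uniqCombined(16)(number) FROM numbers(4096); -- still exact, hash table mode
+-- SELECT uniqCombined(16)(number) FROM numbers(8192); -- already estimated by the HLL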
SELECT 'UInt64'; SET max_memory_usage = 2000000; -SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(toUInt64(2048 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(2048 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 4915200; -SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(toUInt64(2048 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 2048) AS k, uniqCombined(16)(reinterpretAsString(number % 2048)) u FROM numbers(2048 * 100) GROUP BY k); SELECT 'K=18'; -- HashTable for UInt32 (used until (1<<14) elements), hence 16384 elements SELECT 'UInt32'; SET max_memory_usage = 8000000; -SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(toUInt64(16384 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(16384 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 19660800; -SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(toUInt64(16384 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 16384) AS k, uniqCombined(18)(number % 16384) u FROM numbers(16384 * 100) GROUP BY k); -- HashTable for UInt64 (used until (1<<13) elements), hence 8192 elements SELECT 'UInt64'; SET max_memory_usage = 8000000; -SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); -- { serverError 241 } +SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(8192 * 100) GROUP BY k); -- { serverError 241 } SET max_memory_usage = 19660800; -SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(toUInt64(8192 * 100)) GROUP BY k); +SELECT sum(u) FROM (SELECT intDiv(number, 8192) AS k, uniqCombined(18)(reinterpretAsString(number % 8192)) u FROM numbers(8192 * 100) GROUP BY k); diff --git a/tests/queries/0_stateless/01029_early_constant_folding.reference b/tests/queries/0_stateless/01029_early_constant_folding.reference index 7e2f6c7ce76..8a1d4cec388 100644 --- a/tests/queries/0_stateless/01029_early_constant_folding.reference +++ b/tests/queries/0_stateless/01029_early_constant_folding.reference @@ -2,7 +2,7 @@ SELECT 1 WHERE 0 SELECT 1 SELECT 1 -WHERE 0 +WHERE (1 IN (0, 2)) AND (2 = (identity(cast(2, \'UInt8\')) AS subquery)) SELECT 1 WHERE 1 IN ( ( diff --git a/tests/queries/0_stateless/01029_early_constant_folding.sql b/tests/queries/0_stateless/01029_early_constant_folding.sql index 428c3625295..6336b62e080 100644 --- a/tests/queries/0_stateless/01029_early_constant_folding.sql +++ b/tests/queries/0_stateless/01029_early_constant_folding.sql @@ -4,7 +4,7 @@ EXPLAIN SYNTAX SELECT 1 WHERE 1 = 0; EXPLAIN SYNTAX SELECT 1 WHERE 1 IN (0, 1, 2); -EXPLAIN SYNTAX SELECT 1 WHERE 1 IN (0, 2) AND 2 = (SELECT 2); +EXPLAIN SYNTAX SELECT 1 WHERE 1 IN (0, 2) AND 2 = ((SELECT 2) AS subquery); -- no constant folding diff --git a/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.reference b/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.reference index 1a9e5685a6a..71be9c3fb5b 100644 --- 
a/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.reference +++ b/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.reference @@ -1,3 +1,4 @@ 1 2019-01-05 2020-01-10 1 +1 date_table somedict diff --git a/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.sql b/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.sql index 6ad76ee5a7e..471fd7959a9 100644 --- a/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.sql +++ b/tests/queries/0_stateless/01125_dict_ddl_cannot_add_column.sql @@ -29,6 +29,9 @@ LIFETIME(MIN 300 MAX 360); SELECT * from somedict; +-- No dictionary columns +SELECT 1 FROM somedict; + SHOW TABLES; -DROP DATABASE IF EXISTS database_for_dict; +DROP DATABASE database_for_dict; diff --git a/tests/queries/0_stateless/01177_group_array_moving.reference b/tests/queries/0_stateless/01177_group_array_moving.reference new file mode 100644 index 00000000000..d74c84bb94f --- /dev/null +++ b/tests/queries/0_stateless/01177_group_array_moving.reference @@ -0,0 +1,2 @@ +[-9223372036854775808,0,-9223372036854775808,0,-9223372036854775808,0] [18446744073709551615,18446744073709551614,18446744073709551613,18446744073709551612,18446744073709551611,18446744073709551610] [0,9223372036854775807,9223372036854775805,9223372036854775805,18446744073709551612,18446744073709551610] +[-35888607147294850,-71777214294589700,-107665821441884540,-143554428589179400,-179443035736474240,-215331642883769100] [17592202821648,35184405643296,52776608464944,70368811286592,87961014108240,105553216929888] [0,1,3,3,4,6] diff --git a/tests/queries/0_stateless/01177_group_array_moving.sql b/tests/queries/0_stateless/01177_group_array_moving.sql new file mode 100644 index 00000000000..b1969e204fc --- /dev/null +++ b/tests/queries/0_stateless/01177_group_array_moving.sql @@ -0,0 +1,2 @@ +SELECT groupArrayMovingSum(257)(-9223372036854775808), groupArrayMovingSum(1048575)(18446744073709551615), groupArrayMovingSum(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); +SELECT groupArrayMovingAvg(257)(-9223372036854775808), groupArrayMovingAvg(1048575)(18446744073709551615), groupArrayMovingAvg(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); diff --git a/tests/queries/0_stateless/01178_int_field_to_decimal.reference b/tests/queries/0_stateless/01178_int_field_to_decimal.reference new file mode 100644 index 00000000000..6c256ba2032 --- /dev/null +++ b/tests/queries/0_stateless/01178_int_field_to_decimal.reference @@ -0,0 +1,2 @@ +9.00000000 +10.00000000 diff --git a/tests/queries/0_stateless/01178_int_field_to_decimal.sql b/tests/queries/0_stateless/01178_int_field_to_decimal.sql new file mode 100644 index 00000000000..bbd72e57d70 --- /dev/null +++ b/tests/queries/0_stateless/01178_int_field_to_decimal.sql @@ -0,0 +1,10 @@ +select d from values('d Decimal(8, 8)', 0, 1) where d not in (-1, 0); -- { serverError 69 } +select d from values('d Decimal(8, 8)', 0, 2) where d not in (1, 0); -- { serverError 69 } +select d from values('d Decimal(9, 8)', 0, 3) where d not in (-9223372036854775808, 0); -- { serverError 69 } +select d from values('d Decimal(9, 8)', 0, 4) where d not in (18446744073709551615, 0); -- { serverError 69 } +select d from values('d Decimal(18, 8)', 0, 5) where d not in (-9223372036854775808, 0); -- { serverError 69 } +select d from values('d Decimal(18, 8)', 0, 6) where d not in (18446744073709551615, 0); -- { serverError 69 } +select d from values('d Decimal(26, 8)', 0, 7) where d 
not in (-9223372036854775808, 0); -- { serverError 69 } +select d from values('d Decimal(27, 8)', 0, 8) where d not in (18446744073709551615, 0); -- { serverError 69 } +select d from values('d Decimal(27, 8)', 0, 9) where d not in (-9223372036854775808, 0); +select d from values('d Decimal(28, 8)', 0, 10) where d not in (18446744073709551615, 0); diff --git a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh index 285e2ab8dad..9909d9b566d 100755 --- a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh +++ b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh @@ -33,7 +33,7 @@ function execute_group_by() "--max_memory_usage_for_user="$((150<<20)) "--max_threads=2" ) - execute_null "${opts[@]}" <<<'SELECT uniq(number) FROM numbers_mt(toUInt64(1e6)) GROUP BY number % 5e5' + execute_null "${opts[@]}" <<<'SELECT uniq(number) FROM numbers_mt(1e6) GROUP BY number % 5e5' } # This is needed to keep at least one running query for user for the time of test. diff --git a/tests/queries/0_stateless/01508_partition_pruning.reference b/tests/queries/0_stateless/01508_partition_pruning.reference deleted file mode 100644 index 0cc40d23b41..00000000000 --- a/tests/queries/0_stateless/01508_partition_pruning.reference +++ /dev/null @@ -1,244 +0,0 @@ ---------- tMM ---------------------------- -select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-09-15'); -0 0 -Selected 0 parts by partition key, 0 parts by primary key, 0 marks by primary key, 0 marks to read from 0 ranges - -select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-09-01'); -2 2880 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-10-15'); -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toDate(d)='2020-09-15'; -0 0 -Selected 0 parts by partition key, 0 parts by primary key, 0 marks by primary key, 0 marks to read from 0 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d)=202009; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20200816; -2 2880 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20201015; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toDate(d)='2020-10-15'; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where d >= '2020-09-01 00:00:00' and d<'2020-10-15 00:00:00'; -3 15000 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from tMM where d >= '2020-01-16 00:00:00' and d < toDateTime('2021-08-17 00:00:00'); -6 30000 -Selected 6 parts by partition key, 6 parts by primary key, 6 marks by primary key, 6 marks to read from 6 ranges - -select uniqExact(_part), count() from tMM where d >= '2020-09-16 00:00:00' and d 
< toDateTime('2020-10-01 00:00:00'); -0 0 -Selected 0 parts by partition key, 0 parts by primary key, 0 marks by primary key, 0 marks to read from 0 ranges - -select uniqExact(_part), count() from tMM where d >= '2020-09-12 00:00:00' and d < '2020-10-16 00:00:00'; -2 6440 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-12 00:00:00'; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-09-01 00:00:00'; -2 2880 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-10-01 00:00:00'; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-15 00:00:00' and d < '2020-10-16 00:00:00'; -2 6440 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; -4 20000 -Selected 4 parts by partition key, 4 parts by primary key, 4 marks by primary key, 4 marks to read from 4 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202009; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010 and toStartOfDay(d) = '2020-10-01 00:00:00'; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) >= 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; -3 11440 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; -3 11440 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010 and toStartOfDay(d) < '2020-10-02 00:00:00'; -1 1440 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; -3 9999 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-15'; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), 
count() from tMM where toStartOfMonth(d) >= '2020-09-01'; -4 20000 -Selected 4 parts by partition key, 4 parts by primary key, 4 marks by primary key, 4 marks to read from 4 ranges - -select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-01' and toStartOfMonth(d) < '2020-10-01'; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; -2 9999 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010; -1 10000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; -2 20000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - ---------- tDD ---------------------------- -select uniqExact(_part), count() from tDD where toDate(d)=toDate('2020-09-24'); -1 10000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() FROM tDD WHERE toDate(d) = toDate('2020-09-24'); -1 10000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() FROM tDD WHERE toDate(d) = '2020-09-24'; -1 10000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() FROM tDD WHERE toDate(d) >= '2020-09-23' and toDate(d) <= '2020-09-26'; -3 40000 -Selected 3 parts by partition key, 3 parts by primary key, 4 marks by primary key, 4 marks to read from 3 ranges - -select uniqExact(_part), count() FROM tDD WHERE toYYYYMMDD(d) >= 20200923 and toDate(d) <= '2020-09-26'; -3 40000 -Selected 3 parts by partition key, 3 parts by primary key, 4 marks by primary key, 4 marks to read from 3 ranges - ---------- sDD ---------------------------- -select uniqExact(_part), count() from sDD; -6 30000 -Selected 6 parts by partition key, 6 parts by primary key, 6 marks by primary key, 6 marks to read from 6 ranges - -select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1)+1 = 202010; -3 9999 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202010; -2 9999 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202110; -0 0 -Selected 0 parts by partition key, 0 parts by primary key, 0 marks by primary key, 0 marks to read from 0 ranges - -select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toStartOfDay(toDateTime(intDiv(d,1000),'UTC')) < toDateTime('2020-10-02 00:00:00','UTC'); -3 11440 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toDateTime(intDiv(d,1000),'UTC') < toDateTime('2020-10-01 
00:00:00','UTC'); -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from sDD where d >= 1598918400000; -4 20000 -Selected 4 parts by partition key, 4 parts by primary key, 4 marks by primary key, 4 marks to read from 4 ranges - -select uniqExact(_part), count() from sDD where d >= 1598918400000 and toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) < 202010; -3 10001 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - ---------- xMM ---------------------------- -select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00'; -3 10001 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00'; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a=1; -1 1 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; -2 5001 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; -1 5000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-11-01 00:00:00' and a = 1; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where a = 1; -3 15000 -Selected 3 parts by partition key, 3 parts by primary key, 3 marks by primary key, 3 marks to read from 3 ranges - -select uniqExact(_part), count() from xMM where a = 66; -0 0 -Selected 0 parts by partition key, 0 parts by primary key, 0 marks by primary key, 0 marks to read from 0 ranges - -select uniqExact(_part), count() from xMM where a <> 66; -6 30000 -Selected 6 parts by partition key, 6 parts by primary key, 6 marks by primary key, 6 marks to read from 6 ranges - -select uniqExact(_part), count() from xMM where a = 2; -2 10000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where a = 1; -2 15000 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; -1 10000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - -select uniqExact(_part), count() from xMM where a <> 66; -5 30000 -Selected 5 parts by partition key, 5 parts by primary key, 5 marks by primary key, 5 marks 
to read from 5 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; -2 5001 -Selected 2 parts by partition key, 2 parts by primary key, 2 marks by primary key, 2 marks to read from 2 ranges - -select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; -1 5000 -Selected 1 parts by partition key, 1 parts by primary key, 1 marks by primary key, 1 marks to read from 1 ranges - diff --git a/tests/queries/0_stateless/01508_partition_pruning.queries b/tests/queries/0_stateless/01508_partition_pruning_long.queries similarity index 100% rename from tests/queries/0_stateless/01508_partition_pruning.queries rename to tests/queries/0_stateless/01508_partition_pruning_long.queries diff --git a/tests/queries/0_stateless/01508_partition_pruning_long.reference b/tests/queries/0_stateless/01508_partition_pruning_long.reference new file mode 100644 index 00000000000..70f529c6058 --- /dev/null +++ b/tests/queries/0_stateless/01508_partition_pruning_long.reference @@ -0,0 +1,244 @@ +--------- tMM ---------------------------- +select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-09-15'); +0 0 +Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primary key, 0 marks to read from 0 ranges + +select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-09-01'); +2 2880 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-10-15'); +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toDate(d)='2020-09-15'; +0 0 +Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primary key, 0 marks to read from 0 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d)=202009; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20200816; +2 2880 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20201015; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toDate(d)='2020-10-15'; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where d >= '2020-09-01 00:00:00' and d<'2020-10-15 00:00:00'; +3 15000 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from tMM where d >= '2020-01-16 00:00:00' and d < toDateTime('2021-08-17 00:00:00'); +6 30000 +Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges + +select uniqExact(_part), count() from tMM where d >= '2020-09-16 00:00:00' and d < toDateTime('2020-10-01 00:00:00'); +0 0 +Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primary key, 0 marks to read from 0 ranges + +select uniqExact(_part), count() from tMM where d >= 
'2020-09-12 00:00:00' and d < '2020-10-16 00:00:00'; +2 6440 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-12 00:00:00'; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-09-01 00:00:00'; +2 2880 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-10-01 00:00:00'; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-15 00:00:00' and d < '2020-10-16 00:00:00'; +2 6440 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; +4 20000 +Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary key, 4 marks to read from 4 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202009; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010 and toStartOfDay(d) = '2020-10-01 00:00:00'; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) >= 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; +3 11440 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; +3 11440 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010 and toStartOfDay(d) < '2020-10-02 00:00:00'; +1 1440 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; +3 9999 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-15'; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-01'; +4 20000 +Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary 
key, 4 marks to read from 4 ranges + +select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-01' and toStartOfMonth(d) < '2020-10-01'; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; +2 9999 +Selected 2/3 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010; +1 10000 +Selected 1/3 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; +2 20000 +Selected 2/3 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +--------- tDD ---------------------------- +select uniqExact(_part), count() from tDD where toDate(d)=toDate('2020-09-24'); +1 10000 +Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() FROM tDD WHERE toDate(d) = toDate('2020-09-24'); +1 10000 +Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() FROM tDD WHERE toDate(d) = '2020-09-24'; +1 10000 +Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() FROM tDD WHERE toDate(d) >= '2020-09-23' and toDate(d) <= '2020-09-26'; +3 40000 +Selected 3/4 parts by partition key, 3 parts by primary key, 4/7 marks by primary key, 4 marks to read from 3 ranges + +select uniqExact(_part), count() FROM tDD WHERE toYYYYMMDD(d) >= 20200923 and toDate(d) <= '2020-09-26'; +3 40000 +Selected 3/4 parts by partition key, 3 parts by primary key, 4/7 marks by primary key, 4 marks to read from 3 ranges + +--------- sDD ---------------------------- +select uniqExact(_part), count() from sDD; +6 30000 +Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges + +select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1)+1 = 202010; +3 9999 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202010; +2 9999 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202110; +0 0 +Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primary key, 0 marks to read from 0 ranges + +select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toStartOfDay(toDateTime(intDiv(d,1000),'UTC')) < toDateTime('2020-10-02 00:00:00','UTC'); +3 11440 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toDateTime(intDiv(d,1000),'UTC') < toDateTime('2020-10-01 00:00:00','UTC'); +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by 
primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from sDD where d >= 1598918400000; +4 20000 +Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary key, 4 marks to read from 4 ranges + +select uniqExact(_part), count() from sDD where d >= 1598918400000 and toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) < 202010; +3 10001 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +--------- xMM ---------------------------- +select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00'; +3 10001 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00'; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a=1; +1 1 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; +2 5001 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; +1 5000 +Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-11-01 00:00:00' and a = 1; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where a = 1; +3 15000 +Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges + +select uniqExact(_part), count() from xMM where a = 66; +0 0 +Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primary key, 0 marks to read from 0 ranges + +select uniqExact(_part), count() from xMM where a <> 66; +6 30000 +Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges + +select uniqExact(_part), count() from xMM where a = 2; +2 10000 +Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where a = 1; +2 15000 +Selected 2/5 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; +1 10000 +Selected 1/5 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + +select uniqExact(_part), count() from xMM where a <> 66; +5 30000 +Selected 5/5 parts by partition key, 5 parts by primary key, 5/10 marks by primary key, 5 marks to read from 5 ranges + +select 
uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; +2 5001 +Selected 2/5 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges + +select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; +1 5000 +Selected 1/5 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges + diff --git a/tests/queries/0_stateless/01508_partition_pruning.sh b/tests/queries/0_stateless/01508_partition_pruning_long.sh similarity index 88% rename from tests/queries/0_stateless/01508_partition_pruning.sh rename to tests/queries/0_stateless/01508_partition_pruning_long.sh index b5ec6388d5c..1b3c524ac77 100755 --- a/tests/queries/0_stateless/01508_partition_pruning.sh +++ b/tests/queries/0_stateless/01508_partition_pruning_long.sh @@ -4,8 +4,8 @@ # Description of test result: # Test the correctness of the partition # pruning -# -# Script executes queries from a file 01508_partition_pruning.queries (1 line = 1 query) +# +# Script executes queries from a file 01508_partition_pruning_long.queries (1 line = 1 query) # Queries are started with 'select' (but NOT with 'SELECT') are executed with log_level=debug #------------------------------------------------------------------------------------------- @@ -18,7 +18,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) #export CURDIR=. -queries="${CURDIR}/01508_partition_pruning.queries" +queries="${CURDIR}/01508_partition_pruning_long.queries" while IFS= read -r sql do [ -z "$sql" ] && continue @@ -30,9 +30,7 @@ do ${CLICKHOUSE_CLIENT} --query "$sql" 2>&1 | grep -oh "Selected .* parts by partition key, *. parts by primary key, .* marks by primary key, .* marks to read from .* ranges.*$" CLICKHOUSE_CLIENT=$(echo ${CLICKHOUSE_CLIENT} | sed 's/--send_logs_level=debug/'"--send_logs_level=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL}"'/g') echo "" - else + else ${CLICKHOUSE_CLIENT} --query "$sql" - fi + fi done < "$queries" - - diff --git a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.reference b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory.sql b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql similarity index 92% rename from tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory.sql rename to tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql index 6aa38a914f7..87c66609421 100644 --- a/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory.sql +++ b/tests/queries/0_stateless/01513_optimize_aggregation_in_order_memory_long.sql @@ -1,7 +1,7 @@ drop table if exists data_01513; create table data_01513 (key String) engine=MergeTree() order by key; -- 10e3 groups, 1e3 keys each -insert into data_01513 select number%10e3 from numbers(toUInt64(2e6)); +insert into data_01513 select number%10e3 from numbers(2e6); -- reduce number of parts to 1 optimize table data_01513 final; diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 46cbaa4e998..d2543f0db75 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -912,3 +912,84 @@ Expression 
((Projection + Before ORDER BY)) Expression ((Before window functions + (Projection + Before ORDER BY))) SettingQuotaAndLimits (Set limits and quota after reading from storage) ReadFromStorage (SystemNumbers) +-- A test case for the sort comparator found by fuzzer. +SELECT + max(number) OVER (ORDER BY number DESC NULLS FIRST), + max(number) OVER (ORDER BY number ASC NULLS FIRST) +FROM numbers(2) +; +1 0 +1 1 +-- some true window functions -- rank and friends +select number, p, o, + count(*) over w, + rank() over w, + dense_rank() over w, + row_number() over w +from (select number, intDiv(number, 5) p, mod(number, 3) o + from numbers(31) order by o, number) t +window w as (partition by p order by o) +order by p, o, number +settings max_block_size = 2; +0 0 0 2 1 1 1 +3 0 0 2 1 1 2 +1 0 1 4 3 2 3 +4 0 1 4 3 2 4 +2 0 2 5 5 3 5 +6 1 0 2 1 1 1 +9 1 0 2 1 1 2 +7 1 1 3 3 2 3 +5 1 2 5 4 3 4 +8 1 2 5 4 3 5 +12 2 0 1 1 1 1 +10 2 1 3 2 2 2 +13 2 1 3 2 2 3 +11 2 2 5 4 3 4 +14 2 2 5 4 3 5 +15 3 0 2 1 1 2 +18 3 0 2 1 1 1 +16 3 1 4 3 2 3 +19 3 1 4 3 2 4 +17 3 2 5 5 3 5 +21 4 0 2 1 1 1 +24 4 0 2 1 1 2 +22 4 1 3 3 2 3 +20 4 2 5 4 3 5 +23 4 2 5 4 3 4 +27 5 0 1 1 1 1 +25 5 1 3 2 2 2 +28 5 1 3 2 2 3 +26 5 2 5 4 3 4 +29 5 2 5 4 3 5 +30 6 0 1 1 1 1 +-- our replacement for lag/lead +select + anyOrNull(number) + over (order by number rows between 1 preceding and 1 preceding), + anyOrNull(number) + over (order by number rows between 1 following and 1 following) +from numbers(5); +\N 1 +0 2 +1 3 +2 4 +3 \N +-- case-insensitive SQL-standard synonyms for any and anyLast +select + number, + fIrSt_VaLue(number) over w, + lAsT_vAlUe(number) over w +from numbers(10) +window w as (order by number range between 1 preceding and 1 following) +order by number +; +0 0 1 +1 0 2 +2 1 3 +3 2 4 +4 3 5 +5 4 6 +6 5 7 +7 6 8 +8 7 9 +9 8 9 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 04fd48bde9f..03bd8371e23 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -308,3 +308,40 @@ from (select number, intDiv(number, 3) p, mod(number, 5) o from numbers(16)) t ; + +-- A test case for the sort comparator found by fuzzer. 
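+-- (With the cumulative default frame, the DESC window sees the value 1 first,
+-- so its running max is 1 on both rows of numbers(2), while the ASC window
+-- yields 0 and then 1; hence the rows "1 0" and "1 1" in the reference.)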
+SELECT + max(number) OVER (ORDER BY number DESC NULLS FIRST), + max(number) OVER (ORDER BY number ASC NULLS FIRST) +FROM numbers(2) +; + +-- some true window functions -- rank and friends +select number, p, o, + count(*) over w, + rank() over w, + dense_rank() over w, + row_number() over w +from (select number, intDiv(number, 5) p, mod(number, 3) o + from numbers(31) order by o, number) t +window w as (partition by p order by o) +order by p, o, number +settings max_block_size = 2; + +-- our replacement for lag/lead +select + anyOrNull(number) + over (order by number rows between 1 preceding and 1 preceding), + anyOrNull(number) + over (order by number rows between 1 following and 1 following) +from numbers(5); + +-- case-insensitive SQL-standard synonyms for any and anyLast +select + number, + fIrSt_VaLue(number) over w, + lAsT_vAlUe(number) over w +from numbers(10) +window w as (order by number range between 1 preceding and 1 following) +order by number +; diff --git a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql index b33b74c918d..5de4210d3f2 100644 --- a/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql +++ b/tests/queries/0_stateless/01600_remerge_sort_lowered_memory_bytes_ratio.sql @@ -10,8 +10,8 @@ set max_block_size=40960; -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 186.25 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging is not useful (memory usage was not lowered by remerge_sort_lowered_memory_bytes_ratio=2.0) -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(toUInt64(3e6)) order by k limit 400e3 format Null; -- { serverError 241 } -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(toUInt64(3e6)) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 format Null; -- { serverError 241 } +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=2. 
format Null; -- { serverError 241 } -- remerge_sort_lowered_memory_bytes_ratio 1.9 is good (need at least 1.91/0.98=1.94) -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 819200 rows) to save memory consumption @@ -26,4 +26,4 @@ select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v -- MergeSortingTransform: Memory usage is lowered from 188.13 MiB to 95.00 MiB -- MergeSortingTransform: Re-merging intermediate ORDER BY data (20 blocks with 809600 rows) to save memory consumption -- MergeSortingTransform: Memory usage is lowered from 188.13 MiB to 95.00 MiB -select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(toUInt64(3e6)) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=1.9 format Null; +select number k, repeat(toString(number), 11) v1, repeat(toString(number), 12) v2 from numbers(3e6) order by k limit 400e3 settings remerge_sort_lowered_memory_bytes_ratio=1.9 format Null; diff --git a/tests/queries/0_stateless/01602_runningConcurrency.reference b/tests/queries/0_stateless/01602_runningConcurrency.reference new file mode 100644 index 00000000000..1bd238ccde8 --- /dev/null +++ b/tests/queries/0_stateless/01602_runningConcurrency.reference @@ -0,0 +1,19 @@ +Invocation with Date columns +1 +2 +3 +2 +1 +Invocation with DateTime +1 +2 +3 +2 +1 +Invocation with DateTime64 +1 +2 +3 +2 +1 +Erroneous cases diff --git a/tests/queries/0_stateless/01602_runningConcurrency.sql b/tests/queries/0_stateless/01602_runningConcurrency.sql new file mode 100644 index 00000000000..55b3aae867a --- /dev/null +++ b/tests/queries/0_stateless/01602_runningConcurrency.sql @@ -0,0 +1,51 @@ +-- +SELECT 'Invocation with Date columns'; + +DROP TABLE IF EXISTS runningConcurrency_test; +CREATE TABLE runningConcurrency_test(begin Date, end Date) ENGINE = Memory; + +INSERT INTO runningConcurrency_test VALUES ('2020-12-01', '2020-12-10'), ('2020-12-02', '2020-12-10'), ('2020-12-03', '2020-12-12'), ('2020-12-10', '2020-12-12'), ('2020-12-13', '2020-12-20'); +SELECT runningConcurrency(begin, end) FROM runningConcurrency_test; + +DROP TABLE runningConcurrency_test; + +-- +SELECT 'Invocation with DateTime'; + +DROP TABLE IF EXISTS runningConcurrency_test; +CREATE TABLE runningConcurrency_test(begin DateTime, end DateTime) ENGINE = Memory; + +INSERT INTO runningConcurrency_test VALUES ('2020-12-01 00:00:00', '2020-12-01 00:59:59'), ('2020-12-01 00:30:00', '2020-12-01 00:59:59'), ('2020-12-01 00:40:00', '2020-12-01 01:30:30'), ('2020-12-01 01:10:00', '2020-12-01 01:30:30'), ('2020-12-01 01:50:00', '2020-12-01 01:59:59'); +SELECT runningConcurrency(begin, end) FROM runningConcurrency_test; + +DROP TABLE runningConcurrency_test; + +-- +SELECT 'Invocation with DateTime64'; + +DROP TABLE IF EXISTS runningConcurrency_test; +CREATE TABLE runningConcurrency_test(begin DateTime64(3), end DateTime64(3)) ENGINE = Memory; + +INSERT INTO runningConcurrency_test VALUES ('2020-12-01 00:00:00.000', '2020-12-01 00:00:00.100'), ('2020-12-01 00:00:00.010', '2020-12-01 00:00:00.100'), ('2020-12-01 00:00:00.020', '2020-12-01 00:00:00.200'), ('2020-12-01 00:00:00.150', '2020-12-01 00:00:00.200'), ('2020-12-01 00:00:00.250', '2020-12-01 00:00:00.300'); +SELECT runningConcurrency(begin, end) FROM runningConcurrency_test; + +DROP TABLE runningConcurrency_test; + +-- +SELECT 'Erroneous cases'; + +-- Constant columns are currently not supported. 
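+-- (In the case below, arrayJoin turns only the first argument into a full
+-- column; the second argument remains constant and is rejected with
+-- serverError 44, i.e. ILLEGAL_COLUMN.)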
+SELECT runningConcurrency(toDate(arrayJoin([1, 2])), toDate('2000-01-01')); -- { serverError 44 } + +-- Unsupported data types +SELECT runningConcurrency('strings are', 'not supported'); -- { serverError 43 } +SELECT runningConcurrency(NULL, NULL); -- { serverError 43 } +SELECT runningConcurrency(CAST(NULL, 'Nullable(DateTime)'), CAST(NULL, 'Nullable(DateTime)')); -- { serverError 43 } + +-- Mismatching data types +SELECT runningConcurrency(toDate('2000-01-01'), toDateTime('2000-01-01 00:00:00')); -- { serverError 43 } + +-- begin > end +SELECT runningConcurrency(toDate('2000-01-02'), toDate('2000-01-01')); -- { serverError 117 } + + diff --git a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference index ac91b53b754..d10502c5860 100644 --- a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference +++ b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference @@ -7,3 +7,5 @@ EXPLAIN SYNTAX SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUI SELECT identity(cast(0, \'UInt64\')) AS n, toUInt64(10 / n) +SELECT * FROM (WITH (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) as q SELECT * FROM system.one WHERE q > 0); +0 diff --git a/tests/queries/0_stateless/01611_constant_folding_subqueries.sql b/tests/queries/0_stateless/01611_constant_folding_subqueries.sql index abf67a8ed6a..59f057d1ec5 100644 --- a/tests/queries/0_stateless/01611_constant_folding_subqueries.sql +++ b/tests/queries/0_stateless/01611_constant_folding_subqueries.sql @@ -2,3 +2,4 @@ SELECT * FROM (SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n)) FORMAT CSV; SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) FORMAT CSV; EXPLAIN SYNTAX SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n); +SELECT * FROM (WITH (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) as q SELECT * FROM system.one WHERE q > 0); diff --git a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize_long.reference b/tests/queries/0_stateless/01641_memory_tracking_insert_optimize_long.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql b/tests/queries/0_stateless/01641_memory_tracking_insert_optimize_long.sql similarity index 96% rename from tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql rename to tests/queries/0_stateless/01641_memory_tracking_insert_optimize_long.sql index f059da20755..7a92f40b3f0 100644 --- a/tests/queries/0_stateless/01641_memory_tracking_insert_optimize.sql +++ b/tests/queries/0_stateless/01641_memory_tracking_insert_optimize_long.sql @@ -5,7 +5,7 @@ create table data_01641 (key Int, value String) engine=MergeTree order by (key, -- peak memory usage is 170MiB set max_memory_usage='200Mi'; system stop merges data_01641; -insert into data_01641 select number, toString(number) from numbers(toUInt64(120e6)); +insert into data_01641 select number, toString(number) from numbers(120e6); -- peak: -- - is 21MiB if background merges already scheduled diff --git a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper.sql b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper.sql index 50596680618..c3e459dfc49 100644 --- a/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper.sql +++ 
b/tests/queries/0_stateless/01650_drop_part_and_deduplication_zookeeper.sql @@ -5,7 +5,7 @@ CREATE TABLE partitioned_table ( partitioner UInt8, value String ) -ENGINE ReplicatedMergeTree('/clickhouse/test/01650_drop_part_and_deduplication/partitioned_table', '1') +ENGINE ReplicatedMergeTree('/clickhouse/01650_drop_part_and_deduplication_partitioned_table', '1') ORDER BY key PARTITION BY partitioner; @@ -16,24 +16,24 @@ INSERT INTO partitioned_table VALUES (11, 1, 'AA'), (22, 2, 'BB'), (33, 3, 'CC') SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; -SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/test/01650_drop_part_and_deduplication/partitioned_table/blocks/' ORDER BY value; +SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- must be deduplicated SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; -SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/test/01650_drop_part_and_deduplication/partitioned_table/blocks/' ORDER BY value; +SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; ALTER TABLE partitioned_table DROP PART '3_1_1_0'; SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; -SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/test/01650_drop_part_and_deduplication/partitioned_table/blocks/' ORDER BY value; +SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; INSERT INTO partitioned_table VALUES (33, 3, 'CC'); -- mustn't be deduplicated SELECT partition_id, name FROM system.parts WHERE table = 'partitioned_table' AND database = currentDatabase() ORDER BY name; -SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/test/01650_drop_part_and_deduplication/partitioned_table/blocks/' ORDER BY value; +SELECT substring(name, 1, 2), value FROM system.zookeeper WHERE path='/clickhouse/01650_drop_part_and_deduplication_partitioned_table/blocks/' ORDER BY value; DROP TABLE IF EXISTS partitioned_table; diff --git a/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.reference b/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.reference new file mode 100644 index 00000000000..19487c9f942 --- /dev/null +++ b/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.reference @@ -0,0 +1,140 @@ +---------Q1---------- +2 2 2 20 +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL INNER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON a = table2.a +WHERE table2.b = toUInt32(20) +---------Q2---------- +2 2 2 20 +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL INNER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON a = table2.a +WHERE (table2.a < table2.b) AND (table2.b = toUInt32(20)) +---------Q3---------- +---------Q4---------- +6 40 +SELECT + a, + table2.b +FROM table1 +ALL INNER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON a = toUInt32(10 - table2.a) +WHERE (b = 6) AND (table2.b > 20) 
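The EXPLAIN SYNTAX outputs above show the new rewrite at work: for INNER joins, conjuncts of the ON clause that merely filter one side (table2.b = toUInt32(20), table2.a < table2.b, and so on) are moved into WHERE, and only genuine equi-join conditions such as a = table2.a stay in ON. A minimal sketch of Q1, assuming the table definitions from the .sql file shown further below:

```sql
CREATE TABLE table1 (a UInt32, b UInt32) ENGINE = Memory;
CREATE TABLE table2 (a UInt32, b UInt32) ENGINE = Memory;

INSERT INTO table1 SELECT number, number FROM numbers(10);
INSERT INTO table2 SELECT number * 2, number * 20 FROM numbers(6);

-- table2.b = toUInt32(20) is a one-sided filter, not a join key, so the
-- rewritten query keeps only a = table2.a in ON and filters in WHERE.
EXPLAIN SYNTAX
SELECT * FROM table1
JOIN table2 ON (table1.a = table2.a) AND (table2.b = toUInt32(20));
```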
+---------Q5---------- +SELECT + a, + table2.b +FROM table1 +ALL INNER JOIN +( + SELECT + a, + b + FROM table2 + WHERE 0 +) AS table2 ON a = table2.a +WHERE 0 +---------Q6---------- +---------Q7---------- +0 0 0 0 +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL INNER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON a = table2.a +WHERE (table2.b < toUInt32(40)) AND (b < 1) +---------Q8---------- +---------Q9---will not be optimized---------- +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL LEFT JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON (a = table2.a) AND (b = toUInt32(10)) +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL RIGHT JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON (a = table2.a) AND (b = toUInt32(10)) +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL FULL OUTER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON (a = table2.a) AND (b = toUInt32(10)) +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +ALL FULL OUTER JOIN +( + SELECT + a, + b + FROM table2 +) AS table2 ON (a = table2.a) AND (table2.b = toUInt32(10)) +WHERE a < toUInt32(20) +SELECT + a, + b, + table2.a, + table2.b +FROM table1 +CROSS JOIN table2 diff --git a/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.sql b/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.sql new file mode 100644 index 00000000000..23871a9c47c --- /dev/null +++ b/tests/queries/0_stateless/01653_move_conditions_from_join_on_to_where.sql @@ -0,0 +1,48 @@ +DROP TABLE IF EXISTS table1; +DROP TABLE IF EXISTS table2; + +CREATE TABLE table1 (a UInt32, b UInt32) ENGINE = Memory; +CREATE TABLE table2 (a UInt32, b UInt32) ENGINE = Memory; + +INSERT INTO table1 SELECT number, number FROM numbers(10); +INSERT INTO table2 SELECT number * 2, number * 20 FROM numbers(6); + +SELECT '---------Q1----------'; +SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.b = toUInt32(20)); +EXPLAIN SYNTAX SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.b = toUInt32(20)); + +SELECT '---------Q2----------'; +SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.a < table2.b) AND (table2.b = toUInt32(20)); +EXPLAIN SYNTAX SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.a < table2.b) AND (table2.b = toUInt32(20)); + +SELECT '---------Q3----------'; +SELECT * FROM table1 JOIN table2 ON (table1.a = toUInt32(table2.a + 5)) AND (table2.a < table1.b) AND (table2.b > toUInt32(20)); -- { serverError 48 } + +SELECT '---------Q4----------'; +SELECT table1.a, table2.b FROM table1 INNER JOIN table2 ON (table1.a = toUInt32(10 - table2.a)) AND (table1.b = 6) AND (table2.b > 20); +EXPLAIN SYNTAX SELECT table1.a, table2.b FROM table1 INNER JOIN table2 ON (table1.a = toUInt32(10 - table2.a)) AND (table1.b = 6) AND (table2.b > 20); + +SELECT '---------Q5----------'; +SELECT table1.a, table2.b FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table1.b = 6) AND (table2.b > 20) AND (10 < 6); +EXPLAIN SYNTAX SELECT table1.a, table2.b FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table1.b = 6) AND (table2.b > 20) AND (10 < 6); + +SELECT '---------Q6----------'; +SELECT table1.a, table2.b FROM table1 JOIN table2 ON (table1.b = 6) AND (table2.b > 20); -- { serverError 403 } + +SELECT '---------Q7----------'; +SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.b < toUInt32(40)) where table1.b < 1; +EXPLAIN SYNTAX SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND 
(table2.b < toUInt32(40)) where table1.b < 1; +SELECT * FROM table1 JOIN table2 ON (table1.a = table2.a) AND (table2.b < toUInt32(40)) where table1.b > 10; + +SELECT '---------Q8----------'; +SELECT * FROM table1 INNER JOIN table2 ON (table1.a = table2.a) AND (table2.b < toUInt32(table1, 10)); -- { serverError 47 } + +SELECT '---------Q9---will not be optimized----------'; +EXPLAIN SYNTAX SELECT * FROM table1 LEFT JOIN table2 ON (table1.a = table2.a) AND (table1.b = toUInt32(10)); +EXPLAIN SYNTAX SELECT * FROM table1 RIGHT JOIN table2 ON (table1.a = table2.a) AND (table1.b = toUInt32(10)); +EXPLAIN SYNTAX SELECT * FROM table1 FULL JOIN table2 ON (table1.a = table2.a) AND (table1.b = toUInt32(10)); +EXPLAIN SYNTAX SELECT * FROM table1 FULL JOIN table2 ON (table1.a = table2.a) AND (table2.b = toUInt32(10)) WHERE table1.a < toUInt32(20); +EXPLAIN SYNTAX SELECT * FROM table1 , table2; + +DROP TABLE table1; +DROP TABLE table2; diff --git a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference new file mode 100644 index 00000000000..87659c32e39 --- /dev/null +++ b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.reference @@ -0,0 +1,25 @@ +aaaaaaaaa bbbbbbbbb +:0 +:0 +:0 +ccccccccc aaaaaaaaa bbbbbbbbb +ccccccccc aaaaaaaaa bbbbbbbbb +:0 +aaaaaaaaa +bbbbbbbbb +ccccccccc +:107 +:79 +:35 +:35 +:35 +699415 +aaaaaaaaa bbbbbbbbb +ccccccccc aaaaaaaaa bbbbbbbbb +ccccccccc aaaaaaaaa bbbbbbbbb +ccccccccc aaaaaaaaa bbbbbbbbb +ccccccccc aaaaaaaaa bbbbbbbbb +699415 0 +:0 +:107 +:79 diff --git a/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh new file mode 100755 index 00000000000..593f0e59ea7 --- /dev/null +++ b/tests/queries/0_stateless/01658_read_file_to_stringcolumn.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -eu + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Data preparation. +# We can get the user_files_path by using the table function file() as a trick; alternatively we can get it with a query such as: +# "insert into function file('exist.txt', 'CSV', 'val1 char') values ('aaaa'); select _path from file('exist.txt', 'CSV', 'val1 char')" +user_files_path=$(clickhouse-client --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 |grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +mkdir -p ${user_files_path}/ +echo -n aaaaaaaaa > ${user_files_path}/a.txt +echo -n bbbbbbbbb > ${user_files_path}/b.txt +echo -n ccccccccc > ${user_files_path}/c.txt +echo -n ccccccccc > /tmp/c.txt +mkdir -p ${user_files_path}/dir + + +### 1st TEST in CLIENT mode. +${CLICKHOUSE_CLIENT} --query "drop table if exists data;" +${CLICKHOUSE_CLIENT} --query "create table data (A String, B String) engine=MergeTree() order by A;" + + +# Valid cases: +${CLICKHOUSE_CLIENT} --query "select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "insert into data select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "insert into data select file('${user_files_path}/a.txt'), file('${user_files_path}/b.txt');";echo ":"$? +${CLICKHOUSE_CLIENT} --query "select file('${user_files_path}/c.txt'), * from data";echo ":"$?
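The valid cases above exercise the new file() function that reads a whole file into a String value; in client/server mode the argument must resolve to a path inside user_files_path, which is why the escape attempts below are expected to fail. A minimal sketch, assuming a file a.txt already exists under a user_files_path of /var/lib/clickhouse/user_files (the path is illustrative only):

```sql
-- file() here is the scalar function returning the file body as a String,
-- not the file() table function used above to discover user_files_path.
SELECT file('/var/lib/clickhouse/user_files/a.txt') AS contents;

-- Being an ordinary String, the result composes with other expressions:
SELECT length(file('/var/lib/clickhouse/user_files/a.txt'));
```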
+${CLICKHOUSE_CLIENT} --multiquery --query " + create table filenames(name String) engine=MergeTree() order by tuple(); + insert into filenames values ('a.txt'), ('b.txt'), ('c.txt'); + select file(name) from filenames format TSV; + drop table if exists filenames; +" + +# Invalid cases (using a sub-shell to catch the exception so that the test does not quit) +# Test a non-existent file +echo "clickhouse-client --query "'"select file('"'nonexist.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +# Test isDir +echo "clickhouse-client --query "'"select file('"'${user_files_path}/dir'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null +# Test a path outside the user_files directory. It's not allowed in client mode +echo "clickhouse-client --query "'"select file('"'/tmp/c.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null + +# Test relative paths containing ".." whose absolute path is outside the user_files directory. +echo "clickhouse-client --query "'"select file('"'${user_files_path}/../../../../tmp/c.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null +echo "clickhouse-client --query "'"select file('"'../../../../a.txt'), file('${user_files_path}/b.txt')"'";echo :$?' | bash 2>/dev/null + + +### 2nd TEST in LOCAL mode. + +echo -n aaaaaaaaa > a.txt +echo -n bbbbbbbbb > b.txt +echo -n ccccccccc > c.txt +mkdir -p dir +# Test for large files, with length: 699415 +c_count=$(wc -c ${CURDIR}/01518_nullable_aggregate_states2.reference | awk '{print $1}') +echo $c_count + +# Valid cases: +# The default directory is the CWD in LOCAL mode +${CLICKHOUSE_LOCAL} --query " + drop table if exists data; + create table data (A String, B String) engine=MergeTree() order by A; + select file('a.txt'), file('b.txt'); + insert into data select file('a.txt'), file('b.txt'); + insert into data select file('a.txt'), file('b.txt'); + select file('c.txt'), * from data; + select file('/tmp/c.txt'), * from data; + select $c_count, $c_count -length(file('${CURDIR}/01518_nullable_aggregate_states2.reference')) +" +echo ":"$? + + +# Invalid cases (using a sub-shell to catch the exception so that the test does not quit) +# Test a non-existent file +echo "clickhouse-local --query "'"select file('"'nonexist.txt'), file('b.txt')"'";echo :$?' | bash 2>/dev/null + +# Test isDir +echo "clickhouse-local --query "'"select file('"'dir'), file('b.txt')"'";echo :$?'
| bash 2>/dev/null + +# Restore +rm -rf a.txt b.txt c.txt dir +rm -rf ${user_files_path}/a.txt +rm -rf ${user_files_path}/b.txt +rm -rf ${user_files_path}/c.txt +rm -rf /tmp/c.txt +rm -rf ${user_files_path}/dir diff --git a/tests/queries/0_stateless/01669_columns_declaration_serde.sql b/tests/queries/0_stateless/01669_columns_declaration_serde.sql index 8e3354d63cd..a6bf1184e9f 100644 --- a/tests/queries/0_stateless/01669_columns_declaration_serde.sql +++ b/tests/queries/0_stateless/01669_columns_declaration_serde.sql @@ -22,12 +22,12 @@ DROP TABLE IF EXISTS test_r1; DROP TABLE IF EXISTS test_r2; CREATE TABLE test_r1 (x UInt64, "\\" String DEFAULT '\r\n\t\\' || ' -') ENGINE = ReplicatedMergeTree('/clickhouse/test', 'r1') ORDER BY "\\"; +') ENGINE = ReplicatedMergeTree('/clickhouse/test_01669', 'r1') ORDER BY "\\"; INSERT INTO test_r1 ("\\") VALUES ('\\'); CREATE TABLE test_r2 (x UInt64, "\\" String DEFAULT '\r\n\t\\' || ' -') ENGINE = ReplicatedMergeTree('/clickhouse/test', 'r2') ORDER BY "\\"; +') ENGINE = ReplicatedMergeTree('/clickhouse/test_01669', 'r2') ORDER BY "\\"; SYSTEM SYNC REPLICA test_r2; diff --git a/tests/queries/0_stateless/01700_system_zookeeper_path_in.reference b/tests/queries/0_stateless/01700_system_zookeeper_path_in.reference index 78462f9fc0e..2fc177c812e 100644 --- a/tests/queries/0_stateless/01700_system_zookeeper_path_in.reference +++ b/tests/queries/0_stateless/01700_system_zookeeper_path_in.reference @@ -1,7 +1,16 @@ -clickhouse -task_queue -clickhouse -task_queue -clickhouse -task_queue -ddl +block_numbers +blocks +1 +======== +block_numbers +blocks +1 +======== +block_numbers +blocks +======== +1 +failed_parts +last_part +leader_election-0000000000 +parallel diff --git a/tests/queries/0_stateless/01700_system_zookeeper_path_in.sql b/tests/queries/0_stateless/01700_system_zookeeper_path_in.sql index a5c7488ef97..d4126098c7c 100644 --- a/tests/queries/0_stateless/01700_system_zookeeper_path_in.sql +++ b/tests/queries/0_stateless/01700_system_zookeeper_path_in.sql @@ -1,6 +1,19 @@ -SELECT name FROM system.zookeeper WHERE path = '/'; -SELECT name FROM system.zookeeper WHERE path = 'clickhouse'; -SELECT name FROM system.zookeeper WHERE path IN ('/'); -SELECT name FROM system.zookeeper WHERE path IN ('clickhouse'); -SELECT name FROM system.zookeeper WHERE path IN ('/','/clickhouse'); -SELECT name FROM system.zookeeper WHERE path IN (SELECT concat('/clickhouse/',name) FROM system.zookeeper WHERE (path = '/clickhouse/')); \ No newline at end of file +DROP TABLE IF EXISTS sample_table; + +CREATE TABLE sample_table ( + key UInt64 +) +ENGINE ReplicatedMergeTree('/clickhouse/01700_system_zookeeper_path_in', '1') +ORDER BY tuple(); + +SELECT name FROM system.zookeeper WHERE path = '/clickhouse/01700_system_zookeeper_path_in' AND name like 'block%' ORDER BY name; +SELECT name FROM system.zookeeper WHERE path = '/clickhouse/01700_system_zookeeper_path_in/replicas' ORDER BY name; +SELECT '========'; +SELECT name FROM system.zookeeper WHERE path IN ('/clickhouse/01700_system_zookeeper_path_in') AND name LIKE 'block%' ORDER BY name; +SELECT name FROM system.zookeeper WHERE path IN ('/clickhouse/01700_system_zookeeper_path_in/replicas') ORDER BY name; +SELECT '========'; +SELECT name FROM system.zookeeper WHERE path IN ('/clickhouse/01700_system_zookeeper_path_in','/clickhouse/01700_system_zookeeper_path_in/replicas') AND name LIKE 'block%' ORDER BY name; +SELECT '========'; +SELECT name FROM system.zookeeper WHERE path IN (SELECT 
concat('/clickhouse/01700_system_zookeeper_path_in/', name) FROM system.zookeeper WHERE (path = '/clickhouse/01700_system_zookeeper_path_in')) ORDER BY name; + +DROP TABLE IF EXISTS sample_table; diff --git a/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference new file mode 100644 index 00000000000..587579af915 --- /dev/null +++ b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.reference @@ -0,0 +1 @@ +Ok. diff --git a/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh new file mode 100755 index 00000000000..d3e634eb560 --- /dev/null +++ b/tests/queries/0_stateless/01701_parallel_parsing_infinite_segmentation.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "create table insert_big_json(a String, b String) engine=MergeTree() order by tuple()"; + +python3 -c "[print('{{\"a\":\"{}\", \"b\":\"{}\"'.format('clickhouse'* 1000000, 'dbms' * 1000000)) for i in range(10)]; [print('{{\"a\":\"{}\", \"b\":\"{}\"}}'.format('clickhouse'* 100000, 'dbms' * 100000)) for i in range(10)]" 2>/dev/null | ${CLICKHOUSE_CLIENT} --input_format_parallel_parsing=1 --max_memory_usage=0 -q "insert into insert_big_json FORMAT JSONEachRow" 2>&1 | grep -q "min_chunk_bytes_for_parallel_parsing" && echo "Ok." || echo "FAIL" ||: \ No newline at end of file diff --git a/tests/queries/0_stateless/01702_system_numbers_scientific_notation.reference b/tests/queries/0_stateless/01702_system_numbers_scientific_notation.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01702_system_numbers_scientific_notation.sql b/tests/queries/0_stateless/01702_system_numbers_scientific_notation.sql new file mode 100644 index 00000000000..6e037ee4a2e --- /dev/null +++ b/tests/queries/0_stateless/01702_system_numbers_scientific_notation.sql @@ -0,0 +1,5 @@ +select * from numbers(1e2) format Null; +select * from numbers_mt(1e2) format Null; +select * from numbers_mt('100') format Null; -- { serverError 43 } +select * from numbers_mt(inf) format Null; -- { serverError 43 } +select * from numbers_mt(nan) format Null; -- { serverError 43 } diff --git a/tests/queries/0_stateless/01709_inactive_parts_to_delay_throw.reference b/tests/queries/0_stateless/01709_inactive_parts_to_delay_throw.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01709_inactive_parts_to_delay_throw.sql b/tests/queries/0_stateless/01709_inactive_parts_to_delay_throw.sql new file mode 100644 index 00000000000..fad890c4807 --- /dev/null +++ b/tests/queries/0_stateless/01709_inactive_parts_to_delay_throw.sql @@ -0,0 +1,12 @@ +drop table if exists x; + +create table x (i int) engine MergeTree order by i settings old_parts_lifetime = 10000000000, min_bytes_for_wide_part = 0, inactive_parts_to_throw_insert = 1; + +insert into x values (1); +insert into x values (2); + +optimize table x final; + +insert into x values (3); -- { serverError 252; } + +drop table if exists x; diff --git a/tests/queries/0_stateless/01710_join_use_nulls.reference b/tests/queries/0_stateless/01710_join_use_nulls.reference new file mode 100644 index 00000000000..8bd111e0416 --- /dev/null +++ 
b/tests/queries/0_stateless/01710_join_use_nulls.reference @@ -0,0 +1,3 @@ +3 +1 +1 diff --git a/tests/queries/0_stateless/01710_join_use_nulls.sql b/tests/queries/0_stateless/01710_join_use_nulls.sql new file mode 100644 index 00000000000..b024227d4e2 --- /dev/null +++ b/tests/queries/0_stateless/01710_join_use_nulls.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS X; +DROP TABLE IF EXISTS Y; + +CREATE TABLE X (id Int) ENGINE=Memory; +CREATE TABLE Y (id Int) ENGINE=Memory; + +-- Type mismatch of columns to JOIN by: plus(id, 1) Int64 at left, Y.id Int32 at right. +SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = Y.id SETTINGS join_use_nulls=1; -- { serverError 53 } +SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = toInt64(Y.id) SETTINGS join_use_nulls=1; + +-- Logical error: 'Arguments of 'plus' have incorrect data types: '2' of type 'UInt8', '1' of type 'UInt8''. +-- Because 1 became toNullable(1), i.e.: +-- 2 UInt8 Const(size = 1, UInt8(size = 1)) +-- 1 UInt8 Const(size = 1, Nullable(size = 1, UInt8(size = 1), UInt8(size = 1))) +SELECT 2+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy+1 = Y.dummy SETTINGS join_use_nulls = 1; -- { serverError 53 } +SELECT 2+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy+1 = toUInt16(Y.dummy) SETTINGS join_use_nulls = 1; +SELECT X.dummy+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy = Y.dummy SETTINGS join_use_nulls = 1; +SELECT Y.dummy+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy = Y.dummy SETTINGS join_use_nulls = 1; + +DROP TABLE X; +DROP TABLE Y; diff --git a/tests/queries/0_stateless/01711_decimal_multiplication.reference b/tests/queries/0_stateless/01711_decimal_multiplication.reference new file mode 100644 index 00000000000..37869329ca4 --- /dev/null +++ b/tests/queries/0_stateless/01711_decimal_multiplication.reference @@ -0,0 +1,4 @@ +2.0000 +2.0000 +2.0000 +2.0000 diff --git a/tests/queries/0_stateless/01711_decimal_multiplication.sql b/tests/queries/0_stateless/01711_decimal_multiplication.sql new file mode 100644 index 00000000000..10d23599b4d --- /dev/null +++ b/tests/queries/0_stateless/01711_decimal_multiplication.sql @@ -0,0 +1,4 @@ +SELECT materialize(toDecimal64(4,4)) - materialize(toDecimal32(2,2)); +SELECT toDecimal64(4,4) - materialize(toDecimal32(2,2)); +SELECT materialize(toDecimal64(4,4)) - toDecimal32(2,2); +SELECT toDecimal64(4,4) - toDecimal32(2,2); diff --git a/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.reference b/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.sql b/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.sql new file mode 100644 index 00000000000..66b53369517 --- /dev/null +++ b/tests/queries/0_stateless/01715_background_checker_blather_zookeeper.sql @@ -0,0 +1,28 @@ +DROP TABLE IF EXISTS i20203_1; +DROP TABLE IF EXISTS i20203_2; + +CREATE TABLE i20203_1 (a Int8) +ENGINE = ReplicatedMergeTree('/clickhouse/01715_background_checker_i20203', 'r1') +ORDER BY tuple(); + +CREATE TABLE i20203_2 (a Int8) +ENGINE = ReplicatedMergeTree('/clickhouse/01715_background_checker_i20203', 'r2') +ORDER BY tuple(); + +DETACH TABLE i20203_2; +INSERT INTO i20203_1 VALUES (2); + +DETACH TABLE i20203_1; +ATTACH TABLE i20203_2; + +-- sleep 10 seconds +SELECT number from numbers(10) where 
sleepEachRow(1) Format Null; + +SELECT num_tries < 50 +FROM system.replication_queue +WHERE table = 'i20203_2' AND database = currentDatabase(); + +ATTACH TABLE i20203_1; + +DROP TABLE IF EXISTS i20203_1; +DROP TABLE IF EXISTS i20203_2; diff --git a/tests/queries/0_stateless/01715_table_function_view_fix.reference b/tests/queries/0_stateless/01715_table_function_view_fix.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01715_table_function_view_fix.sql b/tests/queries/0_stateless/01715_table_function_view_fix.sql new file mode 100644 index 00000000000..de5150b7b70 --- /dev/null +++ b/tests/queries/0_stateless/01715_table_function_view_fix.sql @@ -0,0 +1 @@ +SELECT view(SELECT 1); -- { clientError 62 } diff --git a/tests/queries/0_stateless/01716_array_difference_overflow.reference b/tests/queries/0_stateless/01716_array_difference_overflow.reference new file mode 100644 index 00000000000..5297534679e --- /dev/null +++ b/tests/queries/0_stateless/01716_array_difference_overflow.reference @@ -0,0 +1 @@ +[0,9223372036854710272] diff --git a/tests/queries/0_stateless/01716_array_difference_overflow.sql b/tests/queries/0_stateless/01716_array_difference_overflow.sql new file mode 100644 index 00000000000..3d153725294 --- /dev/null +++ b/tests/queries/0_stateless/01716_array_difference_overflow.sql @@ -0,0 +1,2 @@ +-- Overflow is Ok and behaves as the CPU does it. +SELECT arrayDifference([65536, -9223372036854775808]); diff --git a/tests/queries/0_stateless/01716_decimal_comparison_ubsan.reference b/tests/queries/0_stateless/01716_decimal_comparison_ubsan.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/01716_decimal_comparison_ubsan.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/01716_decimal_comparison_ubsan.sql b/tests/queries/0_stateless/01716_decimal_comparison_ubsan.sql new file mode 100644 index 00000000000..f68d9de1995 --- /dev/null +++ b/tests/queries/0_stateless/01716_decimal_comparison_ubsan.sql @@ -0,0 +1,2 @@ +SET decimal_check_overflow = 0; +SELECT toDecimal64(0, 8) = 9223372036854775807; diff --git a/tests/queries/0_stateless/01716_drop_rename_sign_column.reference b/tests/queries/0_stateless/01716_drop_rename_sign_column.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01716_drop_rename_sign_column.sql b/tests/queries/0_stateless/01716_drop_rename_sign_column.sql new file mode 100644 index 00000000000..c9119ee2b46 --- /dev/null +++ b/tests/queries/0_stateless/01716_drop_rename_sign_column.sql @@ -0,0 +1,14 @@ +DROP TABLE IF EXISTS signed_table; + +CREATE TABLE signed_table ( + k UInt32, + v String, + s Int8 +) ENGINE CollapsingMergeTree(s) ORDER BY k; + +INSERT INTO signed_table(k, v, s) VALUES (1, 'a', 1); + +ALTER TABLE signed_table DROP COLUMN s; --{serverError 524} +ALTER TABLE signed_table RENAME COLUMN s TO s1; --{serverError 524} + +DROP TABLE IF EXISTS signed_table; diff --git a/tests/queries/0_stateless/01717_global_with_subquery_fix.reference b/tests/queries/0_stateless/01717_global_with_subquery_fix.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01717_global_with_subquery_fix.sql b/tests/queries/0_stateless/01717_global_with_subquery_fix.sql new file mode 100644 index 00000000000..14c4ac3e4ca --- /dev/null +++ b/tests/queries/0_stateless/01717_global_with_subquery_fix.sql @@ -0,0 +1 @@ +WITH (SELECT count(distinct colU) from tabA) AS 
withA, (SELECT count(distinct colU) from tabA) AS withB SELECT withA / withB AS ratio FROM (SELECT date AS period, colX FROM (SELECT date, if(colA IN (SELECT colB FROM tabC), 0, colA) AS colX FROM tabB) AS tempB GROUP BY period, colX) AS main; -- {serverError 60} diff --git a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.reference b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql new file mode 100644 index 00000000000..c4f26a079f0 --- /dev/null +++ b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql @@ -0,0 +1,2 @@ +SELECT intDiv(9223372036854775807, 0.9998999834060669); -- { serverError 153 } +SELECT intDiv(9223372036854775807, 1.); -- { serverError 153 } diff --git a/tests/queries/0_stateless/01718_subtract_seconds_date.reference b/tests/queries/0_stateless/01718_subtract_seconds_date.reference new file mode 100644 index 00000000000..97e3da8cc48 --- /dev/null +++ b/tests/queries/0_stateless/01718_subtract_seconds_date.reference @@ -0,0 +1,2 @@ +2021-02-14 23:59:59 +10 diff --git a/tests/queries/0_stateless/01718_subtract_seconds_date.sql b/tests/queries/0_stateless/01718_subtract_seconds_date.sql new file mode 100644 index 00000000000..6bffcd4db5a --- /dev/null +++ b/tests/queries/0_stateless/01718_subtract_seconds_date.sql @@ -0,0 +1,2 @@ +SELECT subtractSeconds(toDate('2021-02-15'), 1); +SELECT subtractSeconds(today(), 1) - subtractSeconds(today(), 11); diff --git a/tests/queries/0_stateless/01719_join_timezone.reference b/tests/queries/0_stateless/01719_join_timezone.reference new file mode 100644 index 00000000000..c2702a38012 --- /dev/null +++ b/tests/queries/0_stateless/01719_join_timezone.reference @@ -0,0 +1,3 @@ +2020-05-13 13:38:45 2020-05-13 16:38:45 +2020-05-13 13:38:45 2020-05-13 16:38:45 +2020-05-13 13:38:45 2020-05-13 16:38:45 diff --git a/tests/queries/0_stateless/01719_join_timezone.sql b/tests/queries/0_stateless/01719_join_timezone.sql new file mode 100644 index 00000000000..cbf0c27fcfc --- /dev/null +++ b/tests/queries/0_stateless/01719_join_timezone.sql @@ -0,0 +1,45 @@ +DROP TABLE IF EXISTS test; + +CREATE TABLE test (timestamp DateTime('UTC'), i UInt8) Engine=MergeTree() PARTITION BY toYYYYMM(timestamp) ORDER BY (i); +INSERT INTO test values ('2020-05-13 16:38:45', 1); + +SELECT + toTimeZone(timestamp, 'America/Sao_Paulo') AS converted, + timestamp AS original +FROM test +LEFT JOIN (SELECT 2 AS x) AS anything ON x = i +WHERE timestamp >= toDateTime('2020-05-13T00:00:00', 'America/Sao_Paulo'); + +/* This was incorrect result in previous ClickHouse versions: +┌─converted───────────┬─original────────────┐ +│ 2020-05-13 16:38:45 │ 2020-05-13 16:38:45 │ <-- toTimeZone is ignored. +└─────────────────────┴─────────────────────┘ +*/ + +SELECT + toTimeZone(timestamp, 'America/Sao_Paulo') AS converted, + timestamp AS original +FROM test +-- LEFT JOIN (SELECT 2 AS x) AS anything ON x = i -- Removing the join fixes the issue. +WHERE timestamp >= toDateTime('2020-05-13T00:00:00', 'America/Sao_Paulo'); + +/* +┌─converted───────────┬─original────────────┐ +│ 2020-05-13 13:38:45 │ 2020-05-13 16:38:45 │ <-- toTimeZone works. 
+└─────────────────────┴─────────────────────┘ +*/ + +SELECT + toTimeZone(timestamp, 'America/Sao_Paulo') AS converted, + timestamp AS original +FROM test +LEFT JOIN (SELECT 2 AS x) AS anything ON x = i +WHERE timestamp >= '2020-05-13T00:00:00'; -- Not using toDateTime in the WHERE also fixes the issue. + +/* +┌─converted───────────┬─original────────────┐ +│ 2020-05-13 13:38:45 │ 2020-05-13 16:38:45 │ <-- toTimeZone works. +└─────────────────────┴─────────────────────┘ +*/ + +DROP TABLE test; diff --git a/tests/queries/0_stateless/01720_union_distinct_with_limit.reference b/tests/queries/0_stateless/01720_union_distinct_with_limit.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01720_union_distinct_with_limit.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/01720_union_distinct_with_limit.sql b/tests/queries/0_stateless/01720_union_distinct_with_limit.sql new file mode 100644 index 00000000000..9fc5b3eafd2 --- /dev/null +++ b/tests/queries/0_stateless/01720_union_distinct_with_limit.sql @@ -0,0 +1,8 @@ +SELECT x +FROM +( + SELECT 1 AS x + UNION DISTINCT + SELECT 1 +) +LIMIT 1; diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index 38d5d3871f5..b141443a979 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -189,6 +189,7 @@ 01650_fetch_patition_with_macro_in_zk_path 01651_bugs_from_15889 01655_agg_if_nullable +01658_read_file_to_stringcolumn 01182_materialized_view_different_structure 01660_sum_ubsan 01669_columns_declaration_serde diff --git a/tests/queries/1_stateful/00158_cache_dictionary_has.reference b/tests/queries/1_stateful/00158_cache_dictionary_has.reference index f8d5cd4f53d..ad4bce6bec5 100644 --- a/tests/queries/1_stateful/00158_cache_dictionary_has.reference +++ b/tests/queries/1_stateful/00158_cache_dictionary_has.reference @@ -1,6 +1,6 @@ +100 6410 -6410 -25323 +100 25323 -1774655 +100 1774655 diff --git a/tests/queries/1_stateful/00158_cache_dictionary_has.sql b/tests/queries/1_stateful/00158_cache_dictionary_has.sql index 063e7843fd4..8461728c58e 100644 --- a/tests/queries/1_stateful/00158_cache_dictionary_has.sql +++ b/tests/queries/1_stateful/00158_cache_dictionary_has.sql @@ -6,15 +6,15 @@ CREATE DICTIONARY db_dict.cache_hits PRIMARY KEY WatchID SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'hits' PASSWORD '' DB 'test')) LIFETIME(MIN 300 MAX 600) -LAYOUT(CACHE(SIZE_IN_CELLS 100000 QUERY_WAIT_TIMEOUT_MILLISECONDS 600000)); +LAYOUT(CACHE(SIZE_IN_CELLS 100 QUERY_WAIT_TIMEOUT_MILLISECONDS 600000)); -SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 1400 == 0); +SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 1400 == 0 LIMIT 100); SELECT count() from test.hits PREWHERE WatchID % 1400 == 0; -SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 350 == 0); +SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 350 == 0 LIMIT 100); SELECT count() from test.hits PREWHERE WatchID % 350 == 0; -SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 5 == 0); +SELECT sum(flag) FROM (SELECT dictHas('db_dict.cache_hits', 
toUInt64(WatchID)) as flag FROM test.hits PREWHERE WatchID % 5 == 0 LIMIT 100); SELECT count() from test.hits PREWHERE WatchID % 5 == 0; DROP DICTIONARY IF EXISTS db_dict.cache_hits; diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index d76603bf633..85b0864a188 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,7 +103,7 @@ "00738_lock_for_inner_table" ], "polymorphic-parts": [ - "01508_partition_pruning", /// bug, shoud be fixed + "01508_partition_pruning_long", /// bug, should be fixed "01482_move_to_prewhere_and_cast" /// bug, shoud be fixed ], "antlr": [ @@ -267,7 +267,7 @@ "01501_clickhouse_client_INSERT_exception", "01504_compression_multiple_streams", "01508_explain_header", - "01508_partition_pruning", + "01508_partition_pruning_long", "01509_check_parallel_quorum_inserts", "01509_parallel_quorum_and_merge", "01515_mv_and_array_join_optimisation_bag", @@ -572,6 +572,9 @@ "01603_rename_overwrite_bug", "01646_system_restart_replicas_smoke", // system restart replicas is a global query "01676_dictget_in_default_expression", + "01715_background_checker_blather_zookeeper", + "01700_system_zookeeper_path_in", + "01669_columns_declaration_serde", "attach", "ddl_dictionaries", "dictionary", @@ -579,6 +582,7 @@ "live_view", "memory_leak", "memory_limit", - "polygon_dicts" // they use an explicitly specified database + "polygon_dicts", // they use an explicitly specified database + "01658_read_file_to_stringcolumn" ] } diff --git a/utils/convert-month-partitioned-parts/main.cpp b/utils/convert-month-partitioned-parts/main.cpp index bce1e08077c..0a697937eb6 100644 --- a/utils/convert-month-partitioned-parts/main.cpp +++ b/utils/convert-month-partitioned-parts/main.cpp @@ -97,6 +97,8 @@ void run(String part_path, String date_column, String dest_path) Poco::File(new_tmp_part_path_str + "checksums.txt").setWriteable(); WriteBufferFromFile checksums_out(new_tmp_part_path_str + "checksums.txt", 4096); checksums.write(checksums_out); + checksums_in.close(); + checksums_out.close(); Poco::File(new_tmp_part_path).renameTo(new_part_path.toString()); } diff --git a/utils/github/backport.py b/utils/github/backport.py index 576e3b069c2..7fddbbee241 100644 --- a/utils/github/backport.py +++ b/utils/github/backport.py @@ -62,7 +62,7 @@ class Backport: RE_NO_BACKPORT = re.compile(r'^v(\d+\.\d+)-no-backport$') RE_BACKPORTED = re.compile(r'^v(\d+\.\d+)-backported$') - # pull-requests are sorted by ancestry from the least recent. + # pull-requests are sorted by ancestry from the most recent. for pr in pull_requests: while repo.comparator(branches[-1][1]) >= repo.comparator(pr['mergeCommit']['oid']): logging.info("PR #{} is already inside {}. 
Dropping this branch for further PRs".format(pr['number'], branches[-1][0])) diff --git a/utils/github/local.py b/utils/github/local.py index a997721bc76..2ad8d4b8b71 100644 --- a/utils/github/local.py +++ b/utils/github/local.py @@ -6,15 +6,15 @@ import os import re -class RepositoryBase(object): +class RepositoryBase: def __init__(self, repo_path): import git self._repo = git.Repo(repo_path, search_parent_directories=(not repo_path)) - # commit comparator + # comparator of commits def cmp(x, y): - if x == y: + if str(x) == str(y): return 0 if self._repo.is_ancestor(x, y): return -1 diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 8d05f5fff46..d0d782e77ec 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,9 @@ +v21.2.3.15-stable 2021-02-14 v21.2.2.8-stable 2021-02-07 +v21.1.4.46-stable 2021-02-14 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 +v20.12.6.29-stable 2021-02-14 v20.12.5.18-stable 2021-02-03 v20.12.5.14-stable 2020-12-28 v20.12.4.5-stable 2020-12-24 diff --git a/utils/zookeeper-test/main.cpp b/utils/zookeeper-test/main.cpp index 8f8aac00866..bfd7df26726 100644 --- a/utils/zookeeper-test/main.cpp +++ b/utils/zookeeper-test/main.cpp @@ -127,18 +127,22 @@ void testCreateListWatchEvent(zkutil::ZooKeeper & zk) void testMultiRequest(zkutil::ZooKeeper & zk) { + std::cerr << "Testing multi request\n"; Coordination::Requests requests; requests.push_back(zkutil::makeCreateRequest("/data/multirequest", "aaa", zkutil::CreateMode::Persistent)); requests.push_back(zkutil::makeSetRequest("/data/multirequest", "bbb", -1)); zk.multi(requests); + std::cerr << "Multi executed\n"; try { requests.clear(); + std::cerr << "Testing bad multi\n"; requests.push_back(zkutil::makeCreateRequest("/data/multirequest", "qweqwe", zkutil::CreateMode::Persistent)); requests.push_back(zkutil::makeSetRequest("/data/multirequest", "bbb", -1)); requests.push_back(zkutil::makeSetRequest("/data/multirequest", "ccc", -1)); zk.multi(requests); + std::cerr << "Bad multi executed\n"; std::terminate(); } catch (...) @@ -147,6 +151,7 @@ void testMultiRequest(zkutil::ZooKeeper & zk) } checkEq(zk, "/data/multirequest", "bbb"); + std::cerr << "Multi request finished\n"; } std::mutex elements_mutex;
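A recurring change in this diff (the remerge_sort test, the renamed memory-tracking insert test, and the new 01702_system_numbers_scientific_notation test) is dropping the toUInt64() casts around arguments of numbers() and numbers_mt(): numeric literals in scientific notation are now accepted directly, while strings and non-finite floats are still rejected. A minimal sketch restating the 01702 cases:

```sql
SELECT count() FROM numbers(1e2);     -- 100 rows, no explicit cast needed
SELECT count() FROM numbers_mt(1e2);  -- same for the multithreaded variant

-- Still rejected with ILLEGAL_TYPE_OF_ARGUMENT (serverError 43) per the test:
--   SELECT * FROM numbers_mt('100');
--   SELECT * FROM numbers_mt(inf);
--   SELECT * FROM numbers_mt(nan);
```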