Merge branch 'master' into database_replicated
commit bf6f64a3fb
.github/workflows/codeql-analysis.yml (vendored, 32 lines deleted)
@@ -1,32 +0,0 @@
-# See the example here: https://github.com/github/codeql-action
-
-name: "CodeQL Scanning"
-
-on:
-  schedule:
-    - cron: '0 19 * * *'
-
-jobs:
-  CodeQL-Build:
-
-    runs-on: self-hosted
-    timeout-minutes: 1440
-
-    steps:
-    - name: Checkout repository
-      uses: actions/checkout@v2
-      with:
-        fetch-depth: 2
-        submodules: 'recursive'
-
-    - name: Initialize CodeQL
-      uses: github/codeql-action/init@v1
-      with:
-        languages: cpp
-
-    - run: sudo apt-get update && sudo apt-get install -y git cmake python ninja-build gcc-10 g++-10 && mkdir build
-    - run: cd build && CC=gcc-10 CXX=g++-10 cmake ..
-    - run: cd build && ninja
-
-    - name: Perform CodeQL Analysis
-      uses: github/codeql-action/analyze@v1
.gitmodules (vendored, 2 lines changed)
@@ -220,4 +220,4 @@
 	url = https://github.com/ClickHouse-Extras/boringssl.git
 [submodule "contrib/NuRaft"]
 	path = contrib/NuRaft
-	url = https://github.com/eBay/NuRaft.git
+	url = https://github.com/ClickHouse-Extras/NuRaft.git
@@ -11,9 +11,5 @@ ClickHouse® is an open-source column-oriented database management system that a
 * [Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-ly9m4w1x-6j7x5Ts_pQZqrctAbRZ3cg) and [Telegram](https://telegram.me/clickhouse_en) allow to chat with ClickHouse users in real-time.
 * [Blog](https://clickhouse.yandex/blog/en/) contains various ClickHouse-related articles, as well as announcements and reports about events.
 * [Code Browser](https://clickhouse.tech/codebrowser/html_report/ClickHouse/index.html) with syntax highlight and navigation.
 * [Yandex.Messenger channel](https://yandex.ru/chat/#/join/20e380d9-c7be-4123-ab06-e95fb946975e) shares announcements and useful links in Russian.
 * [Contacts](https://clickhouse.tech/#contacts) can help to get your questions answered if there are any.
 * You can also [fill this form](https://clickhouse.tech/#meet) to meet Yandex ClickHouse team in person.

 ## Upcoming Events
 * [Chinese ClickHouse Meetup (online)](http://hdxu.cn/8KxZE) on 6 February 2021.
@@ -1,6 +1,8 @@
 #pragma once

 #include <common/extended_types.h>
+#include <common/defines.h>

 namespace common
 {
@@ -156,4 +158,11 @@ namespace common
         return false;
     return (x * y) / y != x;
 }
+
+/// Multiply and ignore overflow.
+template <typename T1, typename T2>
+inline auto NO_SANITIZE_UNDEFINED mulIgnoreOverflow(T1 x, T2 y)
+{
+    return x * y;
+}
 }
@@ -152,7 +152,7 @@ static void signalHandler(int sig, siginfo_t * info, void * context)
     if (sig != SIGTSTP) /// This signal is used for debugging.
     {
         /// The time that is usually enough for separate thread to print info into log.
-        sleepForSeconds(10);
+        sleepForSeconds(20);  /// FIXME: use some feedback from threads that process stacktrace
         call_default_signal_handler(sig);
     }

@@ -311,7 +311,8 @@ private:
         if (stack_trace.getSize())
         {
             /// Write bare stack trace (addresses) just in case if we will fail to print symbolized stack trace.
-            /// NOTE This still require memory allocations and mutex lock inside logger. BTW we can also print it to stderr using write syscalls.
+            /// NOTE: This still require memory allocations and mutex lock inside logger.
+            /// BTW we can also print it to stderr using write syscalls.

             std::stringstream bare_stacktrace;
             bare_stacktrace << "Stack trace:";
@@ -324,7 +325,7 @@ private:
         /// Write symbolized stack trace line by line for better grep-ability.
         stack_trace.toStringEveryLine([&](const std::string & s) { LOG_FATAL(log, s); });

-#if defined(__linux__)
+#if defined(OS_LINUX)
         /// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace.
         String calculated_binary_hash = getHashOfLoadedBinaryHex();
         if (daemon.stored_binary_hash.empty())
@@ -561,6 +562,7 @@ void debugIncreaseOOMScore()
     {
         DB::WriteBufferFromFile buf("/proc/self/oom_score_adj");
         buf.write(new_score.c_str(), new_score.size());
         buf.close();
     }
     catch (const Poco::Exception & e)
     {
@@ -783,7 +785,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
     /// Setup signal handlers.
     /// SIGTSTP is added for debugging purposes. To output a stack trace of any running thread at anytime.

-    addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP}, signalHandler, &handled_signals);
+    addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, &handled_signals);
     addSignalHandler({SIGHUP, SIGUSR1}, closeLogsSignalHandler, &handled_signals);
     addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, &handled_signals);

@@ -986,7 +988,7 @@ void BaseDaemon::setupWatchdog()
         if (errno == ECHILD)
         {
             logger().information("Child process no longer exists.");
-            _exit(status);
+            _exit(WEXITSTATUS(status));
         }

         if (WIFEXITED(status))
@@ -1020,7 +1022,7 @@ void BaseDaemon::setupWatchdog()

         /// Automatic restart is not enabled but you can play with it.
#if 1
-        _exit(status);
+        _exit(WEXITSTATUS(status));
#else
         logger().information("Will restart.");
         if (argv0)
@@ -11,7 +11,7 @@ if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/NuRaft/CMakeLists.txt")
     return()
 endif ()

-if (NOT OS_FREEBSD)
+if (NOT OS_FREEBSD AND NOT OS_DARWIN)
     set (USE_NURAFT 1)
     set (NURAFT_LIBRARY nuraft)

@@ -20,5 +20,5 @@ if (NOT OS_FREEBSD)
     message (STATUS "Using NuRaft=${USE_NURAFT}: ${NURAFT_INCLUDE_DIR} : ${NURAFT_LIBRARY}")
 else()
     set (USE_NURAFT 0)
-    message (STATUS "Using internal NuRaft library on FreeBSD is not supported")
+    message (STATUS "Using internal NuRaft library on FreeBSD and Darwin is not supported")
 endif()
contrib/NuRaft (vendored, submodule updated)
@@ -1 +1 @@
-Subproject commit 410bd149da84cdde60b4436b02b738749f4e87e1
+Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793
contrib/boost (vendored, submodule updated)
@@ -1 +1 @@
-Subproject commit 8e259cd2a6b60d75dd17e73432f11bb7b9351bb1
+Subproject commit 48f40ebb539220d328958f8823b094c0b07a4e79
@@ -30,7 +30,12 @@ set(SRCS

 add_library(nuraft ${SRCS})

-target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
+if (NOT OPENSSL_SSL_LIBRARY OR NOT OPENSSL_CRYPTO_LIBRARY)
+    target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1 SSL_LIBRARY_NOT_FOUND=1)
+else()
+    target_compile_definitions(nuraft PRIVATE USE_BOOST_ASIO=1 BOOST_ASIO_STANDALONE=1)
+endif()

 target_include_directories (nuraft SYSTEM PRIVATE ${LIBRARY_DIR}/include/libnuraft)
 # for some reason include "asio.h" directly without "boost/" prefix.
@@ -54,8 +54,10 @@ docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libm.so.6 "${CONTAIN
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libpthread.so.0 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/librt.so.1 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_dns.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_files.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libresolv.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
 docker cp -L "${ubuntu20image}":/lib64/ld-linux-x86-64.so.2 "${CONTAINER_ROOT_FOLDER}/lib64"
 docker cp -L "${ubuntu20image}":/etc/nsswitch.conf "${CONTAINER_ROOT_FOLDER}/etc"

 docker build "$DOCKER_BUILD_FOLDER" -f Dockerfile.alpine -t "${DOCKER_IMAGE}:${VERSION}-alpine" --pull
 rm -rf "$CONTAINER_ROOT_FOLDER"
@@ -120,7 +120,7 @@ function clone_root
         git checkout FETCH_HEAD
         echo 'Cloned merge head'
     else
-        git fetch
+        git fetch origin "+refs/pull/$PULL_REQUEST_NUMBER/head"
         git checkout "$COMMIT_SHA"
         echo 'Checked out to commit'
     fi
@@ -163,6 +163,7 @@ function clone_submodules
         contrib/xz
         contrib/dragonbox
         contrib/fast_float
+        contrib/NuRaft
     )

     git submodule sync
@@ -182,6 +183,7 @@ function run_cmake
         "-DENABLE_EMBEDDED_COMPILER=0"
         "-DENABLE_THINLTO=0"
         "-DUSE_UNWIND=1"
+        "-DENABLE_NURAFT=1"
     )

     # TODO remove this? we don't use ccache anyway. An option would be to download it
@@ -190,7 +190,7 @@ case "$stage" in
         # Lost connection to the server. This probably means that the server died
         # with abort.
         echo "failure" > status.txt
-        if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*" server.log > description.txt
+        if ! grep -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log > description.txt
         then
             echo "Lost connection to server. See the logs." > description.txt
         fi
@@ -44,6 +44,7 @@ parser.add_argument('--port', nargs='*', default=[9000], help="Space-separated l
 parser.add_argument('--runs', type=int, default=1, help='Number of query runs per server.')
 parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.')
 parser.add_argument('--queries-to-run', nargs='*', type=int, default=None, help='Space-separated list of indexes of queries to test.')
+parser.add_argument('--max-query-seconds', type=int, default=10, help='For how many seconds at most a query is allowed to run. The script finishes with error if this time is exceeded.')
 parser.add_argument('--profile-seconds', type=int, default=0, help='For how many seconds to profile a query for which the performance has changed.')
 parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.')
 parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.')
@@ -323,7 +324,7 @@ for query_index in queries_to_run:
         server_seconds += elapsed
         print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{elapsed}')

-        if elapsed > 10:
+        if elapsed > args.max_query_seconds:
             # Stop processing pathologically slow queries, to avoid timing out
             # the entire test task. This shouldn't really happen, so we don't
             # need much handling for this case and can just exit.
@@ -5,7 +5,10 @@ RUN apt-get update -y && \
     apt-get install -y --no-install-recommends \
             python3-pip \
             python3-setuptools \
-            python3-wheel
+            python3-wheel \
+            brotli \
+            netcat-openbsd \
+            zstd

 RUN python3 -m pip install \
     wheel \
@@ -15,7 +18,10 @@ RUN python3 -m pip install \
     pytest-randomly \
     pytest-rerunfailures \
     pytest-timeout \
-    pytest-xdist
+    pytest-xdist \
+    pandas \
+    numpy \
+    scipy

 CMD dpkg -i package_folder/clickhouse-common-static_*.deb; \
     dpkg -i package_folder/clickhouse-common-static-dbg_*.deb; \
docs/_description_templates/template-data-type.md (new file, 29 lines)
@@ -0,0 +1,29 @@
+---
+toc_priority:
+toc_title:
+---
+
+# data_type_name {#data_type-name}
+
+Description.
+
+**Parameters** (Optional)
+
+- `x` — Description. [Type name](relative/path/to/type/dscr.md#type).
+- `y` — Description. [Type name](relative/path/to/type/dscr.md#type).
+
+**Examples**
+
+```sql
+
+```
+
+## Additional Info {#additional-info} (Optional)
+
+The name of an additional section can be anything, for example, **Usage**.
+
+**See Also** (Optional)
+
+- [link](#)
+
+[Original article](https://clickhouse.tech/docs/en/data_types/<data-type-name>/) <!--hide-->
@@ -93,6 +93,7 @@ ClickHouse has only one physical order, which is determined by `ORDER BY` clause
 - Cascade `UPDATE/DELETE` queries are not supported by the `MaterializeMySQL` engine.
 - Replication can be easily broken.
 - Manual operations on database and tables are forbidden.
+- `MaterializeMySQL` is influenced by [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged in the corresponding table in the `MaterializeMySQL` database when a table in the MySQL server changes.

 ## Examples of Use {#examples-of-use}

@@ -156,4 +157,4 @@ SELECT * FROM mysql.test;
 └───┴─────┴──────┘
 ```

-[Original article](https://clickhouse.tech/docs/en/database_engines/materialize-mysql/) <!--hide-->
+[Original article](https://clickhouse.tech/docs/en/engines/database-engines/materialize-mysql/) <!--hide-->
@@ -12,6 +12,9 @@ List of supported integrations:
 - [ODBC](../../../engines/table-engines/integrations/odbc.md)
 - [JDBC](../../../engines/table-engines/integrations/jdbc.md)
 - [MySQL](../../../engines/table-engines/integrations/mysql.md)
+- [MongoDB](../../../engines/table-engines/integrations/mongodb.md)
 - [HDFS](../../../engines/table-engines/integrations/hdfs.md)
 - [S3](../../../engines/table-engines/integrations/s3.md)
 - [Kafka](../../../engines/table-engines/integrations/kafka.md)
 - [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md)
 - [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md)
docs/en/engines/table-engines/integrations/mongodb.md (new file, 57 lines)
@@ -0,0 +1,57 @@
+---
+toc_priority: 7
+toc_title: MongoDB
+---
+
+# MongoDB {#mongodb}
+
+MongoDB engine is a read-only table engine which allows reading data (`SELECT` queries) from a remote MongoDB collection. The engine supports only non-nested data types. `INSERT` queries are not supported.
+
+## Creating a Table {#creating-a-table}
+
+``` sql
+CREATE TABLE [IF NOT EXISTS] [db.]table_name
+(
+    name1 [type1],
+    name2 [type2],
+    ...
+) ENGINE = MongoDB(host:port, database, collection, user, password);
+```
+
+**Engine Parameters**
+
+- `host:port` — MongoDB server address.
+- `database` — Remote database name.
+- `collection` — Remote collection name.
+- `user` — MongoDB user.
+- `password` — User password.
+
+## Usage Example {#usage-example}
+
+A table in ClickHouse which reads data from a MongoDB collection:
+
+``` text
+CREATE TABLE mongo_table
+(
+    key UInt64,
+    data String
+) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'testuser', 'clickhouse');
+```
+
+Query:
+
+``` sql
+SELECT COUNT() FROM mongo_table;
+```
+
+``` text
+┌─count()─┐
+│       4 │
+└─────────┘
+```
+
+[Original article](https://clickhouse.tech/docs/en/operations/table_engines/integrations/mongodb/) <!--hide-->
@@ -136,8 +136,7 @@ The following settings can be specified in configuration file for given endpoint
 - `access_key_id` and `secret_access_key` — Optional. Specifies credentials to use with given endpoint.
 - `use_environment_credentials` — Optional, default value is `false`. If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint.
 - `header` — Optional, can be specified multiple times. Adds specified HTTP header to a request to given endpoint.
-
-This configuration also applies to S3 disks in `MergeTree` table engine family.
+- `server_side_encryption_customer_key_base64` — Optional. If specified, required headers for accessing S3 objects with SSE-C encryption will be set.

 Example:

@@ -149,6 +148,7 @@ Example:
         <!-- <secret_access_key>SECRET_ACCESS_KEY</secret_access_key> -->
         <!-- <use_environment_credentials>false</use_environment_credentials> -->
         <!-- <header>Authorization: Bearer SOME-TOKEN</header> -->
+        <!-- <server_side_encryption_customer_key_base64>BASE64-ENCODED-KEY</server_side_encryption_customer_key_base64> -->
     </endpoint-name>
 </s3>
 ```
@@ -104,7 +104,8 @@ For a description of parameters, see the [CREATE query description](../../../sql
 - `max_parts_in_total` — Maximum number of parts in all partitions.
 - `max_compress_block_size` — Maximum size of blocks of uncompressed data before compressing for writing to a table. You can also specify this setting in the global settings (see [max_compress_block_size](../../../operations/settings/settings.md#max-compress-block-size) setting). The value specified when table is created overrides the global value for this setting.
 - `min_compress_block_size` — Minimum size of blocks of uncompressed data required for compression when writing the next mark. You can also specify this setting in the global settings (see [min_compress_block_size](../../../operations/settings/settings.md#min-compress-block-size) setting). The value specified when table is created overrides the global value for this setting.
+- `max_partitions_to_read` — Limits the maximum number of partitions that can be accessed in one query. You can also specify setting [max_partitions_to_read](../../../operations/settings/merge-tree-settings.md#max-partitions-to-read) in the global setting.

 **Example of Sections Setting**

 ``` sql
@@ -714,6 +715,7 @@ Configuration markup:
     <endpoint>https://storage.yandexcloud.net/my-bucket/root-path/</endpoint>
     <access_key_id>your_access_key_id</access_key_id>
     <secret_access_key>your_secret_access_key</secret_access_key>
+    <server_side_encryption_customer_key_base64>your_base64_encoded_customer_key</server_side_encryption_customer_key_base64>
     <proxy>
         <uri>http://proxy1</uri>
         <uri>http://proxy2</uri>
@@ -749,7 +751,8 @@ Optional parameters:
 - `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
 - `cache_enabled` — Allows to cache mark and index files on local FS. Default value is `true`.
 - `cache_path` — Path on local FS where to store cached mark and index files. Default value is `/var/lib/clickhouse/disks/<disk_name>/cache/`.
-- `skip_access_check` — If true disk access checks will not be performed on disk start-up. Default value is `false`.
+- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
+- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.

 S3 disk can be configured as `main` or `cold` storage:
@@ -39,4 +39,4 @@ More details on [manipulating partitions](../../sql-reference/statements/alter/p

 It’s rather radical to drop all data from a table, but in some cases it might be exactly what you need.

-More details on [table truncation](../../sql-reference/statements/alter/partition.md#alter_drop-partition).
+More details on [table truncation](../../sql-reference/statements/truncate.md).
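Not part of the commit — a minimal example of the statement the updated link points to (the table name is illustrative):

```sql
-- Removes all data from the table while keeping its definition.
TRUNCATE TABLE IF EXISTS db.my_table;
```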
@@ -31,8 +31,8 @@ The supported formats are:
 | [JSONCompactString](#jsoncompactstring) | ✗ | ✔ |
 | [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
 | [JSONEachRowWithProgress](#jsoneachrowwithprogress) | ✗ | ✔ |
-| [JSONStringEachRow](#jsonstringeachrow) | ✔ | ✔ |
-| [JSONStringEachRowWithProgress](#jsonstringeachrowwithprogress) | ✗ | ✔ |
+| [JSONStringsEachRow](#jsonstringseachrow) | ✔ | ✔ |
+| [JSONStringsEachRowWithProgress](#jsonstringseachrowwithprogress) | ✗ | ✔ |
 | [JSONCompactEachRow](#jsoncompacteachrow) | ✔ | ✔ |
 | [JSONCompactEachRowWithNamesAndTypes](#jsoncompacteachrowwithnamesandtypes) | ✔ | ✔ |
 | [JSONCompactStringEachRow](#jsoncompactstringeachrow) | ✔ | ✔ |
@@ -612,7 +612,7 @@ Example:
 ```

 ## JSONEachRow {#jsoneachrow}
-## JSONStringEachRow {#jsonstringeachrow}
+## JSONStringsEachRow {#jsonstringseachrow}
 ## JSONCompactEachRow {#jsoncompacteachrow}
 ## JSONCompactStringEachRow {#jsoncompactstringeachrow}

@@ -627,9 +627,9 @@ When using these formats, ClickHouse outputs rows as separated, newline-delimite
 When inserting the data, you should provide a separate JSON value for each row.

 ## JSONEachRowWithProgress {#jsoneachrowwithprogress}
-## JSONStringEachRowWithProgress {#jsonstringeachrowwithprogress}
+## JSONStringsEachRowWithProgress {#jsonstringseachrowwithprogress}

-Differs from `JSONEachRow`/`JSONStringEachRow` in that ClickHouse will also yield progress information as JSON values.
+Differs from `JSONEachRow`/`JSONStringsEachRow` in that ClickHouse will also yield progress information as JSON values.

 ```json
 {"row":{"'hello'":"hello","multiply(42, number)":"0","range(5)":[0,1,2,3,4]}}
@@ -81,6 +81,7 @@ toc_title: Adopters
 | <a href="https://posthog.com/" class="favicon">PostHog</a> | Product Analytics | Main Product | — | — | [Release Notes, Oct 2020](https://posthog.com/blog/the-posthog-array-1-15-0) |
 | <a href="https://postmates.com/" class="favicon">Postmates</a> | Delivery | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=188) |
 | <a href="http://www.pragma-innovation.fr/" class="favicon">Pragma Innovation</a> | Telemetry and Big Data Analysis | Main product | — | — | [Slides in English, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup18/4_pragma_innovation.pdf) |
+| <a href="https://prana-system.com/en/" class="favicon">PRANA</a> | Industrial predictive analytics | Main product | — | — | [News (russian), Feb 2021](https://habr.com/en/news/t/541392/) |
 | <a href="https://www.qingcloud.com/" class="favicon">QINGCLOUD</a> | Cloud services | Main product | — | — | [Slides in Chinese, October 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup19/4.%20Cloud%20%2B%20TSDB%20for%20ClickHouse%20张健%20QingCloud.pdf) |
 | <a href="https://qrator.net" class="favicon">Qrator</a> | DDoS protection | Main product | — | — | [Blog Post, March 2019](https://blog.qrator.net/en/clickhouse-ddos-mitigation_37/) |
 | <a href="https://www.rbinternational.com/" class="favicon">Raiffeisenbank</a> | Banking | Analytics | — | — | [Lecture in Russian, December 2020](https://cs.hse.ru/announcements/421965599.html) |
@@ -5,7 +5,7 @@ toc_title: Data Backup

 # Data Backup {#data-backup}

-While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). However, these safeguards don’t cover all possible cases and can be circumvented.
+While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards don’t cover all possible cases and can be circumvented.

 In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
@@ -186,5 +186,16 @@ Possible values:

 Default value: auto (number of CPU cores).

 During startup ClickHouse reads all parts of all tables (reads files with metadata of parts) to build a list of all parts in memory. In some systems with a large number of parts this process can take a long time, and this time might be shortened by increasing `max_part_loading_threads` (if this process is not CPU and disk I/O bound).
+
+## max_partitions_to_read {#max-partitions-to-read}
+
+Limits the maximum number of partitions that can be accessed in one query.
+
+The setting value specified when the table is created can be overridden via a query-level setting.
+
+Possible values:
+
+- Any positive integer.
+
+Default value: -1 (unlimited).

 [Original article](https://clickhouse.tech/docs/en/operations/settings/merge_tree_settings/) <!--hide-->
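Not part of the commit — a short sketch of how the new setting can be applied, per the description above (table and column names are illustrative):

```sql
-- As a table-level MergeTree setting at creation time:
CREATE TABLE t (d Date, x UInt32) ENGINE = MergeTree() PARTITION BY d ORDER BY x
SETTINGS max_partitions_to_read = 10;

-- Or overridden per query session:
SET max_partitions_to_read = 5;
```

Per the description above, the query-level value overrides the one given at table creation.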
@@ -2592,4 +2592,70 @@ Possible values:

 Default value: `16`.

+## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability}
+
+Sets the probability that ClickHouse starts a trace for executed queries (if no parent [trace context](https://www.w3.org/TR/trace-context/) is supplied).
+
+Possible values:
+
+- 0 — Tracing is disabled for all executed queries (if no parent trace context is supplied).
+- A positive floating-point number in the range [0..1]. For example, if the setting value is `0.5`, ClickHouse starts a trace on average for half of the queries.
+- 1 — Tracing is enabled for all executed queries.
+
+Default value: `0`.
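Not part of the commit — a minimal usage sketch of the setting documented above:

```sql
-- Start a trace for roughly 10% of queries that arrive without a parent trace context.
SET opentelemetry_start_trace_probability = 0.1;
```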
+## optimize_on_insert {#optimize-on-insert}
+
+Enables or disables data transformation before the insertion, as if a merge was done on this block (according to the table engine).
+
+Possible values:
+
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: 1.
+
+**Example**
+
+The difference between enabled and disabled:
+
+Query:
+
+```sql
+SET optimize_on_insert = 1;
+
+CREATE TABLE test1 (`FirstTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY FirstTable;
+
+INSERT INTO test1 SELECT number % 2 FROM numbers(5);
+
+SELECT * FROM test1;
+
+SET optimize_on_insert = 0;
+
+CREATE TABLE test2 (`SecondTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY SecondTable;
+
+INSERT INTO test2 SELECT number % 2 FROM numbers(5);
+
+SELECT * FROM test2;
+```
+
+Result:
+
+``` text
+┌─FirstTable─┐
+│          0 │
+│          1 │
+└────────────┘
+
+┌─SecondTable─┐
+│           0 │
+│           0 │
+│           0 │
+│           1 │
+│           1 │
+└─────────────┘
+```
+
+Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour.
+
+[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
docs/en/operations/system-tables/opentelemetry_span_log.md (new file, 53 lines)
@@ -0,0 +1,53 @@
+# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log}
+
+Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries.
+
+Columns:
+
+- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for the executed query.
+- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.
+- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`.
+- `operation_name` ([String](../../sql-reference/data-types/string.md)) — The name of the operation.
+- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The start time of the `trace span` (in microseconds).
+- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — The finish time of the `trace span` (in microseconds).
+- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — The finish date of the `trace span`.
+- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [Attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations in the [OpenTelemetry](https://opentelemetry.io/) standard.
+- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Attribute values depending on the `trace span`. They are filled in according to the recommendations in the `OpenTelemetry` standard.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical;
+```
+
+Result:
+
+``` text
+Row 1:
+──────
+trace_id:         cdab0847-0d62-61d5-4d38-dd65b19a1914
+span_id:          701487461015578150
+parent_span_id:   2991972114672045096
+operation_name:   DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
+start_time_us:    1612374594529090
+finish_time_us:   1612374594529108
+finish_date:      2021-02-03
+attribute.names:  []
+attribute.values: []
+```
+
+**See Also**
+
+- [OpenTelemetry](../../operations/opentelemetry.md)
+
+[Original article](https://clickhouse.tech/docs/en/operations/system_tables/opentelemetry_span_log) <!--hide-->
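Not part of the commit — a follow-up query sketch built only from the columns listed above (the `trace_id` value is the one from the sample row):

```sql
-- All spans of one trace, with per-span duration in microseconds.
SELECT operation_name, finish_time_us - start_time_us AS duration_us
FROM system.opentelemetry_span_log
WHERE trace_id = toUUID('cdab0847-0d62-61d5-4d38-dd65b19a1914')
ORDER BY start_time_us;
```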
@@ -16,6 +16,8 @@ The `system.part_log` table contains the following columns:
     - `MOVE_PART` — Moving the data part from one disk to another.
 - `event_date` ([Date](../../sql-reference/data-types/date.md)) — Event date.
 - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Event time.
+- `event_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Event time with microseconds precision.
+
 - `duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration.
 - `database` ([String](../../sql-reference/data-types/string.md)) — Name of the database the data part is in.
 - `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table the data part is in.
@@ -47,6 +49,7 @@ query_id: 983ad9c7-28d5-4ae1-844e-603116b7de31
 event_type:    NewPart
 event_date:    2021-02-02
 event_time:    2021-02-02 11:14:28
+event_time_microseconds: 2021-02-02 11:14:28.861919
 duration_ms:   35
 database:      default
 table:         log_mt_2
@@ -1,9 +1,9 @@
 ---
 toc_priority: 47
-toc_title: ClickHouse Update
+toc_title: ClickHouse Upgrade
 ---

-# ClickHouse Update {#clickhouse-update}
+# ClickHouse Upgrade {#clickhouse-upgrade}

 If ClickHouse was installed from `deb` packages, execute the following commands on the server:

@@ -16,3 +16,19 @@ $ sudo service clickhouse-server restart
 If you installed ClickHouse using something other than the recommended `deb` packages, use the appropriate update method.

 ClickHouse does not support a distributed update. The operation should be performed consecutively on each separate server. Do not update all the servers on a cluster simultaneously, or the cluster will be unavailable for some time.
+
+To upgrade an older version of ClickHouse to a specific version, as an example:
+
+`xx.yy.a.b` is the current stable version. The latest stable version can be found [here](https://github.com/ClickHouse/ClickHouse/releases).
+
+```bash
+$ sudo apt-get update
+$ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b
+$ sudo service clickhouse-server restart
+```
@@ -0,0 +1,19 @@
+---
+toc_priority: 141
+---
+
+# deltaSum {#agg_functions-deltasum}
+
+Syntax: `deltaSum(value)`
+
+Adds the differences between consecutive rows. If the difference is negative, it is ignored.
+`value` must be an integer or floating-point type.
+
+Example:
+
+```sql
+select deltaSum(arrayJoin([1, 2, 3])); -- => 2
+select deltaSum(arrayJoin([1, 2, 3, 0, 3, 4, 2, 3])); -- => 7
+select deltaSum(arrayJoin([2.25, 3, 4.5])); -- => 2.25
+```
docs/en/sql-reference/data-types/map.md (new file, 83 lines)
@@ -0,0 +1,83 @@
+---
+toc_priority: 65
+toc_title: Map(key, value)
+---
+
+# Map(key, value) {#data_type-map}
+
+The `Map(key, value)` data type stores `key:value` pairs.
+
+**Parameters**
+- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
+- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).
+
+!!! warning "Warning"
+    Currently the `Map` data type is an experimental feature. To work with it you must set `allow_experimental_map_type = 1`.
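Not part of the commit — the one-line setup implied by the warning above:

```sql
-- Required before Map columns can be declared (experimental at the time of this commit).
SET allow_experimental_map_type = 1;
```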
+To get the value from an `a Map('key', 'value')` column, use the `a['key']` syntax. This lookup currently works in linear time.
+
+**Examples**
+
+Consider the table:
+
+``` sql
+CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory;
+INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
+```
+
+Select all `key2` values:
+
+```sql
+SELECT a['key2'] FROM table_map;
+```
+
+Result:
+
+```text
+┌─arrayElement(a, 'key2')─┐
+│                      10 │
+│                      20 │
+│                      30 │
+└─────────────────────────┘
+```
+
+If there's no such `key` in the `Map()` column, the query returns zeros for numerical values, and empty strings or empty arrays otherwise.
+
+```sql
+INSERT INTO table_map VALUES ({'key3':100}), ({});
+SELECT a['key3'] FROM table_map;
+```
+
+Result:
+
+```text
+┌─arrayElement(a, 'key3')─┐
+│                     100 │
+│                       0 │
+└─────────────────────────┘
+┌─arrayElement(a, 'key3')─┐
+│                       0 │
+│                       0 │
+│                       0 │
+└─────────────────────────┘
+```
+
+## Convert Tuple to Map Type {#map-and-tuple}
+
+You can cast `Tuple()` as `Map()` using the [CAST](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function:
+
+``` sql
+SELECT CAST(([1, 2, 3], ['Ready', 'Steady', 'Go']), 'Map(UInt8, String)') AS map;
+```
+
+``` text
+┌─map───────────────────────────┐
+│ {1:'Ready',2:'Steady',3:'Go'} │
+└───────────────────────────────┘
+```
+
+**See Also**
+
+- [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) function
+- [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function
+
+[Original article](https://clickhouse.tech/docs/en/data-types/map/) <!--hide-->
@@ -208,8 +208,8 @@ This function returns the value for the specified `id`s and the date range that

 Details of the algorithm:

 - If the `id` is not found or a range is not found for the `id`, it returns the default value for the dictionary.
-- If there are overlapping ranges, you can use any.
-- If the range delimiter is `NULL` or an invalid date (such as 1900-01-01 or 2039-01-01), the range is left open. The range can be open on both sides.
+- If there are overlapping ranges, it returns the value for any (random) matching range.
+- If the range delimiter is `NULL` or an invalid date (such as 1900-01-01), the range is open. The range can be open on both sides.

 Configuration example:
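Not part of the commit — a hedged sketch of a range lookup against such a dictionary (the dictionary, attribute, and key values are illustrative):

```sql
-- Returns the 'price' value whose date range contains 2021-02-01 for id 42;
-- if ranges overlap, any one matching range may be used, as described above.
SELECT dictGetFloat64('prices_dict', 'price', toUInt64(42), toDate('2021-02-01'));
```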
@@ -265,32 +265,81 @@ SELECT toIPv6('127.0.0.1')
 └─────────────────────┘
 ```

-## isIPv4String
+## isIPv4String {#isipv4string}

-Determines if the input string is an IPv4 address or not. Returns `1` if true `0` otherwise.
+Determines whether the input string is an IPv4 address or not. If `string` is an IPv6 address, returns `0`.

-``` sql
-SELECT isIPv4String('127.0.0.1')
-```
+**Syntax**
+
+```sql
+isIPv4String(string)
+```
+
+**Parameters**
+
+- `string` — IP address. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- `1` if `string` is an IPv4 address, `0` otherwise.
+
+Type: [UInt8](../../sql-reference/data-types/int-uint.md).
+
+**Examples**
+
+Query:
+
+```sql
+SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr
+```
+
+Result:

 ``` text
-┌─isIPv4String('127.0.0.1')─┐
-│                         1 │
-└───────────────────────────┘
+┌─addr─────────────┬─isIPv4String(addr)─┐
+│ 0.0.0.0          │                  1 │
+│ 127.0.0.1        │                  1 │
+│ ::ffff:127.0.0.1 │                  0 │
+└──────────────────┴────────────────────┘
 ```

-## isIPv6String
+## isIPv6String {#isipv6string}

-Determines if the input string is an IPv6 address or not. Returns `1` if true `0` otherwise.
+Determines whether the input string is an IPv6 address or not. If `string` is an IPv4 address, returns `0`.
+
+**Syntax**
+
+```sql
+isIPv6String(string)
+```
+
+**Parameters**
+
+- `string` — IP address. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- `1` if `string` is an IPv6 address, `0` otherwise.
+
+Type: [UInt8](../../sql-reference/data-types/int-uint.md).
+
+**Examples**
+
+Query:

 ``` sql
-SELECT isIPv6String('2001:438:ffff::407d:1bc1')
+SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr
 ```
+
+Result:

 ``` text
-┌─isIPv6String('2001:438:ffff::407d:1bc1')─┐
-│                                        1 │
-└──────────────────────────────────────────┘
+┌─addr─────────────┬─isIPv6String(addr)─┐
+│ ::               │                  1 │
+│ 1111::ffff       │                  1 │
+│ ::ffff:127.0.0.1 │                  1 │
+│ 127.0.0.1        │                  0 │
+└──────────────────┴────────────────────┘
 ```

 [Original article](https://clickhouse.tech/docs/en/query_language/functions/ip_address_functions/) <!--hide-->
@@ -909,6 +909,66 @@ WHERE diff != 1

 Same as [runningDifference](../../sql-reference/functions/other-functions.md#other_functions-runningdifference), except that it returns the value of the first row itself, and each subsequent row returns the difference from the previous row.

+## runningConcurrency {#runningconcurrency}
+
+Given a series of event beginning and ending times, this function calculates the concurrency of the events at each data point, that is, at each beginning time.
+
+!!! warning "Warning"
+    Events spanning multiple data blocks will not be processed correctly. The function resets its state for each new data block.
+
+The result of the function depends on the order of data in the block. It assumes the beginning times are sorted in ascending order.
+
+**Syntax**
+
+``` sql
+runningConcurrency(begin, end)
+```
+
+**Parameters**
+
+- `begin` — A column for the beginning time of events (inclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md).
+- `end` — A column for the ending time of events (exclusive). [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md), or [DateTime64](../../sql-reference/data-types/datetime64.md).
+
+Note that the two columns `begin` and `end` must have the same type.
+
+**Returned values**
+
+- The concurrency of events at the data point.
+
+Type: [UInt32](../../sql-reference/data-types/int-uint.md)
+
+**Example**
+
+Input table:
+
+``` text
+┌───────────────begin─┬─────────────────end─┐
+│ 2020-12-01 00:00:00 │ 2020-12-01 00:59:59 │
+│ 2020-12-01 00:30:00 │ 2020-12-01 00:59:59 │
+│ 2020-12-01 00:40:00 │ 2020-12-01 01:30:30 │
+│ 2020-12-01 01:10:00 │ 2020-12-01 01:30:30 │
+│ 2020-12-01 01:50:00 │ 2020-12-01 01:59:59 │
+└─────────────────────┴─────────────────────┘
+```
+
+Query:
+
+``` sql
+SELECT runningConcurrency(begin, end) FROM example
+```
+
+Result:
+
+``` text
+┌─runningConcurrency(begin, end)─┐
+│                              1 │
+│                              2 │
+│                              3 │
+│                              2 │
+│                              1 │
+└────────────────────────────────┘
+```
+
 ## MACNumToString(num) {#macnumtostringnum}

 Accepts a UInt64 number. Interprets it as a MAC address in big endian. Returns a string containing the corresponding MAC address in the format AA:BB:CC:DD:EE:FF (colon-separated numbers in hexadecimal form).
@@ -5,7 +5,68 @@ toc_title: Working with maps

 # Functions for maps {#functions-for-working-with-tuple-maps}

+## map {#function-map}
+
+Arranges `key:value` pairs into the [Map(key, value)](../../sql-reference/data-types/map.md) data type.
+
+**Syntax**
+
+``` sql
+map(key1, value1[, key2, value2, ...])
+```
+
+**Parameters**
+
+- `key` — The key part of the pair. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
+- `value` — The value part of the pair. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).
+
+**Returned value**
+
+- Data structure as `key:value` pairs.
+
+Type: [Map(key, value)](../../sql-reference/data-types/map.md).
+
+**Examples**
+
+Query:
+
+``` sql
+SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
+```
+
+Result:
+
+``` text
+┌─map('key1', number, 'key2', multiply(number, 2))─┐
+│ {'key1':0,'key2':0}                              │
+│ {'key1':1,'key2':2}                              │
+│ {'key1':2,'key2':4}                              │
+└──────────────────────────────────────────────────┘
+```
+
+Query:
+
+``` sql
+CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
+INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
+SELECT a['key2'] FROM table_map;
+```
+
+Result:
+
+``` text
+┌─arrayElement(a, 'key2')─┐
+│                       0 │
+│                       2 │
+│                       4 │
+└─────────────────────────┘
+```
+
+**See Also**
+
+- [Map(key, value)](../../sql-reference/data-types/map.md) data type
+
+
 ## mapAdd {#function-mapadd}

 Collects all the keys and sums corresponding values.
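Not part of the commit — a hedged example of the `mapAdd` behavior described above, using the tuple-of-arrays form this docs page works with (values are illustrative):

```sql
-- Keys are collected; values with equal keys are summed.
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) AS res;
-- res: ([1,2],[2,2])
```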
@@ -112,4 +174,4 @@ Result:
 └──────────────────────────────┴───────────────────────────────────┘
 ```

-[Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) <!--hide-->
+[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-map-functions/) <!--hide-->
@@ -20,6 +20,7 @@ The following actions are supported:

 - [ADD COLUMN](#alter_add-column) — Adds a new column to the table.
 - [DROP COLUMN](#alter_drop-column) — Deletes the column.
+- [RENAME COLUMN](#alter_rename-column) — Renames the column.
 - [CLEAR COLUMN](#alter_clear-column) — Resets column values.
 - [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column.
 - [MODIFY COLUMN](#alter_modify-column) — Changes column’s type, default expression and TTL.
@@ -78,6 +79,22 @@ Example:
 ALTER TABLE visits DROP COLUMN browser
 ```

+## RENAME COLUMN {#alter_rename-column}
+
+``` sql
+RENAME COLUMN [IF EXISTS] name to new_name
+```
+
+Renames the column `name` to `new_name`. If the `IF EXISTS` clause is specified, the query won’t return an error if the column doesn’t exist. Since renaming does not involve the underlying data, the query is completed almost instantly.
+
+**NOTE**: Columns specified in the key expression of the table (either with `ORDER BY` or `PRIMARY KEY`) cannot be renamed. Trying to change these columns will produce `SQL Error [524]`.
+
+Example:
+
+``` sql
+ALTER TABLE visits RENAME COLUMN webBrowser TO browser
+```
+
 ## CLEAR COLUMN {#alter_clear-column}

 ``` sql
@@ -59,6 +59,10 @@ A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Note

 The execution of [ALTER](../../../sql-reference/statements/alter/index.md) queries on materialized views has limitations, so they might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view; a sketch of this sequence is shown after this section.

+Note that materialized views are influenced by the [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged before the insertion into a view.
+
 Views look the same as normal tables. For example, they are listed in the result of the `SHOW TABLES` query.

 There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md).

 [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/view/) <!--hide-->
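Not part of the commit — a sketch of the `DETACH`/`ALTER`/`ATTACH` workaround described above for views created with `TO [db.]name` (all names are illustrative):

```sql
DETACH TABLE mv;                              -- detach the materialized view
ALTER TABLE target ADD COLUMN extra UInt32;   -- alter the target table directly
ATTACH TABLE mv;                              -- re-attach the view
```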
@@ -62,8 +62,6 @@ If a list of columns doesn't include all existing columns, the rest of the colum

 - The values calculated from the `DEFAULT` expressions specified in the table definition.
 - Zeros and empty strings, if `DEFAULT` expressions are not defined.

-If [strict_insert_defaults=1](../../operations/settings/settings.md), columns that do not have `DEFAULT` defined must be listed in the query.
-
 Data can be passed to the INSERT in any [format](../../interfaces/formats.md#formats) supported by ClickHouse. The format must be specified explicitly in the query:

 ``` sql
@@ -10,33 +10,51 @@ This is an experimental feature that is currently in development and is not ready
 for general use. It will change in unpredictable backwards-incompatible ways in
 the future releases. Set `allow_experimental_window_functions = 1` to enable it.

-ClickHouse currently supports calculation of aggregate functions over a window.
-Pure window functions such as `rank`, `lag`, `lead` and so on are not yet supported.
-
-The window can be specified either with an `OVER` clause or with a separate
-`WINDOW` clause.
-
-Only two variants of frame are supported, `ROWS` and `RANGE`. Offsets for the `RANGE` frame are not yet supported.
+ClickHouse supports the standard grammar for defining windows and window functions. The following features are currently supported (a sketch of the `lag/lead` workaround follows this table):
+
+| Feature | Support or workaround |
+| --------| ----------|
+| ad hoc window specification (`count(*) over (partition by id order by time desc)`) | supported |
+| expressions involving window functions, e.g. `(count(*) over ()) / 2` | not supported, wrap in a subquery ([feature request](https://github.com/ClickHouse/ClickHouse/issues/19857)) |
+| `WINDOW` clause (`select ... from table window w as (partition by id)`) | supported |
+| `ROWS` frame | supported |
+| `RANGE` frame | supported, the default |
+| `INTERVAL` syntax for `DateTime` `RANGE OFFSET` frame | not supported, specify the number of seconds instead |
+| `GROUPS` frame | not supported |
+| Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported |
+| `rank()`, `dense_rank()`, `row_number()` | supported |
+| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between <offset> preceding and <offset> preceding)`, or `following` for `lead` |
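Not part of the commit — a sketch of the `lag` workaround from the table above (the `events` table and its columns are illustrative):

```sql
-- Emulates lag(value, 1): take any(value) over a frame pinned to the previous row.
SELECT
    value,
    any(value) OVER (ORDER BY time ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS lag_1
FROM events;
```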
## References

### GitHub Issues

The roadmap for the initial support of window functions is [in this issue](https://github.com/ClickHouse/ClickHouse/issues/18097).

All GitHub issues related to window functions have the [comp-window-functions](https://github.com/ClickHouse/ClickHouse/labels/comp-window-functions) tag.

### Tests

These tests contain the examples of the currently supported grammar:

https://github.com/ClickHouse/ClickHouse/blob/master/tests/performance/window_functions.xml

https://github.com/ClickHouse/ClickHouse/blob/master/tests/queries/0_stateless/01591_window_functions.sql

### Postgres Docs

https://www.postgresql.org/docs/current/sql-select.html#SQL-WINDOW

https://www.postgresql.org/docs/devel/sql-expressions.html#SYNTAX-WINDOW-FUNCTIONS

https://www.postgresql.org/docs/devel/functions-window.html

https://www.postgresql.org/docs/devel/tutorial-window.html

### MySQL Docs

https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html

https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html

https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html
@@ -5,7 +5,7 @@ toc_title: Copia de seguridad de datos

 # Copia de seguridad de datos {#data-backup}

-Mientras que la [replicación](../engines/table-engines/mergetree-family/replication.md) proporciona protección contra fallos de hardware, no protege de errores humanos: el borrado accidental de datos, elminar la tabla equivocada o una tabla en el clúster equivocado, y bugs de software que dan como resultado un procesado incorrecto de los datos o la corrupción de los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse dispone de salvaguardas para prevenir algunos tipos de errores — por ejemplo, por defecto [no se puede simplemente eliminar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). Sin embargo, estas salvaguardas no cubren todos los casos posibles y pueden eludirse.
+Mientras que la [replicación](../engines/table-engines/mergetree-family/replication.md) proporciona protección contra fallos de hardware, no protege de errores humanos: el borrado accidental de datos, elminar la tabla equivocada o una tabla en el clúster equivocado, y bugs de software que dan como resultado un procesado incorrecto de los datos o la corrupción de los datos. En muchos casos, errores como estos afectarán a todas las réplicas. ClickHouse dispone de salvaguardas para prevenir algunos tipos de errores — por ejemplo, por defecto [no se puede simplemente eliminar tablas con un motor similar a MergeTree que contenga más de 50 Gb de datos](server-configuration-parameters/settings.md#max-table-size-to-drop). Sin embargo, estas salvaguardas no cubren todos los casos posibles y pueden eludirse.

 Para mitigar eficazmente los posibles errores humanos, debe preparar cuidadosamente una estrategia para realizar copias de seguridad y restaurar sus datos **previamente**.
@ -7,7 +7,7 @@ toc_title: "La Sauvegarde Des Donn\xE9es"
|
||||
|
||||
# La Sauvegarde Des Données {#data-backup}
|
||||
|
||||
Alors [réplication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [vous ne pouvez pas simplement supprimer des tables avec un moteur de type MergeTree contenant plus de 50 Go de données](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). Toutefois, ces garanties ne couvrent pas tous les cas possibles et peuvent être contournés.
|
||||
Alors [réplication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [vous ne pouvez pas simplement supprimer des tables avec un moteur de type MergeTree contenant plus de 50 Go de données](server-configuration-parameters/settings.md#max-table-size-to-drop). Toutefois, ces garanties ne couvrent pas tous les cas possibles et peuvent être contournés.
|
||||
|
||||
Afin d'atténuer efficacement les erreurs humaines possibles, vous devez préparer soigneusement une stratégie de sauvegarde et de restauration de vos données **préalablement**.
|
||||
|
||||
|
@ -7,7 +7,7 @@ toc_title: "\u30C7\u30FC\u30BF\u30D0\u30C3\u30AF\u30A2"

# Data Backup {#data-backup}

While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot just drop tables with a MergeTree-like engine containing more than 50 GB of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). However, these safeguards do not cover all possible cases and can be circumvented.
While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot just drop tables with a MergeTree-like engine containing more than 50 GB of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

To effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
@ -93,6 +93,7 @@ DDL-запросы в MySQL конвертируются в соответств

- Cascading `UPDATE/DELETE` queries are not supported by the `MaterializeMySQL` engine.
- Replication can easily be broken.
- Direct modification of data in `MaterializeMySQL` tables and databases is forbidden.
- `MaterializeMySQL` is affected by the [optimize_on_insert](../../operations/settings/settings.md#optimize-on-insert) setting. When a table changes on the MySQL server, the data is merged into the corresponding table in the `MaterializeMySQL` database.

## Examples of Use {#examples-of-use}
@ -156,4 +157,4 @@ SELECT * FROM mysql.test;
└───┴─────┴──────┘
```

[Original article](https://clickhouse.tech/docs/ru/database_engines/materialize-mysql/) <!--hide-->
[Original article](https://clickhouse.tech/docs/ru/engines/database-engines/materialize-mysql/) <!--hide-->
@ -12,7 +12,10 @@ toc_priority: 30

- [ODBC](../../../engines/table-engines/integrations/odbc.md)
- [JDBC](../../../engines/table-engines/integrations/jdbc.md)
- [MySQL](../../../engines/table-engines/integrations/mysql.md)
- [MongoDB](../../../engines/table-engines/integrations/mongodb.md)
- [HDFS](../../../engines/table-engines/integrations/hdfs.md)
- [Kafka](../../../engines/table-engines/integrations/kafka.md)
- [EmbeddedRocksDB](../../../engines/table-engines/integrations/embedded-rocksdb.md)
- [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md)

[Original article](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/) <!--hide-->
57
docs/ru/engines/table-engines/integrations/mongodb.md
Normal file
@ -0,0 +1,57 @@
---
toc_priority: 7
toc_title: MongoDB
---

# MongoDB {#mongodb}

The MongoDB table engine allows reading data from collections of the MongoDB DBMS. Only flat (non-nested) data types are allowed in the tables. Writing (`INSERT` queries) is not supported.

## Creating a Table {#creating-a-table}

``` sql
CREATE TABLE [IF NOT EXISTS] [db.]table_name
(
    name1 [type1],
    name2 [type2],
    ...
) ENGINE = MongoDB(host:port, database, collection, user, password);
```

**Engine Parameters**

- `host:port` — MongoDB server address.

- `database` — name of the database on the remote server.

- `collection` — name of the collection on the remote server.

- `user` — MongoDB user.

- `password` — user's password.

## Usage Example {#usage-example}

A table in ClickHouse for reading data from a MongoDB collection:

``` sql
CREATE TABLE mongo_table
(
    key UInt64,
    data String
) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'testuser', 'clickhouse');
```

Query the table:

``` sql
SELECT COUNT() FROM mongo_table;
```

``` text
┌─count()─┐
│       4 │
└─────────┘
```

[Original article](https://clickhouse.tech/docs/ru/operations/table_engines/integrations/mongodb/) <!--hide-->
@ -94,6 +94,7 @@ ORDER BY expr
- `max_parts_in_total` — maximum number of parts in all the partitions.
- `max_compress_block_size` — maximum size of blocks of uncompressed data before compressing for writing to the table. You can also set this value in the global settings (see [max_compress_block_size](../../../operations/settings/settings.md#max-compress-block-size)). The value specified when the table is created overrides the global one (see the sketch after this list).
- `min_compress_block_size` — minimum size of blocks of uncompressed data required for compression when writing the next mark. You can also set this value in the global settings (see [min_compress_block_size](../../../operations/settings/settings.md#min-compress-block-size)). The value specified when the table is created overrides the global one.
- `max_partitions_to_read` — limits the maximum number of partitions that can be read in a single query. The [max_partitions_to_read](../../../operations/settings/merge-tree-settings.md#max-partitions-to-read) setting can also be specified in the global settings.
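A minimal sketch of overriding the two compression settings above at table creation time; the table name `t` is hypothetical:

``` sql
-- Per-table values take priority over the global settings.
CREATE TABLE t (x UInt64) ENGINE = MergeTree() ORDER BY x
SETTINGS min_compress_block_size = 65536, max_compress_block_size = 1048576;
```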
**Example of Sections Setting**

@ -711,4 +712,4 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd'

After background merges or mutations are completed, old parts are not deleted immediately but only after some time (the `old_parts_lifetime` table setting). They are not moved to other volumes or disks either, so until they are finally removed they are still counted when calculating the occupied disk space.
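As an illustrative addition (not part of the original text), such inactive parts can be observed while they still occupy disk space:

``` sql
SELECT table, name, bytes_on_disk FROM system.parts WHERE active = 0;
```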
[Original article](https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/) <!--hide-->
[Original article](https://clickhouse.tech/docs/ru/engines/table-engines/mergetree-family/mergetree/) <!--hide-->
@ -5,7 +5,7 @@ toc_title: "\u0420\u0435\u0437\u0435\u0440\u0432\u043d\u043e\u0435\u0020\u043a\u

# Data Backup {#rezervnoe-kopirovanie-dannykh}

[Replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, but it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot drop \*MergeTree tables containing more than 50 GB of data with a single command](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). However, these safeguards do not cover all possible cases and can be circumvented.
[Replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, but it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot drop \*MergeTree tables containing more than 50 GB of data with a single command](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

To effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
@ -181,4 +181,16 @@ Eсли суммарное число активных кусков во все

At startup, ClickHouse reads all parts of all tables (it reads the files with part metadata) in order to build a list of all parts in RAM. On some systems with a large number of parts this process can take a long time; it can be shortened by increasing `max_part_loading_threads` (if the CPU and disk are not fully loaded during this process).

{## [Original article](https://clickhouse.tech/docs/ru/operations/settings/merge-tree-settings/) ##}
## max_partitions_to_read {#max-partitions-to-read}

Limits the maximum number of partitions that can be read in a single query.

The value specified when the table is created can be overridden at the query level.

Possible values:

- Any positive integer.

Default value: -1 (unlimited).
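A minimal sketch of a query-level override, assuming a hypothetical partitioned table `mt_table`:

``` sql
SELECT count() FROM mt_table SETTINGS max_partitions_to_read = 2;
```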
[Original article](https://clickhouse.tech/docs/ru/operations/settings/merge_tree_settings/) <!--hide-->
@ -2473,4 +2473,70 @@ SELECT SUM(-1), MAX(0) FROM system.one WHERE 0;

Default value: `16`.

## opentelemetry_start_trace_probability {#opentelemetry-start-trace-probability}

Sets the probability that ClickHouse starts a trace for executed queries (if no [parent trace context](https://www.w3.org/TR/trace-context/) is supplied).

Possible values:

- 0 — tracing of executed queries is disabled (unless a parent trace context is supplied).
- A positive floating-point number in the range [0..1]. For example, with the setting value `0.5`, ClickHouse starts a trace for half of the queries on average.
- 1 — tracing of all executed queries is enabled.

Default value: `0`.
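For instance, tracing roughly one query in ten could be enabled like this (the value `0.1` is illustrative):

``` sql
SET opentelemetry_start_trace_probability = 0.1;
```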
## optimize_on_insert {#optimize-on-insert}

Enables or disables data transformation before insertion into the table, as if a merge had been performed on the inserted block (according to the table engine).

Possible values:

- 0 — disabled.
- 1 — enabled.

Default value: 1.

**Example**

Compare inserting data with the setting enabled and disabled:

Query:

```sql
SET optimize_on_insert = 1;

CREATE TABLE test1 (`FirstTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY FirstTable;

INSERT INTO test1 SELECT number % 2 FROM numbers(5);

SELECT * FROM test1;

SET optimize_on_insert = 0;

CREATE TABLE test2 (`SecondTable` UInt32) ENGINE = ReplacingMergeTree ORDER BY SecondTable;

INSERT INTO test2 SELECT number % 2 FROM numbers(5);

SELECT * FROM test2;
```

Result:

``` text
┌─FirstTable─┐
│          0 │
│          1 │
└────────────┘

┌─SecondTable─┐
│           0 │
│           0 │
│           0 │
│           1 │
│           1 │
└─────────────┘
```

Note that this setting affects the behavior of [materialized views](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) databases.

[Original article](https://clickhouse.tech/docs/ru/operations/settings/settings/) <!--hide-->
49
docs/ru/operations/system-tables/opentelemetry_span_log.md
Normal file
@ -0,0 +1,49 @@
# system.opentelemetry_span_log {#system_tables-opentelemetry_span_log}

Contains information about [trace spans](https://opentracing.io/docs/overview/spans/) for executed queries.

Columns:

- `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for the executed query.

- `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.

- `parent_span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the parent `trace span`.

- `operation_name` ([String](../../sql-reference/data-types/string.md)) — name of the operation.

- `start_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — start time of the `trace span` (in microseconds).

- `finish_time_us` ([UInt64](../../sql-reference/data-types/int-uint.md)) — finish time of the `trace span` (in microseconds).

- `finish_date` ([Date](../../sql-reference/data-types/date.md)) — finish date of the `trace span`.

- `attribute.names` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — [attribute](https://opentelemetry.io/docs/go/instrumentation/#attributes) names depending on the `trace span`. They are filled in according to the recommendations of the [OpenTelemetry](https://opentelemetry.io/) standard.

- `attribute.values` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — attribute values depending on the `trace span`. They are filled in according to the recommendations of the `OpenTelemetry` standard.

**Example**

Query:

``` sql
SELECT * FROM system.opentelemetry_span_log LIMIT 1 FORMAT Vertical;
```

Result:

``` text
Row 1:
──────
trace_id:         cdab0847-0d62-61d5-4d38-dd65b19a1914
span_id:          701487461015578150
parent_span_id:   2991972114672045096
operation_name:   DB::Block DB::InterpreterSelectQuery::getSampleBlockImpl()
start_time_us:    1612374594529090
finish_time_us:   1612374594529108
finish_date:      2021-02-03
attribute.names:  []
attribute.values: []
```

[Original article](https://clickhouse.tech/docs/ru/operations/system_tables/opentelemetry_span_log) <!--hide-->
69
docs/ru/sql-reference/data-types/map.md
Normal file
@ -0,0 +1,69 @@
---
toc_priority: 65
toc_title: Map(key, value)
---

# Map(key, value) {#data_type-map}

The `Map(key, value)` data type stores `key:value` pairs.

**Parameters**
- `key` — the key. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — the value. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).

!!! warning "Warning"
    Currently the `Map` data type is an experimental feature. To use it, enable the setting `allow_experimental_map_type = 1`.
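For example, the setting can be enabled for the current session before creating a table with a `Map` column:

``` sql
SET allow_experimental_map_type = 1;
```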
To get the value from an `a Map('key', 'value')` column, use the `a['key']` syntax. Currently this lookup works with linear complexity.

**Examples**

Consider the table:

``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE=Memory;
INSERT INTO table_map VALUES ({'key1':1, 'key2':10}), ({'key1':2,'key2':20}), ({'key1':3,'key2':30});
```

Select all `key2` values:

```sql
SELECT a['key2'] FROM table_map;
```
Result:

```text
┌─arrayElement(a, 'key2')─┐
│                      10 │
│                      20 │
│                      30 │
└─────────────────────────┘
```

If there is no value for a given key `key` in a `Map()` column, the query returns zeros for numeric columns, and empty strings or empty arrays otherwise.

```sql
INSERT INTO table_map VALUES ({'key3':100}), ({});
SELECT a['key3'] FROM table_map;
```

Result:

```text
┌─arrayElement(a, 'key3')─┐
│                     100 │
│                       0 │
└─────────────────────────┘
┌─arrayElement(a, 'key3')─┐
│                       0 │
│                       0 │
│                       0 │
└─────────────────────────┘
```

**See Also**

- the [map()](../../sql-reference/functions/tuple-map-functions.md#function-map) function
- the [CAST()](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) function
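As a short illustration of the `CAST()` reference above, a tuple of key and value arrays can be converted to a `Map`; the example is adapted from the English docs of the same version:

``` sql
SELECT CAST(([1, 2, 3], ['Ready', 'Steady', 'Go']), 'Map(UInt8, String)') AS map;
```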
[Original article](https://clickhouse.tech/docs/ru/data-types/map/) <!--hide-->
@ -205,8 +205,8 @@ RANGE(MIN first MAX last)
Algorithm features:

- If the `id` is not found, or a range is not found for the `id`, the default value for the dictionary is returned.
- If there are overlapping ranges, any suitable one can be used.
- If a range border is `NULL` or an invalid date (1900-01-01, 2039-01-01), the range is considered open. The range can be open on both sides.
- If there are overlapping ranges, the value from any (random) suitable range is returned.
- If a range border is `NULL` or an invalid date (1900-01-01), the range is considered open. The range can be open on both sides.

Configuration example:
@ -243,4 +243,81 @@ SELECT
└───────────────────────────────────┴──────────────────────────────────┘
```

## isIPv4String {#isipv4string}

Determines whether a string is an IPv4 address or not. Also returns `0` if `string` is an IPv6 address.

**Syntax**

```sql
isIPv4String(string)
```

**Parameters**

- `string` — an IP address. [String](../../sql-reference/data-types/string.md).

**Returned value**

- `1` if `string` is an IPv4 address, otherwise `0`.

Type: [UInt8](../../sql-reference/data-types/int-uint.md).

**Examples**

Query:

```sql
SELECT addr, isIPv4String(addr) FROM ( SELECT ['0.0.0.0', '127.0.0.1', '::ffff:127.0.0.1'] AS addr ) ARRAY JOIN addr
```

Result:

``` text
┌─addr─────────────┬─isIPv4String(addr)─┐
│ 0.0.0.0          │                  1 │
│ 127.0.0.1        │                  1 │
│ ::ffff:127.0.0.1 │                  0 │
└──────────────────┴────────────────────┘
```

## isIPv6String {#isipv6string}

Determines whether a string is an IPv6 address or not. Also returns `0` if `string` is an IPv4 address.

**Syntax**

```sql
isIPv6String(string)
```

**Parameters**

- `string` — an IP address. [String](../../sql-reference/data-types/string.md).

**Returned value**

- `1` if `string` is an IPv6 address, otherwise `0`.

Type: [UInt8](../../sql-reference/data-types/int-uint.md).

**Examples**

Query:

``` sql
SELECT addr, isIPv6String(addr) FROM ( SELECT ['::', '1111::ffff', '::ffff:127.0.0.1', '127.0.0.1'] AS addr ) ARRAY JOIN addr
```

Result:

``` text
┌─addr─────────────┬─isIPv6String(addr)─┐
│ ::               │                  1 │
│ 1111::ffff       │                  1 │
│ ::ffff:127.0.0.1 │                  1 │
│ 127.0.0.1        │                  0 │
└──────────────────┴────────────────────┘
```

[Original article](https://clickhouse.tech/docs/ru/query_language/functions/ip_address_functions/) <!--hide-->
@ -5,6 +5,66 @@ toc_title: Работа с контейнерами map

# Functions for working with map containers {#functions-for-working-with-tuple-maps}

## map {#function-map}

Converts `key:value` pairs into the [Map(key, value)](../../sql-reference/data-types/map.md) data type.

**Syntax**

``` sql
map(key1, value1[, key2, value2, ...])
```

**Parameters**

- `key` — the key. [String](../../sql-reference/data-types/string.md) or [Integer](../../sql-reference/data-types/int-uint.md).
- `value` — the value. [String](../../sql-reference/data-types/string.md), [Integer](../../sql-reference/data-types/int-uint.md) or [Array](../../sql-reference/data-types/array.md).

**Returned value**

- A data structure of `key:value` pairs.

Type: [Map(key, value)](../../sql-reference/data-types/map.md).

**Examples**

Query:

``` sql
SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
```

Result:

``` text
┌─map('key1', number, 'key2', multiply(number, 2))─┐
│ {'key1':0,'key2':0}                              │
│ {'key1':1,'key2':2}                              │
│ {'key1':2,'key2':4}                              │
└──────────────────────────────────────────────────┘
```

Query:

``` sql
CREATE TABLE table_map (a Map(String, UInt64)) ENGINE = MergeTree() ORDER BY a;
INSERT INTO table_map SELECT map('key1', number, 'key2', number * 2) FROM numbers(3);
SELECT a['key2'] FROM table_map;
```

Result:

``` text
┌─arrayElement(a, 'key2')─┐
│                       0 │
│                       2 │
│                       4 │
└─────────────────────────┘
```

**See Also**

- the [Map(key, value)](../../sql-reference/data-types/map.md) data type

## mapAdd {#function-mapadd}

Collects all the keys and sums the corresponding values.
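The hunk is cut off at this point. As a hedged sketch, in this version `mapAdd` takes tuples of key and value arrays; the call shape follows the English docs of the same release:

``` sql
-- Expected result: ([1,2],[2,2])
SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) AS res;
```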
@ -56,9 +56,10 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na

The execution of `ALTER` queries on materialized views has not been fully developed, so they can be inconvenient to use. If the materialized view uses the `TO [db.]name` construction, you can `DETACH` the view, run `ALTER` on the target table, and then `ATTACH` the previously detached (`DETACH`) view, as in the sketch below.
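A minimal sketch of that workaround, with hypothetical names `mv` and `target_table`:

``` sql
DETACH TABLE mv;
ALTER TABLE target_table ADD COLUMN extra UInt32;
ATTACH TABLE mv;
```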
Note that a materialized view is affected by the [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged before being inserted into the view.

Views look the same as normal tables. For example, they are listed in the result of a `SHOW TABLES` query.

There is no separate query for deleting views. To delete a view, use `DROP TABLE`.

[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/create/view)
<!--hide-->
[Original article](https://clickhouse.tech/docs/ru/sql-reference/statements/create/view) <!--hide-->
@ -63,8 +63,6 @@ SELECT * FROM insert_select_testtable
- The values computed from the `DEFAULT` expressions specified in the table definition.
- Zeros and empty strings, if `DEFAULT` expressions are not defined.

If [strict_insert_defaults=1](../../operations/settings/settings.md), columns that do not have `DEFAULT` defined must be listed in the query.

Data can be passed to the INSERT in any [format](../../interfaces/formats.md#formats) supported by ClickHouse. The format must be specified explicitly in the query:

``` sql
@ -7,7 +7,7 @@ toc_title: "\u6570\u636E\u5907\u4EFD"

# Data Backup {#data-backup}

While [replication](../engines/table-engines/mergetree-family/replication.md) protects against data loss from hardware failures, it does not protect against human errors: accidentally deleting data, deleting the wrong table or a table on the wrong cluster, and software bugs that lead to incorrect data processing or data corruption. Such mistakes may affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot drop a table with a MergeTree-like engine containing more than 50 GB of data](https://github.com/ClickHouse/ClickHouse/blob/v18.14.18-stable/programs/server/config.xml#L322-L330). However, these safeguards do not cover all possible cases and can be circumvented.
While [replication](../engines/table-engines/mergetree-family/replication.md) protects against data loss from hardware failures, it does not protect against human errors: accidentally deleting data, deleting the wrong table or a table on the wrong cluster, and software bugs that lead to incorrect data processing or data corruption. Such mistakes may affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes. For example, by default [you cannot drop a table with a MergeTree-like engine containing more than 50 GB of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

To effectively mitigate possible human errors, you should prepare a strategy for backing up and restoring your data **in advance**.
@ -1374,7 +1374,30 @@ private:
        {
            // Probably the server is dead because we found an assertion
            // failure. Fail fast.
            fmt::print(stderr, "Lost connection to the server\n");
            fmt::print(stderr, "Lost connection to the server.\n");

            // Print the changed settings because they might be needed to
            // reproduce the error.
            const auto & changes = context.getSettingsRef().changes();
            if (!changes.empty())
            {
                fmt::print(stderr, "Changed settings: ");
                for (size_t i = 0; i < changes.size(); ++i)
                {
                    if (i)
                    {
                        fmt::print(stderr, ", ");
                    }
                    fmt::print(stderr, "{} = '{}'", changes[i].name,
                        toString(changes[i].value));
                }
                fmt::print(stderr, "\n");
            }
            else
            {
                fmt::print(stderr, "No changed settings.\n");
            }

            return false;
        }
@ -363,6 +363,16 @@ void QueryFuzzer::fuzzWindowFrame(WindowFrame & frame)
            frame.end_offset = getRandomField(0).get<Int64>();
            break;
        }
        case 5:
        {
            frame.begin_preceding = fuzz_rand() % 2;
            break;
        }
        case 6:
        {
            frame.end_preceding = fuzz_rand() % 2;
            break;
        }
        default:
            break;
    }
@ -29,4 +29,25 @@
        <test>{display_name} \x01\e[1;32m\x02:)\x01\e[0m\x02 </test> <!-- if it matched to the substring "test" in the server display name -->
        <production>{display_name} \x01\e[1;31m\x02:)\x01\e[0m\x02 </production> <!-- if it matched to the substring "production" in the server display name -->
    </prompt_by_server_display_name>

    <!--
        Settings adjustable via command-line parameters
        can take their defaults from that config file, see examples:

        <host>127.0.0.1</host>
        <port>9440</port>
        <secure>1</secure>
        <user>dbuser</user>
        <password>dbpwd123</password>
        <format>PrettyCompactMonoBlock</format>
        <multiline>1</multiline>
        <multiquery>1</multiquery>
        <stacktrace>1</stacktrace>
        <database>default2</database>
        <pager>less -SR</pager>
        <history_file>/home/user/clickhouse_custom_history.log</history_file>
        <max_parser_depth>2500</max_parser_depth>

        The same can be done on user-level configuration, just create & adjust: ~/.clickhouse-client/config.xml
    -->
</config>
@ -316,9 +316,6 @@ void ClusterCopier::process(const ConnectionTimeouts & timeouts)
        }
    }

    /// Delete helping tables in both cases (whole table is done or not)
    dropHelpingTables(task_table);

    if (!table_is_done)
    {
        throw Exception("Too many tries to process table " + task_table.table_id + ". Abort remaining execution",
@ -1044,6 +1041,11 @@ bool ClusterCopier::tryProcessTable(const ConnectionTimeouts & timeouts, TaskTab
    {
        LOG_INFO(log, "Table {} is not processed yet.Copied {} of {}, will retry", task_table.table_id, finished_partitions, required_partitions);
    }
    else
    {
        /// Delete helping tables in case that whole table is done
        dropHelpingTables(task_table);
    }

    return table_is_done;
}
@ -59,7 +59,6 @@
#include <Disks/registerDisks.h>
#include <Common/Config/ConfigReloader.h>
#include <Server/HTTPHandlerFactory.h>
#include <Server/TestKeeperTCPHandlerFactory.h>
#include "MetricsTransmitter.h"
#include <Common/StatusFile.h>
#include <Server/TCPHandlerFactory.h>
@ -94,6 +93,9 @@
#    include <Server/GRPCServer.h>
#endif

#if USE_NURAFT
#    include <Server/NuKeeperTCPHandlerFactory.h>
#endif

namespace CurrentMetrics
{
@ -843,23 +845,33 @@ int Server::main(const std::vector<std::string> & /*args*/)
            listen_try = true;
        }

        for (const auto & listen_host : listen_hosts)
        if (config().has("test_keeper_server"))
        {
            /// TCP TestKeeper
            const char * port_name = "test_keeper_server.tcp_port";
            createServer(listen_host, port_name, listen_try, [&](UInt16 port)
#if USE_NURAFT
            /// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config.
            global_context->initializeNuKeeperStorageDispatcher();
            for (const auto & listen_host : listen_hosts)
            {
                Poco::Net::ServerSocket socket;
                auto address = socketBindListen(socket, listen_host, port);
                socket.setReceiveTimeout(settings.receive_timeout);
                socket.setSendTimeout(settings.send_timeout);
                servers_to_start_before_tables->emplace_back(
                    port_name,
                    std::make_unique<Poco::Net::TCPServer>(
                        new TestKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));
                /// TCP NuKeeper
                const char * port_name = "test_keeper_server.tcp_port";
                createServer(listen_host, port_name, listen_try, [&](UInt16 port)
                {
                    Poco::Net::ServerSocket socket;
                    auto address = socketBindListen(socket, listen_host, port);
                    socket.setReceiveTimeout(settings.receive_timeout);
                    socket.setSendTimeout(settings.send_timeout);
                    servers_to_start_before_tables->emplace_back(
                        port_name,
                        std::make_unique<Poco::Net::TCPServer>(
                            new NuKeeperTCPHandlerFactory(*this), server_pool, socket, new Poco::Net::TCPServerParams));

                    LOG_INFO(log, "Listening for connections to NuKeeper (tcp): {}", address.toString());
                });
            }
#else
            throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse server built without NuRaft library. Cannot use internal coordination.");
#endif

                LOG_INFO(log, "Listening for connections to fake zookeeper (tcp): {}", address.toString());
            });
        }

        for (auto & server : *servers_to_start_before_tables)
@ -899,6 +911,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
            LOG_INFO(log, "Closed connections to servers for tables. But {} remain. Probably some tables of other users cannot finish their connections after context shutdown.", current_connections);
        else
            LOG_INFO(log, "Closed connections to servers for tables.");

        global_context->shutdownNuKeeperStorageDispatcher();
    }

    /** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available.
@ -421,9 +421,15 @@
    <!-- Comma-separated list of prefixes for user-defined settings. -->
    <custom_settings_prefixes></custom_settings_prefixes>

    <!-- System profile of settings. This settings are used by internal processes (Buffer storage, Distributed DDL worker and so on). -->
    <!-- System profile of settings. This settings are used by internal processes (Distributed DDL worker and so on). -->
    <!-- <system_profile>default</system_profile> -->

    <!-- Buffer profile of settings.
         This settings are used by Buffer storage to flush data to the underlying table.
         Default: used from system_profile directive.
    -->
    <!-- <buffer_profile>default</buffer_profile> -->

    <!-- Default database. -->
    <default_database>default</default_database>
@ -217,6 +217,7 @@ namespace
        /// Write the file.
        WriteBufferFromFile out{tmp_file_path.string()};
        out.write(file_contents.data(), file_contents.size());
        out.close();

        /// Rename.
        std::filesystem::rename(tmp_file_path, file_path);
@ -274,6 +275,7 @@ namespace
            writeStringBinary(name, out);
            writeUUIDText(id, out);
        }
        out.close();
    }
@ -34,6 +34,14 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
    factory.registerFunction("any", { createAggregateFunctionAny, properties });
    factory.registerFunction("anyLast", { createAggregateFunctionAnyLast, properties });
    factory.registerFunction("anyHeavy", { createAggregateFunctionAnyHeavy, properties });

    // Synonyms for use as window functions.
    factory.registerFunction("first_value",
        { createAggregateFunctionAny, properties },
        AggregateFunctionFactory::CaseInsensitive);
    factory.registerFunction("last_value",
        { createAggregateFunctionAnyLast, properties },
        AggregateFunctionFactory::CaseInsensitive);
}

}
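A hedged SQL sketch of the synonyms registered above used over a window; window functions were still behind an experimental switch around this release:

``` sql
SET allow_experimental_window_functions = 1;
SELECT number, first_value(number) OVER (ORDER BY number) AS fv FROM numbers(3);
```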
49
src/AggregateFunctions/AggregateFunctionDeltaSum.cpp
Normal file
@ -0,0 +1,49 @@
|
||||
#include <AggregateFunctions/AggregateFunctionDeltaSum.h>
|
||||
|
||||
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
||||
#include <AggregateFunctions/FactoryHelpers.h>
|
||||
#include <AggregateFunctions/Helpers.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
AggregateFunctionPtr createAggregateFunctionDeltaSum(
|
||||
const String & name,
|
||||
const DataTypes & arguments,
|
||||
const Array & params)
|
||||
{
|
||||
assertNoParameters(name, params);
|
||||
|
||||
if (arguments.size() != 1)
|
||||
throw Exception("Incorrect number of arguments for aggregate function " + name,
|
||||
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
||||
|
||||
DataTypePtr data_type = arguments[0];
|
||||
|
||||
if (isInteger(data_type) || isFloat(data_type))
|
||||
return AggregateFunctionPtr(createWithNumericType<AggregationFunctionDeltaSum>(
|
||||
*data_type, arguments, params));
|
||||
else
|
||||
throw Exception("Illegal type " + arguments[0]->getName() + " of argument for aggregate function " + name,
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
}
|
||||
}
|
||||
|
||||
void registerAggregateFunctionDeltaSum(AggregateFunctionFactory & factory)
|
||||
{
|
||||
AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = true };
|
||||
|
||||
factory.registerFunction("deltaSum", { createAggregateFunctionDeltaSum, properties });
|
||||
}
|
||||
|
||||
}
|
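A hedged usage sketch for the `deltaSum` function registered above; it sums the positive differences between consecutive values, treating a decrease as a counter reset:

``` sql
-- Deltas: +1, +1, (reset ignored), +3, +2  =>  7
SELECT deltaSum(x) FROM values('x Int64', (1), (2), (3), (0), (3), (5));
```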
129
src/AggregateFunctions/AggregateFunctionDeltaSum.h
Normal file
@ -0,0 +1,129 @@
#pragma once

#include <type_traits>
#include <experimental/type_traits>

#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>

#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>

#include <AggregateFunctions/IAggregateFunction.h>


namespace DB
{

template <typename T>
struct AggregationFunctionDeltaSumData
{
    T sum = 0;
    T last = 0;
    T first = 0;
    bool seen_last = false;
    bool seen_first = false;
};

template <typename T>
class AggregationFunctionDeltaSum final
    : public IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>
{
public:
    AggregationFunctionDeltaSum(const DataTypes & arguments, const Array & params)
        : IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{arguments, params}
    {}

    AggregationFunctionDeltaSum()
        : IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{}
    {}

    String getName() const override { return "deltaSum"; }

    DataTypePtr getReturnType() const override { return std::make_shared<DataTypeNumber<T>>(); }

    void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
    {
        auto value = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];

        if ((this->data(place).last < value) && this->data(place).seen_last)
        {
            this->data(place).sum += (value - this->data(place).last);
        }

        this->data(place).last = value;
        this->data(place).seen_last = true;

        if (!this->data(place).seen_first)
        {
            this->data(place).first = value;
            this->data(place).seen_first = true;
        }
    }

    void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
    {
        auto place_data = &this->data(place);
        auto rhs_data = &this->data(rhs);

        if ((place_data->last < rhs_data->first) && place_data->seen_last && rhs_data->seen_first)
        {
            // If the lhs last number seen is less than the first number the rhs saw, the lhs is before
            // the rhs, for example [0, 2] [4, 7]. So we want to add the deltasums, but also add the
            // difference between lhs last number and rhs first number (the 2 and 4). Then we want to
            // take last value from the rhs, so first and last become 0 and 7.

            place_data->sum += rhs_data->sum + (rhs_data->first - place_data->last);
            place_data->last = rhs_data->last;
        }
        else if ((rhs_data->last < place_data->first && rhs_data->seen_last && place_data->seen_first))
        {
            // In the opposite scenario, the lhs comes after the rhs, e.g. [4, 6] [1, 2]. Since we
            // assume the input interval states are sorted by time, we assume this is a counter
            // reset, and therefore do *not* add the difference between our first value and the
            // rhs last value.

            place_data->sum += rhs_data->sum;
            place_data->first = rhs_data->first;
        }
        else if (rhs_data->seen_first)
        {
            // If we're here then the lhs is an empty state and the rhs does have some state, so
            // we'll just take that state.

            place_data->first = rhs_data->first;
            place_data->seen_first = rhs_data->seen_first;
            place_data->last = rhs_data->last;
            place_data->seen_last = rhs_data->seen_last;
            place_data->sum = rhs_data->sum;
        }

        // Otherwise lhs either has data or is uninitialized, so we don't need to modify its values.
    }

    void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf) const override
    {
        writeIntBinary(this->data(place).sum, buf);
        writeIntBinary(this->data(place).first, buf);
        writeIntBinary(this->data(place).last, buf);
        writePODBinary<bool>(this->data(place).seen_first, buf);
        writePODBinary<bool>(this->data(place).seen_last, buf);
    }

    void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, Arena *) const override
    {
        readIntBinary(this->data(place).sum, buf);
        readIntBinary(this->data(place).first, buf);
        readIntBinary(this->data(place).last, buf);
        readPODBinary<bool>(this->data(place).seen_first, buf);
        readPODBinary<bool>(this->data(place).seen_last, buf);
    }

    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
    {
        assert_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).sum);
    }
};

}
@ -40,7 +40,7 @@ struct MovingData
    Array value; /// Prefix sums.
    T sum = 0;

    void add(T val, Arena * arena)
    void NO_SANITIZE_UNDEFINED add(T val, Arena * arena)
    {
        sum += val;
        value.push_back(sum, arena);
@ -120,7 +120,7 @@ public:
        this->data(place).add(static_cast<ResultT>(value), arena);
    }

    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
    void NO_SANITIZE_UNDEFINED merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
    {
        auto & cur_elems = this->data(place);
        auto & rhs_elems = this->data(rhs);
@ -16,6 +16,22 @@ namespace ErrorCodes

namespace
{

template <template <typename, typename> class AggregateFunctionTemplate, template <typename> class Data, typename... TArgs>
static IAggregateFunction * createWithIntegerType(const IDataType & argument_type, TArgs && ... args)
{
    WhichDataType which(argument_type);
    if (which.idx == TypeIndex::UInt8) return new AggregateFunctionTemplate<UInt8, Data<UInt8>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::UInt16) return new AggregateFunctionTemplate<UInt16, Data<UInt16>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::UInt32) return new AggregateFunctionTemplate<UInt32, Data<UInt32>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::UInt64) return new AggregateFunctionTemplate<UInt64, Data<UInt64>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::Int8) return new AggregateFunctionTemplate<Int8, Data<Int8>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::Int16) return new AggregateFunctionTemplate<Int16, Data<Int16>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::Int32) return new AggregateFunctionTemplate<Int32, Data<Int32>>(std::forward<TArgs>(args)...);
    if (which.idx == TypeIndex::Int64) return new AggregateFunctionTemplate<Int64, Data<Int64>>(std::forward<TArgs>(args)...);
    return nullptr;
}

template <template <typename> class Data>
AggregateFunctionPtr createAggregateFunctionBitmap(const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
@ -28,7 +44,7 @@ namespace
            + " is illegal, because it cannot be used in Bitmap operations",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

    AggregateFunctionPtr res(createWithUnsignedIntegerType<AggregateFunctionBitmap, Data>(*argument_types[0], argument_types[0]));
    AggregateFunctionPtr res(createWithIntegerType<AggregateFunctionBitmap, Data>(*argument_types[0], argument_types[0]));

    if (!res)
        throw Exception(
@ -55,7 +71,7 @@ namespace
        const DataTypeAggregateFunction & datatype_aggfunc = dynamic_cast<const DataTypeAggregateFunction &>(*argument_type_ptr);
        AggregateFunctionPtr aggfunc = datatype_aggfunc.getFunction();
        argument_type_ptr = aggfunc->getArgumentTypes()[0];
        AggregateFunctionPtr res(createWithUnsignedIntegerType<AggregateFunctionTemplate, AggregateFunctionGroupBitmapData>(
        AggregateFunctionPtr res(createWithIntegerType<AggregateFunctionTemplate, AggregateFunctionGroupBitmapData>(
            *argument_type_ptr, argument_type_ptr));
        if (!res)
            throw Exception(
@ -32,6 +32,7 @@ template <typename T, UInt8 small_set_size>
class RoaringBitmapWithSmallSet : private boost::noncopyable
{
private:
    using UnsignedT = std::make_unsigned_t<T>;
    SmallSet<T, small_set_size> small;
    using ValueBuffer = std::vector<T>;
    using RoaringBitmap = std::conditional_t<sizeof(T) >= 8, roaring::Roaring64Map, roaring::Roaring>;
@ -363,6 +364,7 @@ public:
    /**
     * Check whether the argument is the subset of this set.
     * Empty set is a subset of any other set (consistent with hasAll).
     * It's used in subset and currently only support comparing same type
     */
    UInt8 rb_is_subset(const RoaringBitmapWithSmallSet & r1) const
    {
@ -486,6 +488,7 @@ public:

    /**
     * Return new set with specified range (not include the range_end)
     * It's used in subset and currently only support UInt32
     */
    UInt64 rb_range(UInt64 range_start, UInt64 range_end, RoaringBitmapWithSmallSet & r1) const
    {
@ -525,6 +528,7 @@ public:

    /**
     * Return new set of the smallest `limit` values in set which is no less than `range_start`.
     * It's used in subset and currently only support UInt32
     */
    UInt64 rb_limit(UInt64 range_start, UInt64 limit, RoaringBitmapWithSmallSet & r1) const
    {
@ -578,10 +582,10 @@ public:
    {
        if (small.empty())
            return 0;
        auto min_val = std::numeric_limits<std::make_unsigned_t<T>>::max();
        auto min_val = std::numeric_limits<UnsignedT>::max();
        for (const auto & x : small)
        {
            auto val = x.getValue();
            UnsignedT val = x.getValue();
            if (val < min_val)
                min_val = val;
        }
@ -597,10 +601,10 @@ public:
    {
        if (small.empty())
            return 0;
        auto max_val = std::numeric_limits<std::make_unsigned_t<T>>::min();
        UnsignedT max_val = 0;
        for (const auto & x : small)
        {
            auto val = x.getValue();
            UnsignedT val = x.getValue();
            if (val > max_val)
                max_val = val;
        }
@ -611,7 +615,8 @@ public:
    }

    /**
     * Replace value
     * Replace value.
     * It's used in transform and currently can only support UInt32
     */
    void rb_replace(const UInt64 * from_vals, const UInt64 * to_vals, size_t num)
    {
@ -26,6 +26,7 @@ class ReadBuffer;
class WriteBuffer;
class IColumn;
class IDataType;
class IWindowFunction;

using DataTypePtr = std::shared_ptr<const IDataType>;
using DataTypes = std::vector<DataTypePtr>;
@ -215,6 +216,20 @@ public:
    const DataTypes & getArgumentTypes() const { return argument_types; }
    const Array & getParameters() const { return parameters; }

    // Any aggregate function can be calculated over a window, but there are some
    // window functions such as rank() that require a different interface, e.g.
    // because they don't respect the window frame, or need to be notified when
    // a new peer group starts. They pretend to be normal aggregate functions,
    // but will fail if you actually try to use them in Aggregator. The
    // WindowTransform recognizes these functions and handles them differently.
    // We could have a separate factory for window functions, and make all
    // aggregate functions implement IWindowFunction interface and so on. This
    // would be more logically correct, but more complex. We only have a handful
    // of true window functions, so this hack-ish interface suffices.
    virtual IWindowFunction * asWindowFunction() { return nullptr; }
    virtual const IWindowFunction * asWindowFunction() const
    { return const_cast<IAggregateFunction *>(this)->asWindowFunction(); }

protected:
    DataTypes argument_types;
    Array parameters;
@ -11,6 +11,7 @@ class AggregateFunctionFactory;
void registerAggregateFunctionAvg(AggregateFunctionFactory &);
void registerAggregateFunctionAvgWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionCount(AggregateFunctionFactory &);
void registerAggregateFunctionDeltaSum(AggregateFunctionFactory &);
void registerAggregateFunctionGroupArray(AggregateFunctionFactory &);
void registerAggregateFunctionGroupUniqArray(AggregateFunctionFactory &);
void registerAggregateFunctionGroupArrayInsertAt(AggregateFunctionFactory &);
@ -57,6 +58,8 @@ void registerAggregateFunctionCombinatorOrFill(AggregateFunctionCombinatorFactor
void registerAggregateFunctionCombinatorResample(AggregateFunctionCombinatorFactory &);
void registerAggregateFunctionCombinatorDistinct(AggregateFunctionCombinatorFactory &);

void registerWindowFunctions(AggregateFunctionFactory & factory);


void registerAggregateFunctions()
{
@ -66,6 +69,7 @@ void registerAggregateFunctions()
    registerAggregateFunctionAvg(factory);
    registerAggregateFunctionAvgWeighted(factory);
    registerAggregateFunctionCount(factory);
    registerAggregateFunctionDeltaSum(factory);
    registerAggregateFunctionGroupArray(factory);
    registerAggregateFunctionGroupUniqArray(factory);
    registerAggregateFunctionGroupArrayInsertAt(factory);
@ -101,6 +105,8 @@ void registerAggregateFunctions()
    registerAggregateFunctionMannWhitney(factory);
    registerAggregateFunctionWelchTTest(factory);
    registerAggregateFunctionStudentTTest(factory);

    registerWindowFunctions(factory);
}
@ -19,6 +19,7 @@ SRCS(
    AggregateFunctionCategoricalInformationValue.cpp
    AggregateFunctionCombinatorFactory.cpp
    AggregateFunctionCount.cpp
    AggregateFunctionDeltaSum.cpp
    AggregateFunctionDistinct.cpp
    AggregateFunctionEntropy.cpp
    AggregateFunctionFactory.cpp
@ -60,6 +60,7 @@ add_subdirectory (Processors)
add_subdirectory (Formats)
add_subdirectory (Compression)
add_subdirectory (Server)
add_subdirectory (Coordination)


set(dbms_headers)
@ -100,8 +101,8 @@ endif()
list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD})
list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON})

list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp)
list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h)
list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp)
list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h)

list (APPEND dbms_sources
    AggregateFunctions/AggregateFunctionFactory.cpp
@ -192,6 +193,10 @@ add_object_library(clickhouse_processors_merges_algorithms Processors/Merges/Alg
add_object_library(clickhouse_processors_queryplan Processors/QueryPlan)
add_object_library(clickhouse_processors_queryplan_optimizations Processors/QueryPlan/Optimizations)

if (USE_NURAFT)
    add_object_library(clickhouse_coordination Coordination)
endif()

set (DBMS_COMMON_LIBRARIES)
# libgcc_s does not provide an implementation of an atomics library. Instead,
# GCC’s libatomic library can be used to supply these when using libgcc_s.
@ -314,7 +319,7 @@ if (USE_KRB5)
endif()

if (USE_NURAFT)
    dbms_target_link_libraries(PRIVATE ${NURAFT_LIBRARY})
    dbms_target_link_libraries(PUBLIC ${NURAFT_LIBRARY})
endif()

if(RE2_INCLUDE_DIR)
@ -30,6 +30,12 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
}

template class DecimalPaddedPODArray<Decimal32>;
template class DecimalPaddedPODArray<Decimal64>;
template class DecimalPaddedPODArray<Decimal128>;
template class DecimalPaddedPODArray<Decimal256>;
template class DecimalPaddedPODArray<DateTime64>;

template <typename T>
int ColumnDecimal<T>::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const
{
@ -370,4 +376,5 @@ template class ColumnDecimal<Decimal64>;
template class ColumnDecimal<Decimal128>;
template class ColumnDecimal<Decimal256>;
template class ColumnDecimal<DateTime64>;

}
@ -50,6 +50,14 @@ private:
    UInt32 scale;
};

/// Prevent implicit template instantiation of DecimalPaddedPODArray for common decimal types

extern template class DecimalPaddedPODArray<Decimal32>;
extern template class DecimalPaddedPODArray<Decimal64>;
extern template class DecimalPaddedPODArray<Decimal128>;
extern template class DecimalPaddedPODArray<Decimal256>;
extern template class DecimalPaddedPODArray<DateTime64>;

/// A ColumnVector for Decimals
template <typename T>
class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T>>
@ -215,4 +223,14 @@ ColumnPtr ColumnDecimal<T>::indexImpl(const PaddedPODArray<Type> & indexes, size
    return res;
}


/// Prevent implicit template instantiation of ColumnDecimal for common decimal types

extern template class ColumnDecimal<Decimal32>;
extern template class ColumnDecimal<Decimal64>;
extern template class ColumnDecimal<Decimal128>;
extern template class ColumnDecimal<Decimal256>;
extern template class ColumnDecimal<DateTime64>;


}
@ -535,4 +535,5 @@ template class ColumnVector<Int128>;
template class ColumnVector<Int256>;
template class ColumnVector<Float32>;
template class ColumnVector<Float64>;

}
@ -345,4 +345,21 @@ ColumnPtr ColumnVector<T>::indexImpl(const PaddedPODArray<Type> & indexes, size_
    return res;
}

/// Prevent implicit template instantiation of ColumnVector for common types

extern template class ColumnVector<UInt8>;
extern template class ColumnVector<UInt16>;
extern template class ColumnVector<UInt32>;
extern template class ColumnVector<UInt64>;
extern template class ColumnVector<UInt128>;
extern template class ColumnVector<UInt256>;
extern template class ColumnVector<Int8>;
extern template class ColumnVector<Int16>;
extern template class ColumnVector<Int32>;
extern template class ColumnVector<Int64>;
extern template class ColumnVector<Int128>;
extern template class ColumnVector<Int256>;
extern template class ColumnVector<Float32>;
extern template class ColumnVector<Float64>;

}
@ -19,3 +19,8 @@
  */
__attribute__((__weak__)) extern const size_t MMAP_THRESHOLD = 16384;
#endif

template class Allocator<false, false>;
template class Allocator<true, false>;
template class Allocator<false, true>;
template class Allocator<true, true>;
@ -352,6 +352,12 @@ template<typename Base, size_t initial_bytes, size_t Alignment>
constexpr size_t allocatorInitialBytes<AllocatorWithStackMemory<
    Base, initial_bytes, Alignment>> = initial_bytes;

/// Prevent implicit template instantiation of Allocator

extern template class Allocator<false, false>;
extern template class Allocator<true, false>;
extern template class Allocator<false, true>;
extern template class Allocator<true, true>;

#if !__clang__
#pragma GCC diagnostic pop
@ -28,12 +28,12 @@ namespace ColumnsHashing

/// For the case when there is one numeric key.
/// UInt8/16/32/64 for any type with corresponding bit width.
template <typename Value, typename Mapped, typename FieldType, bool use_cache = true>
template <typename Value, typename Mapped, typename FieldType, bool use_cache = true, bool need_offset = false>
struct HashMethodOneNumber
    : public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType, use_cache>, Value, Mapped, use_cache>
    : public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
{
    using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
    using Self = HashMethodOneNumber<Value, Mapped, FieldType, use_cache, need_offset>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;

    const char * vec;
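A hypothetical instantiation (not part of the diff) showing what the new parameter buys: with `need_offset = true` the method's `FindResult` additionally exposes `getOffset()`. Note that `use_cache` has to stay `false` here, because a `static_assert` added later in this diff forbids combining the offset with the consecutive-keys cache:

    using Method = ColumnsHashing::HashMethodOneNumber<
        UInt64, /* Mapped */ UInt64, /* FieldType */ UInt64,
        /* use_cache */ false, /* need_offset */ true>;
    static_assert(Method::FindResult::has_offset);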
@ -70,12 +70,12 @@ struct HashMethodOneNumber

/// For the case when there is one string key.
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true>
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
struct HashMethodString
    : public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, place_string_to_arena, use_cache>, Value, Mapped, use_cache>
    : public columns_hashing_impl::HashMethodBase<HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
{
    using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
    using Self = HashMethodString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;

    const IColumn::Offset * offsets;
    const UInt8 * chars;
@ -108,12 +108,13 @@ protected:

/// For the case when there is one fixed-length string key.
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true>
template <typename Value, typename Mapped, bool place_string_to_arena = true, bool use_cache = true, bool need_offset = false>
struct HashMethodFixedString
    : public columns_hashing_impl::HashMethodBase<HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache>, Value, Mapped, use_cache>
    : public columns_hashing_impl::
          HashMethodBase<HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
{
    using Self = HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
    using Self = HashMethodFixedString<Value, Mapped, place_string_to_arena, use_cache, need_offset>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;

    size_t n;
    const ColumnFixedString::Chars * chars;
@ -454,13 +455,20 @@ template <>
struct LowCardinalityKeys<false> {};

/// For the case when all keys are of fixed length, and they fit in N (for example, 128) bits.
template <typename Value, typename Key, typename Mapped, bool has_nullable_keys_ = false, bool has_low_cardinality_ = false, bool use_cache = true>
template <
    typename Value,
    typename Key,
    typename Mapped,
    bool has_nullable_keys_ = false,
    bool has_low_cardinality_ = false,
    bool use_cache = true,
    bool need_offset = false>
struct HashMethodKeysFixed
    : private columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>
    , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache>, Value, Mapped, use_cache>
    , public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
{
    using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache>;
    using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
    using Self = HashMethodKeysFixed<Value, Key, Mapped, has_nullable_keys_, has_low_cardinality_, use_cache, need_offset>;
    using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;
    using Base = columns_hashing_impl::BaseStateKeysFixed<Key, has_nullable_keys_>;

    static constexpr bool has_nullable_keys = has_nullable_keys_;
@ -470,6 +478,12 @@ struct HashMethodKeysFixed
    Sizes key_sizes;
    size_t keys_size;

    /// SSSE3 shuffle method can be used. Shuffle masks will be calculated and stored here.
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
    std::unique_ptr<uint8_t[]> masks;
    std::unique_ptr<const char*[]> columns_data;
#endif

    HashMethodKeysFixed(const ColumnRawPtrs & key_columns, const Sizes & key_sizes_, const HashMethodContextPtr &)
        : Base(key_columns), key_sizes(std::move(key_sizes_)), keys_size(key_columns.size())
    {
@ -490,6 +504,58 @@ struct HashMethodKeysFixed
                low_cardinality_keys.nested_columns[i] = key_columns[i];
            }
        }

#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
        if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
        {
            /** The task is to "pack" multiple fixed-size fields into a single larger Key.
              * Example: pack UInt8, UInt32, UInt16, UInt64 into a UInt128 key:
              * [- ---- -- -------- -] - the resulting uint128 key
              *  ^  ^   ^   ^       ^
              *  u8 u32 u16 u64     zero
              *
              * We can do it with the help of the SSSE3 shuffle instruction.
              *
              * There will be a mask for every GROUP BY element (keys_size masks in total).
              * Every mask has 16 bytes but only sizeof(Key) bytes are used (the others we don't care about).
              *
              * Every byte in the mask has the following meaning:
              * - if it is 0..15, take the element at this index from the source register and place it here in the result;
              * - if it is 0xFF, set the element in the result to zero.
              *
              * Example:
              * We want to copy a UInt32 to offset 1 in the destination and set the other bytes in the destination to zero.
              * The corresponding mask will be: FF, 0, 1, 2, 3, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF, FF
              *
              * The max size of the destination is 16 bytes, because we cannot process more with SSSE3.
              *
              * The method is disabled under MSan, because it's allowed
              * to load into an SSE register and process up to 15 bytes of uninitialized memory in columns padding.
              * We don't use this uninitialized memory but MSan cannot look "into" the shuffle instruction.
              *
              * 16-byte masks can be placed overlapping; only the first sizeof(Key) bytes are relevant in each mask.
              * We initialize them to 0xFF and then set the needed elements.
              */
            size_t total_masks_size = sizeof(Key) * keys_size + (16 - sizeof(Key));
            masks.reset(new uint8_t[total_masks_size]);
            memset(masks.get(), 0xFF, total_masks_size);

            size_t offset = 0;
            for (size_t i = 0; i < keys_size; ++i)
            {
                for (size_t j = 0; j < key_sizes[i]; ++j)
                {
                    masks[i * sizeof(Key) + offset] = j;
                    ++offset;
                }
            }

            columns_data.reset(new const char*[keys_size]);

            for (size_t i = 0; i < keys_size; ++i)
                columns_data[i] = Base::getActualColumns()[i]->getRawData().data;
        }
#endif
    }
    ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
@ -505,6 +571,10 @@ struct HashMethodKeysFixed
            return packFixed<Key, true>(row, keys_size, low_cardinality_keys.nested_columns, key_sizes,
                &low_cardinality_keys.positions, &low_cardinality_keys.position_sizes);

#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)
        if constexpr (!has_low_cardinality && !has_nullable_keys && sizeof(Key) <= 16)
            return packFixedShuffle<Key>(columns_data.get(), keys_size, key_sizes.data(), row, masks.get());
#endif
        return packFixed<Key>(row, keys_size, Base::getActualColumns(), key_sizes);
    }
}
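To make the mask comment above concrete: `packFixedShuffle` is expected to load 16 bytes per key column, scatter them to their destination offsets with `_mm_shuffle_epi8`, and OR the per-column results together. A minimal sketch under that assumption (the real ClickHouse implementation lives elsewhere in the tree and may differ in details):

    #include <tmmintrin.h> // SSSE3
    #include <cstdint>
    #include <cstring>

    template <typename Key>
    Key packFixedShuffleSketch(const char * const * column_data, size_t keys_size,
                               const size_t * key_sizes, size_t row, const uint8_t * masks)
    {
        __m128i res = _mm_setzero_si128();
        for (size_t i = 0; i < keys_size; ++i)
        {
            /// Load 16 bytes starting at this row's value (column padding makes the over-read safe).
            __m128i value = _mm_loadu_si128(
                reinterpret_cast<const __m128i *>(column_data[i] + row * key_sizes[i]));
            /// 0xFF mask bytes produce zeros; bytes 0..size-1 are scattered to destination offsets.
            __m128i mask = _mm_loadu_si128(
                reinterpret_cast<const __m128i *>(masks + i * sizeof(Key)));
            res = _mm_or_si128(res, _mm_shuffle_epi8(value, mask));
        }
        Key key{};
        /// Only the first sizeof(Key) bytes are meaningful; overlapping mask tails may
        /// pollute the upper bytes of `res`, but those are never copied out.
        memcpy(&key, &res, sizeof(Key));
        return key;
    }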
@ -540,13 +610,13 @@ protected:
};

/// For the case when there is one string key.
template <typename Value, typename Mapped, bool use_cache = true>
template <typename Value, typename Mapped, bool use_cache = true, bool need_offset = false>
struct HashMethodHashed
    : public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache>, Value, Mapped, use_cache>
    : public columns_hashing_impl::HashMethodBase<HashMethodHashed<Value, Mapped, use_cache, need_offset>, Value, Mapped, use_cache, need_offset>
{
    using Key = UInt128;
    using Self = HashMethodHashed<Value, Mapped, use_cache>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
    using Self = HashMethodHashed<Value, Mapped, use_cache, need_offset>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache, need_offset>;

    ColumnRawPtrs key_columns;
@ -87,34 +87,61 @@ public:
    bool isInserted() const { return inserted; }
};

template <typename Mapped>
class FindResultImpl
/// FindResult optionally may contain a pointer to the value and an offset into the hash table buffer.
/// Only the bool `found` is always required.
/// So we will have 4 different specializations for FindResultImpl.
class FindResultImplBase
{
    Mapped * value;
    bool found;

public:
    FindResultImpl(Mapped * value_, bool found_) : value(value_), found(found_) {}
    explicit FindResultImplBase(bool found_) : found(found_) {}
    bool isFound() const { return found; }
    Mapped & getMapped() const { return *value; }
};

template <bool need_offset = false>
class FindResultImplOffsetBase
{
public:
    constexpr static bool has_offset = need_offset;
    explicit FindResultImplOffsetBase(size_t /* off */) {}
};

template <>
class FindResultImpl<void>
class FindResultImplOffsetBase<true>
{
    bool found;

    size_t offset;
public:
    explicit FindResultImpl(bool found_) : found(found_) {}
    bool isFound() const { return found; }
    constexpr static bool has_offset = true;

    explicit FindResultImplOffsetBase(size_t off) : offset(off) {}
    ALWAYS_INLINE size_t getOffset() const { return offset; }
};

template <typename Derived, typename Value, typename Mapped, bool consecutive_keys_optimization>
template <typename Mapped, bool need_offset = false>
class FindResultImpl : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
{
    Mapped * value;

public:
    FindResultImpl(Mapped * value_, bool found_, size_t off)
        : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off), value(value_) {}
    Mapped & getMapped() const { return *value; }
};

template <bool need_offset>
class FindResultImpl<void, need_offset> : public FindResultImplBase, public FindResultImplOffsetBase<need_offset>
{
public:
    FindResultImpl(bool found_, size_t off) : FindResultImplBase(found_), FindResultImplOffsetBase<need_offset>(off) {}
};
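The point of splitting the result into two bases is that the offset base is an empty class when `need_offset == false`, so the empty-base optimization keeps the common case exactly as small as before. A hypothetical compile-time check (not part of the diff) of what the specializations guarantee:

    static_assert(!FindResultImpl<void, false>::has_offset);
    static_assert(FindResultImpl<void, true>::has_offset);
    /// With need_offset == false the offset base contributes no storage,
    /// so the mapped-less result is still just a bool.
    static_assert(sizeof(FindResultImpl<void, false>) < sizeof(FindResultImpl<void, true>));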
template <typename Derived, typename Value, typename Mapped, bool consecutive_keys_optimization, bool need_offset = false>
class HashMethodBase
{
public:
    using EmplaceResult = EmplaceResultImpl<Mapped>;
    using FindResult = FindResultImpl<Mapped>;
    using FindResult = FindResultImpl<Mapped, need_offset>;
    static constexpr bool has_mapped = !std::is_same<Mapped, void>::value;
    using Cache = LastElementCache<Value, consecutive_keys_optimization>;
@ -217,12 +244,15 @@ protected:
    {
        if constexpr (Cache::consecutive_keys_optimization)
        {
            /// It's possible to support such a combination, but the code would become more complex.
            /// For now there's no place where we need these options enabled together.
            static_assert(!FindResult::has_offset, "`consecutive_keys_optimization` and `has_offset` are conflicting options");
            if (cache.check(key))
            {
                if constexpr (has_mapped)
                    return FindResult(&cache.value.second, cache.found);
                    return FindResult(&cache.value.second, cache.found, 0);
                else
                    return FindResult(cache.found);
                    return FindResult(cache.found, 0);
            }
        }
@ -247,10 +277,15 @@ protected:
            }
        }

        size_t offset = 0;
        if constexpr (FindResult::has_offset)
        {
            offset = it ? data.offsetInternal(it) : 0;
        }
        if constexpr (has_mapped)
            return FindResult(it ? &it->getMapped() : nullptr, it != nullptr);
            return FindResult(it ? &it->getMapped() : nullptr, it != nullptr, offset);
        else
            return FindResult(it != nullptr);
            return FindResult(it != nullptr, offset);
    }
};
@ -19,8 +19,6 @@
/** This file was edited for ClickHouse.
  */

#include <optional>

#include <string.h>

#include <Common/Elf.h>
@ -43,6 +41,7 @@
#define DW_FORM_ref4 0x13
#define DW_FORM_data8 0x07
#define DW_FORM_ref8 0x14
#define DW_FORM_ref_sig8 0x20
#define DW_FORM_sdata 0x0d
#define DW_FORM_udata 0x0f
#define DW_FORM_ref_udata 0x15
@ -54,9 +53,24 @@
#define DW_FORM_strp 0x0e
#define DW_FORM_indirect 0x16
#define DW_TAG_compile_unit 0x11
#define DW_TAG_subprogram 0x2e
#define DW_TAG_try_block 0x32
#define DW_TAG_catch_block 0x25
#define DW_TAG_entry_point 0x03
#define DW_TAG_common_block 0x1a
#define DW_TAG_lexical_block 0x0b
#define DW_AT_stmt_list 0x10
#define DW_AT_comp_dir 0x1b
#define DW_AT_name 0x03
#define DW_AT_high_pc 0x12
#define DW_AT_low_pc 0x11
#define DW_AT_entry_pc 0x52
#define DW_AT_ranges 0x55
#define DW_AT_abstract_origin 0x31
#define DW_AT_call_line 0x59
#define DW_AT_call_file 0x58
#define DW_AT_linkage_name 0x6e
#define DW_AT_specification 0x47
#define DW_LNE_define_file 0x03
#define DW_LNS_copy 0x01
#define DW_LNS_advance_pc 0x02
@ -84,7 +98,7 @@ namespace ErrorCodes
}

Dwarf::Dwarf(const Elf & elf) : elf_(&elf)
Dwarf::Dwarf(const std::shared_ptr<Elf> & elf) : elf_(elf)
{
    init();
}
@ -99,6 +113,10 @@ Dwarf::Section::Section(std::string_view d) : is64Bit_(false), data_(d)

namespace
{
// Maximum number of DIEAbbreviation to cache in a compilation unit. Used to
// speed up inline function lookup.
const uint32_t kMaxAbbreviationEntries = 1000;

// All following read* functions read from a std::string_view, advancing the
// std::string_view, and aborting if there's not enough room.

@ -158,7 +176,7 @@ uint64_t readOffset(std::string_view & sp, bool is64Bit)
// Read "len" bytes
std::string_view readBytes(std::string_view & sp, uint64_t len)
{
    SAFE_CHECK(len >= sp.size(), "invalid string length");
    SAFE_CHECK(len <= sp.size(), "invalid string length: " + std::to_string(len) + " vs. " + std::to_string(sp.size()));
    std::string_view ret(sp.data(), len);
    sp.remove_prefix(len);
    return ret;
@ -364,15 +382,18 @@ void Dwarf::init()
        || !getSection(".debug_line", &line_)
        || !getSection(".debug_str", &strings_))
    {
        elf_ = nullptr;
        elf_.reset();
        return;
    }

    // Optional: fast address range lookup. If missing .debug_info can
    // be used - but it's much slower (linear scan).
    getSection(".debug_aranges", &aranges_);

    getSection(".debug_ranges", &ranges_);
}

// static
bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr)
{
    // abbreviation code
@ -384,14 +405,14 @@ bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr)
    abbr.tag = readULEB(section);

    // does this entry have children?
    abbr.hasChildren = (read<uint8_t>(section) != DW_CHILDREN_no);
    abbr.has_children = (read<uint8_t>(section) != DW_CHILDREN_no);

    // attributes
    const char * attribute_begin = section.data();
    for (;;)
    {
        SAFE_CHECK(!section.empty(), "invalid attribute section");
        auto attr = readAttribute(section);
        auto attr = readAttributeSpec(section);
        if (attr.name == 0 && attr.form == 0)
            break;
    }
@ -400,11 +421,161 @@ bool Dwarf::readAbbreviation(std::string_view & section, DIEAbbreviation & abbr)
    return true;
}

Dwarf::DIEAbbreviation::Attribute Dwarf::readAttribute(std::string_view & sp)
// static
void Dwarf::readCompilationUnitAbbrs(std::string_view abbrev, CompilationUnit & cu)
{
    abbrev.remove_prefix(cu.abbrev_offset);

    DIEAbbreviation abbr;
    while (readAbbreviation(abbrev, abbr))
    {
        // Abbreviation code 0 is reserved for null debugging information entries.
        if (abbr.code != 0 && abbr.code <= kMaxAbbreviationEntries)
        {
            cu.abbr_cache[abbr.code - 1] = abbr;
        }
    }
}

size_t Dwarf::forEachChild(const CompilationUnit & cu, const Die & die, std::function<bool(const Die & die)> f) const
{
    size_t next_die_offset = forEachAttribute(cu, die, [&](const Attribute &) { return true; });
    if (!die.abbr.has_children)
    {
        return next_die_offset;
    }

    auto child_die = getDieAtOffset(cu, next_die_offset);
    while (child_die.code != 0)
    {
        if (!f(child_die))
        {
            return child_die.offset;
        }

        // NOTE: Don't run `f` over grandchildren, just skip over them.
        size_t sibling_offset = forEachChild(cu, child_die, [](const Die &) { return true; });
        child_die = getDieAtOffset(cu, sibling_offset);
    }

    // child_die is now a dummy die whose offset is to the code 0 marking the
    // end of the children. Need to add one to get the offset of the next die.
    return child_die.offset + 1;
}

/*
 * Iterate over all attributes of the given DIE, calling the given callable
 * for each. Iteration is stopped early if any of the calls return false.
 */
size_t Dwarf::forEachAttribute(const CompilationUnit & cu, const Die & die, std::function<bool(const Attribute & die)> f) const
{
    auto attrs = die.abbr.attributes;
    auto values = std::string_view{info_.data() + die.offset + die.attr_offset, cu.offset + cu.size - die.offset - die.attr_offset};
    while (auto spec = readAttributeSpec(attrs))
    {
        auto attr = readAttribute(die, spec, values);
        if (!f(attr))
        {
            return static_cast<size_t>(-1);
        }
    }
    return values.data() - info_.data();
}
Dwarf::Attribute Dwarf::readAttribute(const Die & die, AttributeSpec spec, std::string_view & info) const
{
    switch (spec.form)
    {
        case DW_FORM_addr:
            return {spec, die, read<uintptr_t>(info)};
        case DW_FORM_block1:
            return {spec, die, readBytes(info, read<uint8_t>(info))};
        case DW_FORM_block2:
            return {spec, die, readBytes(info, read<uint16_t>(info))};
        case DW_FORM_block4:
            return {spec, die, readBytes(info, read<uint32_t>(info))};
        case DW_FORM_block:
            [[fallthrough]];
        case DW_FORM_exprloc:
            return {spec, die, readBytes(info, readULEB(info))};
        case DW_FORM_data1:
            [[fallthrough]];
        case DW_FORM_ref1:
            return {spec, die, read<uint8_t>(info)};
        case DW_FORM_data2:
            [[fallthrough]];
        case DW_FORM_ref2:
            return {spec, die, read<uint16_t>(info)};
        case DW_FORM_data4:
            [[fallthrough]];
        case DW_FORM_ref4:
            return {spec, die, read<uint32_t>(info)};
        case DW_FORM_data8:
            [[fallthrough]];
        case DW_FORM_ref8:
            [[fallthrough]];
        case DW_FORM_ref_sig8:
            return {spec, die, read<uint64_t>(info)};
        case DW_FORM_sdata:
            return {spec, die, uint64_t(readSLEB(info))};
        case DW_FORM_udata:
            [[fallthrough]];
        case DW_FORM_ref_udata:
            return {spec, die, readULEB(info)};
        case DW_FORM_flag:
            return {spec, die, read<uint8_t>(info)};
        case DW_FORM_flag_present:
            return {spec, die, 1u};
        case DW_FORM_sec_offset:
            [[fallthrough]];
        case DW_FORM_ref_addr:
            return {spec, die, readOffset(info, die.is64Bit)};
        case DW_FORM_string:
            return {spec, die, readNullTerminated(info)};
        case DW_FORM_strp:
            return {spec, die, getStringFromStringSection(readOffset(info, die.is64Bit))};
        case DW_FORM_indirect: // form is explicitly specified
            // Update spec with the actual FORM.
            spec.form = readULEB(info);
            return readAttribute(die, spec, info);
        default:
            SAFE_CHECK(false, "invalid attribute form");
    }

    return {spec, die, 0u};
}

// static
Dwarf::AttributeSpec Dwarf::readAttributeSpec(std::string_view & sp)
{
    return {readULEB(sp), readULEB(sp)};
}
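Nearly every reader above leans on `readULEB`/`readSLEB`, which are defined earlier in this file and so are not visible in the diff. For reference, a standard ULEB128 decoder matching their advancing-`string_view` convention looks roughly like this (a sketch, not the file's exact code):

    #include <cstdint>
    #include <string_view>

    uint64_t readULEBSketch(std::string_view & sp)
    {
        uint64_t result = 0;
        unsigned shift = 0;
        uint8_t byte;
        do
        {
            byte = static_cast<uint8_t>(sp.front());
            sp.remove_prefix(1);
            result |= uint64_t(byte & 0x7F) << shift; // 7 payload bits per byte
            shift += 7;
        } while (byte & 0x80);                        // high bit set => continue
        return result;
    }

For example, the byte sequence `0xE5 0x8E 0x26` decodes to 624485 (0x98765).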
// static
Dwarf::CompilationUnit Dwarf::getCompilationUnit(std::string_view info, uint64_t offset)
{
    SAFE_CHECK(offset < info.size(), "unexpected offset");
    CompilationUnit cu;
    std::string_view chunk(info);
    cu.offset = offset;
    chunk.remove_prefix(offset);

    auto initial_length = read<uint32_t>(chunk);
    cu.is64Bit = (initial_length == uint32_t(-1));
    cu.size = cu.is64Bit ? read<uint64_t>(chunk) : initial_length;
    SAFE_CHECK(cu.size <= chunk.size(), "invalid chunk size");
    cu.size += cu.is64Bit ? 12 : 4;

    cu.version = read<uint16_t>(chunk);
    SAFE_CHECK(cu.version >= 2 && cu.version <= 4, "invalid info version");
    cu.abbrev_offset = readOffset(chunk, cu.is64Bit);
    cu.addr_size = read<uint8_t>(chunk);
    SAFE_CHECK(cu.addr_size == sizeof(uintptr_t), "invalid address size");

    cu.first_die = chunk.data() - info.data();
    return cu;
}
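A worked example of the size arithmetic above (bytes assumed for illustration): in DWARF-32, an `initial_length` of 0x52 means `is64Bit == false` and `cu.size = 0x52 + 4 = 0x56`, because the 4-byte length field itself is not counted by `initial_length`. In DWARF-64 the length field is the escape value 0xFFFFFFFF followed by an 8-byte length, hence the `+ 12`.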
Dwarf::DIEAbbreviation Dwarf::getAbbreviation(uint64_t code, uint64_t offset) const
{
    // Linear search in the .debug_abbrev section, starting at offset
@ -516,104 +687,411 @@ bool Dwarf::findDebugInfoOffset(uintptr_t address, std::string_view aranges, uin
    return false;
}

Dwarf::Die Dwarf::getDieAtOffset(const CompilationUnit & cu, uint64_t offset) const
{
    SAFE_CHECK(offset < info_.size(), "unexpected offset");
    Die die;
    std::string_view sp{info_.data() + offset, cu.offset + cu.size - offset};
    die.offset = offset;
    die.is64Bit = cu.is64Bit;
    auto code = readULEB(sp);
    die.code = code;
    if (code == 0)
    {
        return die;
    }
    die.attr_offset = sp.data() - info_.data() - offset;
    die.abbr = !cu.abbr_cache.empty() && die.code < kMaxAbbreviationEntries ? cu.abbr_cache[die.code - 1]
                                                                            : getAbbreviation(die.code, cu.abbrev_offset);

    return die;
}

Dwarf::Die Dwarf::findDefinitionDie(const CompilationUnit & cu, const Die & die) const
{
    // Find the real definition instead of declaration.
    // DW_AT_specification: Incomplete, non-defining, or separate declaration
    // corresponding to a declaration
    auto offset = getAttribute<uint64_t>(cu, die, DW_AT_specification);
    if (!offset)
    {
        return die;
    }
    return getDieAtOffset(cu, cu.offset + offset.value());
}
/**
 * Find the @locationInfo for @address in the compilation unit represented
 * by the @sp .debug_info entry.
 * Returns whether the address was found.
 * Advances @sp to the next entry in .debug_info.
 */
bool Dwarf::findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & locationInfo) const
bool Dwarf::findLocation(
    uintptr_t address,
    const LocationInfoMode mode,
    CompilationUnit & cu,
    LocationInfo & info,
    std::vector<SymbolizedFrame> & inline_frames) const
{
    // For each compilation unit compiled with a DWARF producer, a
    // contribution is made to the .debug_info section of the object
    // file. Each such contribution consists of a compilation unit
    // header (see Section 7.5.1.1) followed by a single
    // DW_TAG_compile_unit or DW_TAG_partial_unit debugging information
    // entry, together with its children.

    // 7.5.1.1 Compilation Unit Header
    // 1. unit_length (4B or 12B): read by Section::next
    // 2. version (2B)
    // 3. debug_abbrev_offset (4B or 8B): offset into the .debug_abbrev section
    // 4. address_size (1B)

    Section debug_info_section(infoEntry);
    std::string_view chunk;
    SAFE_CHECK(debug_info_section.next(chunk), "invalid debug info");

    auto version = read<uint16_t>(chunk);
    SAFE_CHECK(version >= 2 && version <= 4, "invalid info version");
    uint64_t abbrev_offset = readOffset(chunk, debug_info_section.is64Bit());
    auto address_size = read<uint8_t>(chunk);
    SAFE_CHECK(address_size == sizeof(uintptr_t), "invalid address size");

    // We survived so far. The first (and only) DIE should be DW_TAG_compile_unit
    // NOTE: - binutils <= 2.25 does not issue DW_TAG_partial_unit.
    //       - dwarf compression tools like `dwz` may generate it.
    // TODO(tudorb): Handle DW_TAG_partial_unit?
    auto code = readULEB(chunk);
    SAFE_CHECK(code != 0, "invalid code");
    auto abbr = getAbbreviation(code, abbrev_offset);
    SAFE_CHECK(abbr.tag == DW_TAG_compile_unit, "expecting compile unit entry");
    // Skip children entries, remove_prefix to the next compilation unit entry.
    infoEntry.remove_prefix(chunk.end() - infoEntry.begin());
    Die die = getDieAtOffset(cu, cu.first_die);
    // Partial compilation unit (DW_TAG_partial_unit) is not supported.
    SAFE_CHECK(die.abbr.tag == DW_TAG_compile_unit, "expecting compile unit entry");

    // Read attributes, extracting the few we care about
    bool found_line_offset = false;
    uint64_t line_offset = 0;
    std::optional<uint64_t> line_offset = 0;
    std::string_view compilation_directory;
    std::string_view main_file_name;
    std::optional<std::string_view> main_file_name;
    std::optional<uint64_t> base_addr_cu;

    DIEAbbreviation::Attribute attr;
    std::string_view attributes = abbr.attributes;
    for (;;)
    forEachAttribute(cu, die, [&](const Attribute & attr)
    {
        attr = readAttribute(attributes);
        if (attr.name == 0 && attr.form == 0)
        {
            break;
        }
        auto val = readAttributeValue(chunk, attr.form, debug_info_section.is64Bit());
        switch (attr.name)
        switch (attr.spec.name)
        {
            case DW_AT_stmt_list:
                // Offset in .debug_line for the line number VM program for this
                // compilation unit
                line_offset = std::get<uint64_t>(val);
                found_line_offset = true;
                line_offset = std::get<uint64_t>(attr.attr_value);
                break;
            case DW_AT_comp_dir:
                // Compilation directory
                compilation_directory = std::get<std::string_view>(val);
                compilation_directory = std::get<std::string_view>(attr.attr_value);
                break;
            case DW_AT_name:
                // File name of main file being compiled
                main_file_name = std::get<std::string_view>(val);
                main_file_name = std::get<std::string_view>(attr.attr_value);
                break;
            case DW_AT_low_pc:
            case DW_AT_entry_pc:
                // 2.17.1: historically DW_AT_low_pc was used. DW_AT_entry_pc was
                // introduced in DWARF3. Support either to determine the base address of
                // the CU.
                base_addr_cu = std::get<uint64_t>(attr.attr_value);
                break;
        }
    }
        // Iterate through all attributes until we find all of the above.
        return true;
    });

    if (!main_file_name.empty())
    if (main_file_name)
    {
        locationInfo.hasMainFile = true;
        locationInfo.mainFile = Path(compilation_directory, "", main_file_name);
        info.has_main_file = true;
        info.main_file = Path(compilation_directory, "", *main_file_name);
    }

    if (!found_line_offset)
    if (!line_offset)
    {
        return false;
    }

    std::string_view line_section(line_);
    line_section.remove_prefix(line_offset);
    line_section.remove_prefix(*line_offset);
    LineNumberVM line_vm(line_section, compilation_directory);

    // Execute line number VM program to find file and line
    locationInfo.hasFileAndLine = line_vm.findAddress(address, locationInfo.file, locationInfo.line);
    return locationInfo.hasFileAndLine;
    info.has_file_and_line = line_vm.findAddress(address, info.file, info.line);

    bool check_inline = (mode == LocationInfoMode::FULL_WITH_INLINE);

    if (info.has_file_and_line && check_inline)
    {
        // Re-get the compilation unit with abbreviation cached.
        cu.abbr_cache.clear();
        cu.abbr_cache.resize(kMaxAbbreviationEntries);
        readCompilationUnitAbbrs(abbrev_, cu);

        // Find the subprogram that matches the given address.
        Die subprogram;
        findSubProgramDieForAddress(cu, die, address, base_addr_cu, subprogram);

        // Subprogram is the DIE of the caller function.
        if (check_inline && subprogram.abbr.has_children)
        {
            // Use an extra location and get its call file and call line, so that
            // they can be used for the second last location when we don't have
            // enough inline frames for all inline functions in the call stack.
            const size_t max_size = Dwarf::kMaxInlineLocationInfoPerFrame + 1;
            std::vector<CallLocation> call_locations;
            call_locations.reserve(Dwarf::kMaxInlineLocationInfoPerFrame + 1);

            findInlinedSubroutineDieForAddress(cu, subprogram, line_vm, address, base_addr_cu, call_locations, max_size);
            size_t num_found = call_locations.size();

            if (num_found > 0)
            {
                const auto inner_most_file = info.file;
                const auto inner_most_line = info.line;

                // Earlier we filled in locationInfo:
                // - mainFile: the path to the CU -- the file where the non-inlined
                //   call is made from.
                // - file + line: the location of the inner-most inlined call.
                // Here we already found inline info, so mainFile would be redundant.
                info.has_main_file = false;
                info.main_file = Path{};
                // @findInlinedSubroutineDieForAddress fills inlineLocations[0] with the
                // file+line of the non-inlined outer function making the call.
                // locationInfo.name is already set by the caller by looking up the
                // non-inlined function @address belongs to.
                info.has_file_and_line = true;
                info.file = call_locations[0].file;
                info.line = call_locations[0].line;

                // The next inlined subroutine's call file and call line is the current
                // caller's location.
                for (size_t i = 0; i < num_found - 1; i++)
                {
                    call_locations[i].file = call_locations[i + 1].file;
                    call_locations[i].line = call_locations[i + 1].line;
                }
                // CallLocation for the inner-most inlined function:
                // - will be computed if enough space was available in the passed
                //   buffer.
                // - will have a .name, but no .file and no .line
                // - its corresponding file+line is the one returned by LineVM based
                //   on @address.
                // Use the inner-most inlined file+line info we got from the LineVM.
                call_locations[num_found - 1].file = inner_most_file;
                call_locations[num_found - 1].line = inner_most_line;

                // Fill in inline frames in reverse order (as expected by the caller).
                std::reverse(call_locations.begin(), call_locations.end());
                for (const auto & call_location : call_locations)
                {
                    SymbolizedFrame inline_frame;
                    inline_frame.found = true;
                    inline_frame.addr = address;
                    inline_frame.name = call_location.name.data();
                    inline_frame.location.has_file_and_line = true;
                    inline_frame.location.file = call_location.file;
                    inline_frame.location.line = call_location.line;
                    inline_frames.push_back(inline_frame);
                }
            }
        }
    }

    return info.has_file_and_line;
}
bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, LocationInfoMode mode) const
void Dwarf::findSubProgramDieForAddress(
    const CompilationUnit & cu, const Die & die, uint64_t address, std::optional<uint64_t> base_addr_cu, Die & subprogram) const
{
    forEachChild(cu, die, [&](const Die & child_die)
    {
        if (child_die.abbr.tag == DW_TAG_subprogram)
        {
            std::optional<uint64_t> low_pc;
            std::optional<uint64_t> high_pc;
            std::optional<bool> is_high_pc_addr;
            std::optional<uint64_t> range_offset;
            forEachAttribute(cu, child_die, [&](const Attribute & attr)
            {
                switch (attr.spec.name)
                {
                    case DW_AT_ranges:
                        range_offset = std::get<uint64_t>(attr.attr_value);
                        break;
                    case DW_AT_low_pc:
                        low_pc = std::get<uint64_t>(attr.attr_value);
                        break;
                    case DW_AT_high_pc:
                        // Value of DW_AT_high_pc attribute can be an address
                        // (DW_FORM_addr) or an offset (DW_FORM_data).
                        is_high_pc_addr = (attr.spec.form == DW_FORM_addr);
                        high_pc = std::get<uint64_t>(attr.attr_value);
                        break;
                }
                // Iterate through all attributes until we find all of the above.
                return true;
            });
            bool pc_match = low_pc && high_pc && is_high_pc_addr && address >= *low_pc
                && (address < (*is_high_pc_addr ? *high_pc : *low_pc + *high_pc));
            bool range_match = range_offset && isAddrInRangeList(address, base_addr_cu, range_offset.value(), cu.addr_size);
            if (pc_match || range_match)
            {
                subprogram = child_die;
                return false;
            }
        }

        findSubProgramDieForAddress(cu, child_die, address, base_addr_cu, subprogram);

        // Iterate through children until we find the matching subprogram.
        return true;
    });
}
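To make the `pc_match` condition concrete: with assumed values `low_pc = 0x1000` and `high_pc = 0x40` delivered in an offset form (so `is_high_pc_addr == false`), the subprogram covers `[0x1000, 0x1000 + 0x40)`, and an address of 0x1010 matches because `0x1000 <= 0x1010 < 0x1040`. Had `DW_AT_high_pc` used `DW_FORM_addr`, `high_pc` itself would be the exclusive end address.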
/**
 * Find DW_TAG_inlined_subroutine child DIEs that contain @address and
 * then extract:
 * - Where was it called from (DW_AT_call_file & DW_AT_call_line):
 *   the statement or expression that caused the inline expansion.
 * - The inlined function's name. As a function may be inlined multiple
 *   times, common attributes like DW_AT_linkage_name or DW_AT_name
 *   are only stored in its "concrete out-of-line instance" (a
 *   DW_TAG_subprogram) which we find using DW_AT_abstract_origin.
 */
void Dwarf::findInlinedSubroutineDieForAddress(
    const CompilationUnit & cu,
    const Die & die,
    const LineNumberVM & line_vm,
    uint64_t address,
    std::optional<uint64_t> base_addr_cu,
    std::vector<CallLocation> & locations,
    const size_t max_size) const
{
    if (locations.size() >= max_size)
    {
        return;
    }

    forEachChild(cu, die, [&](const Die & child_die)
    {
        // Between a DW_TAG_subprogram and a DW_TAG_inlined_subroutine we might
        // have arbitrary intermediary "nodes", including DW_TAG_common_block,
        // DW_TAG_lexical_block, DW_TAG_try_block, DW_TAG_catch_block and
        // DW_TAG_with_stmt, etc.
        // We can't filter by location here since its range may not be specified.
        // See section 2.6.2: A location list containing only an end of list entry
        // describes an object that exists in the source code but not in the
        // executable program.
        if (child_die.abbr.tag == DW_TAG_try_block || child_die.abbr.tag == DW_TAG_catch_block || child_die.abbr.tag == DW_TAG_entry_point
            || child_die.abbr.tag == DW_TAG_common_block || child_die.abbr.tag == DW_TAG_lexical_block)
        {
            findInlinedSubroutineDieForAddress(cu, child_die, line_vm, address, base_addr_cu, locations, max_size);
            return true;
        }

        std::optional<uint64_t> low_pc;
        std::optional<uint64_t> high_pc;
        std::optional<bool> is_high_pc_addr;
        std::optional<uint64_t> abstract_origin;
        std::optional<uint64_t> abstract_origin_ref_type;
        std::optional<uint64_t> call_file;
        std::optional<uint64_t> call_line;
        std::optional<uint64_t> range_offset;
        forEachAttribute(cu, child_die, [&](const Attribute & attr)
        {
            switch (attr.spec.name)
            {
                case DW_AT_ranges:
                    range_offset = std::get<uint64_t>(attr.attr_value);
                    break;
                case DW_AT_low_pc:
                    low_pc = std::get<uint64_t>(attr.attr_value);
                    break;
                case DW_AT_high_pc:
                    // Value of DW_AT_high_pc attribute can be an address
                    // (DW_FORM_addr) or an offset (DW_FORM_data).
                    is_high_pc_addr = (attr.spec.form == DW_FORM_addr);
                    high_pc = std::get<uint64_t>(attr.attr_value);
                    break;
                case DW_AT_abstract_origin:
                    abstract_origin_ref_type = attr.spec.form;
                    abstract_origin = std::get<uint64_t>(attr.attr_value);
                    break;
                case DW_AT_call_line:
                    call_line = std::get<uint64_t>(attr.attr_value);
                    break;
                case DW_AT_call_file:
                    call_file = std::get<uint64_t>(attr.attr_value);
                    break;
            }
            // Iterate through all attributes until we find all of the above.
            return true;
        });

        // 2.17 Code Addresses and Ranges
        // Any debugging information entry describing an entity that has a
        // machine code address or range of machine code addresses,
        // which includes compilation units, module initialization, subroutines,
        // ordinary blocks, try/catch blocks, labels and the like, may have
        // - A DW_AT_low_pc attribute for a single address,
        // - A DW_AT_low_pc and DW_AT_high_pc pair of attributes for a
        //   single contiguous range of addresses, or
        // - A DW_AT_ranges attribute for a non-contiguous range of addresses.
        // TODO: Support DW_TAG_entry_point and DW_TAG_common_block that don't
        // have DW_AT_low_pc/DW_AT_high_pc pairs and DW_AT_ranges.
        // TODO: Support relocated address which requires lookup in relocation map.
        bool pc_match
            = low_pc && high_pc && is_high_pc_addr && address >= *low_pc && (address < (*is_high_pc_addr ? *high_pc : *low_pc + *high_pc));
        bool range_match = range_offset && isAddrInRangeList(address, base_addr_cu, range_offset.value(), cu.addr_size);
        if (!pc_match && !range_match)
        {
            // Address doesn't match. Keep searching other children.
            return true;
        }

        if (!abstract_origin || !abstract_origin_ref_type || !call_line || !call_file)
        {
            // We expect a single sibling DIE to match on addr, but it's missing
            // required fields. Stop searching for other DIEs.
            return false;
        }

        CallLocation location;
        location.file = line_vm.getFullFileName(*call_file);
        location.line = *call_line;

        auto get_function_name = [&](const CompilationUnit & srcu, uint64_t die_offset)
        {
            auto decl_die = getDieAtOffset(srcu, die_offset);
            // Jump to the actual function definition instead of declaration for name
            // and line info.
            auto def_die = findDefinitionDie(srcu, decl_die);

            std::string_view name;
            // The file and line will be set in the next inline subroutine based on
            // its DW_AT_call_file and DW_AT_call_line.
            forEachAttribute(srcu, def_die, [&](const Attribute & attr)
            {
                switch (attr.spec.name)
                {
                    case DW_AT_linkage_name:
                        name = std::get<std::string_view>(attr.attr_value);
                        break;
                    case DW_AT_name:
                        // NOTE: when DW_AT_linkage_name and DW_AT_name match, dwarf
                        // emitters omit DW_AT_linkage_name (to save space). If present,
                        // DW_AT_linkage_name should always be preferred (mangled C++ name
                        // vs just the function name).
                        if (name.empty())
                        {
                            name = std::get<std::string_view>(attr.attr_value);
                        }
                        break;
                }
                return true;
            });
            return name;
        };

        // DW_AT_abstract_origin is a reference. There are 3 types of references:
        // - the reference can identify any debugging information entry within the
        //   compilation unit (DW_FORM_ref1, DW_FORM_ref2, DW_FORM_ref4,
        //   DW_FORM_ref8, DW_FORM_ref_udata). This type of reference is an offset
        //   from the first byte of the compilation header for the compilation unit
        //   containing the reference.
        // - the reference can identify any debugging information entry within a
        //   .debug_info section; in particular, it may refer to an entry in a
        //   different compilation unit (DW_FORM_ref_addr)
        // - the reference can identify any debugging information type entry that
        //   has been placed in its own type unit.
        //   Not applicable for DW_AT_abstract_origin.
        location.name = (*abstract_origin_ref_type != DW_FORM_ref_addr)
            ? get_function_name(cu, cu.offset + *abstract_origin)
            : get_function_name(findCompilationUnit(info_, *abstract_origin), *abstract_origin);

        locations.push_back(location);

        findInlinedSubroutineDieForAddress(cu, child_die, line_vm, address, base_addr_cu, locations, max_size);

        return false;
    });
}
bool Dwarf::findAddress(
    uintptr_t address, LocationInfo & locationInfo, LocationInfoMode mode, std::vector<SymbolizedFrame> & inline_frames) const
{
    locationInfo = LocationInfo();

@ -635,10 +1113,9 @@ bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, Location
        if (findDebugInfoOffset(address, aranges_, offset))
        {
            // Read compilation unit header from .debug_info
            std::string_view info_entry(info_);
            info_entry.remove_prefix(offset);
            findLocation(address, info_entry, locationInfo);
            return locationInfo.hasFileAndLine;
            auto unit = getCompilationUnit(info_, offset);
            findLocation(address, mode, unit, locationInfo, inline_frames);
            return locationInfo.has_file_and_line;
        }
        else if (mode == LocationInfoMode::FAST)
        {
@ -650,20 +1127,92 @@ bool Dwarf::findAddress(uintptr_t address, LocationInfo & locationInfo, Location
        }
        else
        {
            SAFE_CHECK(mode == LocationInfoMode::FULL, "unexpected mode");
            SAFE_CHECK(mode == LocationInfoMode::FULL || mode == LocationInfoMode::FULL_WITH_INLINE, "unexpected mode");
            // Fall back to the linear scan.
        }
    }

    // Slow path (linear scan): Iterate over all .debug_info entries
    // and look for the address in each compilation unit.
    std::string_view info_entry(info_);
    while (!info_entry.empty() && !locationInfo.hasFileAndLine)
        findLocation(address, info_entry, locationInfo);
    uint64_t offset = 0;
    while (offset < info_.size() && !locationInfo.has_file_and_line)
    {
        auto unit = getCompilationUnit(info_, offset);
        offset += unit.size;
        findLocation(address, mode, unit, locationInfo, inline_frames);
    }

    return locationInfo.hasFileAndLine;
    return locationInfo.has_file_and_line;
}
bool Dwarf::isAddrInRangeList(uint64_t address, std::optional<uint64_t> base_addr, size_t offset, uint8_t addr_size) const
{
    SAFE_CHECK(addr_size == 4 || addr_size == 8, "wrong address size");
    if (ranges_.empty())
    {
        return false;
    }

    const bool is_64bit_addr = addr_size == 8;
    std::string_view sp = ranges_;
    sp.remove_prefix(offset);
    const uint64_t max_addr = is_64bit_addr ? std::numeric_limits<uint64_t>::max() : std::numeric_limits<uint32_t>::max();
    while (!sp.empty())
    {
        uint64_t begin = readOffset(sp, is_64bit_addr);
        uint64_t end = readOffset(sp, is_64bit_addr);
        // The range list entry is a base address selection entry.
        if (begin == max_addr)
        {
            base_addr = end;
            continue;
        }
        // The range list entry is an end of list entry.
        if (begin == 0 && end == 0)
        {
            break;
        }
        // Check if the given address falls in the range list entry.
        // 2.17.3 Non-Contiguous Address Ranges
        // The applicable base address of a range list entry is determined by the
        // closest preceding base address selection entry (see below) in the same
        // range list. If there is no such selection entry, then the applicable base
        // address defaults to the base address of the compilation unit.
        if (base_addr && address >= begin + *base_addr && address < end + *base_addr)
        {
            return true;
        }
    }

    return false;
}
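A worked walk through a range list (entries assumed for illustration, 8-byte addresses):

    // begin = 0xFFFFFFFFFFFFFFFF, end = 0x400000  -> base address selection: base_addr = 0x400000
    // begin = 0x10,               end = 0x50      -> covers [0x400010, 0x400050)
    // begin = 0,                  end = 0         -> end of list
    // An address of 0x400020 matches the second entry: 0x400010 <= 0x400020 < 0x400050.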
// static
Dwarf::CompilationUnit Dwarf::findCompilationUnit(std::string_view info, uint64_t targetOffset)
{
    SAFE_CHECK(targetOffset < info.size(), "unexpected target address");
    uint64_t offset = 0;
    while (offset < info.size())
    {
        std::string_view chunk(info);
        chunk.remove_prefix(offset);

        auto initial_length = read<uint32_t>(chunk);
        auto is_64bit = (initial_length == uint32_t(-1));
        auto size = is_64bit ? read<uint64_t>(chunk) : initial_length;
        SAFE_CHECK(size <= chunk.size(), "invalid chunk size");
        size += is_64bit ? 12 : 4;

        if (offset + size > targetOffset)
        {
            break;
        }
        offset += size;
    }
    return getCompilationUnit(info, offset);
}


Dwarf::LineNumberVM::LineNumberVM(std::string_view data, std::string_view compilationDirectory)
    : compilationDirectory_(compilationDirectory)
{
@ -21,9 +21,13 @@
/** This file was edited for ClickHouse.
  */

#include <functional>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <variant>
#include <vector>


namespace DB
@ -61,7 +65,13 @@ class Dwarf final
    // be live for as long as the passed-in Elf is live.
public:
    /** Create a DWARF parser around an ELF file. */
    explicit Dwarf(const Elf & elf);
    explicit Dwarf(const std::shared_ptr<Elf> & elf);

    /**
      * More than one location info may exist if the current frame is an inline
      * function call.
      */
    static constexpr uint32_t kMaxInlineLocationInfoPerFrame = 10;

    /**
      * Represent a file path as a collection of three parts (base directory,
@ -70,7 +80,7 @@ public:
    class Path
    {
    public:
        Path() {}
        Path() = default;

        Path(std::string_view baseDir, std::string_view subDir, std::string_view file);

@ -107,6 +117,14 @@ public:
        std::string_view file_;
    };

    // Indicates inline function `name` is called at `line@file`.
    struct CallLocation
    {
        Path file = {};
        uint64_t line;
        std::string_view name;
    };

    enum class LocationInfoMode
    {
        // Don't resolve location info.
@ -115,30 +133,47 @@ public:
        FAST,
        // Scan all CU in .debug_info (slow!) on .debug_aranges lookup failure.
        FULL,
        // Scan .debug_info (much slower; use with caution) for inline functions in
        // addition to FULL.
        FULL_WITH_INLINE,
    };

    struct LocationInfo
    {
        bool hasMainFile = false;
        Path mainFile;
        bool has_main_file = false;
        Path main_file;

        bool hasFileAndLine = false;
        bool has_file_and_line = false;
        Path file;
        uint64_t line = 0;
    };

    /**
      * Frame information: symbol name and location.
      */
    struct SymbolizedFrame
    {
        bool found = false;
        uintptr_t addr = 0;
        // Mangled symbol name. Use `folly::demangle()` to demangle it.
        const char * name = nullptr;
        LocationInfo location;
        std::shared_ptr<const Elf> file;

        void clear() { *this = SymbolizedFrame(); }
    };

    /** Find the file and line number information corresponding to address.
      * The address must be physical - an offset in the object file, without the offset in virtual memory where the object is loaded.
      */
    bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode) const;
    bool findAddress(uintptr_t address, LocationInfo & info, LocationInfoMode mode, std::vector<SymbolizedFrame> & inline_frames) const;
private:
    static bool findDebugInfoOffset(uintptr_t address, std::string_view aranges, uint64_t & offset);

    void init();
    bool findLocation(uintptr_t address, std::string_view & infoEntry, LocationInfo & info) const;

    const Elf * elf_;
    std::shared_ptr<const Elf> elf_;

    // DWARF section made up of chunks, each prefixed with a length header.
    // The length indicates whether the chunk is DWARF-32 or DWARF-64, which
@ -169,17 +204,81 @@ private:
    {
        uint64_t code;
        uint64_t tag;
        bool hasChildren;

        struct Attribute
        {
            uint64_t name;
            uint64_t form;
        };
        bool has_children = false;

        std::string_view attributes;
    };

    // Debugging information entry to define a low-level representation of a
    // source program. Each debugging information entry consists of an identifying
    // tag and a series of attributes. An entry, or group of entries together,
    // provide a description of a corresponding entity in the source program.
    struct Die
    {
        bool is64Bit;
        // Offset from start to first attribute
        uint8_t attr_offset;
        // Offset within debug info.
        uint32_t offset;
        uint64_t code;
        DIEAbbreviation abbr;
    };
    struct AttributeSpec
    {
        uint64_t name = 0;
        uint64_t form = 0;

        explicit operator bool() const { return name != 0 || form != 0; }
    };

    struct Attribute
    {
        AttributeSpec spec;
        const Die & die;
        std::variant<uint64_t, std::string_view> attr_value;
    };

    struct CompilationUnit
    {
        bool is64Bit;
        uint8_t version;
        uint8_t addr_size;
        // Offset in .debug_info of this compilation unit.
        uint32_t offset;
        uint32_t size;
        // Offset in .debug_info for the first DIE in this compilation unit.
        uint32_t first_die;
        uint64_t abbrev_offset;
        // Only the CompilationUnit that contains the caller functions needs this cache.
        // Indexed by (abbr.code - 1) if (abbr.code - 1) < abbr_cache.size();
        std::vector<DIEAbbreviation> abbr_cache;
    };
    static CompilationUnit getCompilationUnit(std::string_view info, uint64_t offset);

    /** cu must exist during the life cycle of created detail::Die. */
    Die getDieAtOffset(const CompilationUnit & cu, uint64_t offset) const;

    /**
      * Find the actual definition DIE instead of declaration for the given die.
      */
    Die findDefinitionDie(const CompilationUnit & cu, const Die & die) const;

    bool findLocation(
        uintptr_t address,
        LocationInfoMode mode,
        CompilationUnit & cu,
        LocationInfo & info,
        std::vector<SymbolizedFrame> & inline_frames) const;

    /**
      * Finds a subprogram debugging info entry that contains a given address among
      * children of given die. Depth first search.
      */
    void findSubProgramDieForAddress(
        const CompilationUnit & cu, const Die & die, uint64_t address, std::optional<uint64_t> base_addr_cu, Die & subprogram) const;

    // Interpreter for the line number bytecode VM
    class LineNumberVM
    {
@ -188,6 +287,13 @@ private:

        bool findAddress(uintptr_t target, Path & file, uint64_t & line);

        /** Gets full file name at given index including directory. */
        Path getFullFileName(uint64_t index) const
        {
            auto fn = getFileName(index);
            return Path({}, getIncludeDirectory(fn.directoryIndex), fn.relativeName);
        }

    private:
        void init();
        void reset();
@ -259,18 +365,50 @@ private:
        uint64_t discriminator_;
    };
/**
|
||||
* Finds inlined subroutine DIEs and their caller lines that contains a given
|
||||
* address among children of given die. Depth first search.
|
||||
*/
|
||||
void findInlinedSubroutineDieForAddress(
|
||||
const CompilationUnit & cu,
|
||||
const Die & die,
|
||||
const LineNumberVM & line_vm,
|
||||
uint64_t address,
|
||||
std::optional<uint64_t> base_addr_cu,
|
||||
std::vector<CallLocation> & locations,
|
||||
size_t max_size) const;
|
||||
|
||||
// Read an abbreviation from a std::string_view, return true if at end; remove_prefix section
|
||||
static bool readAbbreviation(std::string_view & section, DIEAbbreviation & abbr);
|
||||
|
||||
static void readCompilationUnitAbbrs(std::string_view abbrev, CompilationUnit & cu);
|
||||
|
||||
/**
|
||||
* Iterates over all children of a debugging info entry, calling the given
|
||||
* callable for each. Iteration is stopped early if any of the calls return
|
||||
* false. Returns the offset of next DIE after iterations.
|
||||
*/
|
||||
size_t forEachChild(const CompilationUnit & cu, const Die & die, std::function<bool(const Die & die)> f) const;
|
||||
|
||||
// Get abbreviation corresponding to a code, in the chunk starting at
|
||||
// offset in the .debug_abbrev section
|
||||
DIEAbbreviation getAbbreviation(uint64_t code, uint64_t offset) const;
|
||||
|
||||
/**
|
||||
* Iterates over all attributes of a debugging info entry, calling the given
|
||||
* callable for each. If all attributes are visited, then return the offset of
|
||||
* next DIE, or else iteration is stopped early and return size_t(-1) if any
|
||||
* of the calls return false.
|
||||
*/
|
||||
size_t forEachAttribute(const CompilationUnit & cu, const Die & die, std::function<bool(const Attribute & die)> f) const;
|
||||
|
||||
Attribute readAttribute(const Die & die, AttributeSpec spec, std::string_view & info) const;
|
||||
|
||||
// Read one attribute <name, form> pair, remove_prefix sp; returns <0, 0> at end.
|
||||
static DIEAbbreviation::Attribute readAttribute(std::string_view & sp);
|
||||
static AttributeSpec readAttributeSpec(std::string_view & sp);
|
||||
|
||||
// Read one attribute value, remove_prefix sp
|
||||
typedef std::variant<uint64_t, std::string_view> AttributeValue;
|
||||
using AttributeValue = std::variant<uint64_t, std::string_view>;
|
||||
AttributeValue readAttributeValue(std::string_view & sp, uint64_t form, bool is64Bit) const;
|
||||
|
||||
// Get an ELF section by name, return true if found
|
||||
@ -279,11 +417,34 @@ private:
|
||||
// Get a string from the .debug_str section
|
||||
std::string_view getStringFromStringSection(uint64_t offset) const;
|
||||
|
||||
template <class T>
|
||||
std::optional<T> getAttribute(const CompilationUnit & cu, const Die & die, uint64_t attr_name) const
|
||||
{
|
||||
std::optional<T> result;
|
||||
forEachAttribute(cu, die, [&](const Attribute & attr)
|
||||
{
|
||||
if (attr.spec.name == attr_name)
|
||||
{
|
||||
result = std::get<T>(attr.attr_value);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
// Check if the given address is in the range list at the given offset in .debug_ranges.
|
||||
bool isAddrInRangeList(uint64_t address, std::optional<uint64_t> base_addr, size_t offset, uint8_t addr_size) const;
|
||||
|
||||
// Finds the Compilation Unit starting at offset.
|
||||
static CompilationUnit findCompilationUnit(std::string_view info, uint64_t targetOffset);
|
||||
|
||||
std::string_view info_; // .debug_info
|
||||
std::string_view abbrev_; // .debug_abbrev
|
||||
std::string_view aranges_; // .debug_aranges
|
||||
std::string_view line_; // .debug_line
|
||||
std::string_view strings_; // .debug_str
|
||||
std::string_view ranges_; // .debug_ranges
|
||||
};
|
||||
|
||||
}
|
||||
|
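The getAttribute helper above leans on forEachAttribute's early-exit convention: the callback returns false to stop the scan once the attribute is found. Below is a minimal self-contained sketch of that same callback pattern; the Attr struct and function names are illustrative, not part of the Dwarf class.

    #include <functional>
    #include <optional>
    #include <vector>

    struct Attr { int name; long value; };

    /// Calls `f` for each attribute; stops early when `f` returns false.
    void forEach(const std::vector<Attr> & attrs, const std::function<bool(const Attr &)> & f)
    {
        for (const auto & a : attrs)
            if (!f(a))
                return;
    }

    /// Returns the first attribute with the given name, mirroring the shape of Dwarf::getAttribute.
    std::optional<long> getAttr(const std::vector<Attr> & attrs, int name)
    {
        std::optional<long> result;
        forEach(attrs, [&](const Attr & a)
        {
            if (a.name == name)
            {
                result = a.value;
                return false;  /// found: stop iterating
            }
            return true;       /// keep going
        });
        return result;
    }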
@ -534,7 +534,8 @@
M(565, TOO_MANY_PARTITIONS) \
M(566, CANNOT_RMDIR) \
M(567, DUPLICATED_PART_UUIDS) \
M(568, DATABASE_REPLICATION_FAILED) \
M(568, RAFT_ERROR) \
M(569, DATABASE_REPLICATION_FAILED) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \
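For context on the M(code, NAME) lines: this is the X-macro idiom, where one list expands into both named constants and a code-to-name mapping, which is why codes must stay unique and DATABASE_REPLICATION_FAILED moves to 569 when RAFT_ERROR takes 568. A hedged sketch of the general technique (simplified, not ClickHouse's actual expansion machinery):

    #include <cstdio>

    #define APPLY_FOR_ERROR_CODES(M) \
        M(568, RAFT_ERROR) \
        M(569, DATABASE_REPLICATION_FAILED)

    /// Expansion 1: named integer constants.
    #define M(value, name) constexpr int name = value;
    APPLY_FOR_ERROR_CODES(M)
    #undef M

    /// Expansion 2: a lookup from code to name, from the very same list.
    const char * errorName(int code)
    {
    #define M(value, name) if (code == value) return #name;
        APPLY_FOR_ERROR_CODES(M)
    #undef M
        return "UNKNOWN";
    }

    int main()
    {
        std::printf("%d -> %s\n", RAFT_ERROR, errorName(568));
    }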
@ -476,6 +476,17 @@ public:

size_t getBufferSizeInCells() const { return NUM_CELLS; }

/// Return offset for result in internal buffer.
/// The result can have a value up to `getBufferSizeInCells() + 1`,
/// because the offset for the zero value is considered to be 0,
/// and for other values it is `offset in buffer + 1`.
size_t offsetInternal(ConstLookupResult ptr) const
{
    if (ptr->isZero(*this))
        return 0;
    return ptr - buf + 1;
}

const Cell * data() const { return buf; }
Cell * data() { return buf; }

@ -1294,6 +1294,17 @@ public:
    return grower.bufSize();
}

/// Return offset for result in internal buffer.
/// The result can have a value up to `getBufferSizeInCells() + 1`,
/// because the offset for the zero value is considered to be 0,
/// and for other values it is `offset in buffer + 1`.
size_t offsetInternal(ConstLookupResult ptr) const
{
    if (ptr->isZero(*this))
        return 0;
    return ptr - buf + 1;
}

#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS
size_t getCollisions() const
{
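To see the offset convention concretely: slot 0 is reserved for the table's special zero cell, and every ordinary cell maps to its buffer position plus one, so results range from 0 up to the cell count. A toy standalone illustration under that assumption (not the HashTable class itself):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const int buf[8] = {};            // stand-in for the cell buffer
        const int * zero_cell = nullptr;  // the "zero value" lives outside the buffer

        auto offset_internal = [&](const int * ptr) -> std::ptrdiff_t
        {
            if (ptr == zero_cell)
                return 0;                 // zero value always maps to offset 0
            return (ptr - buf) + 1;       // other cells: buffer position shifted by 1
        };

        assert(offset_internal(zero_cell) == 0);
        assert(offset_internal(&buf[0]) == 1);
        assert(offset_internal(&buf[7]) == 8);  // never exceeds getBufferSizeInCells() + 1
    }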
@ -24,8 +24,8 @@ namespace
///
/// - when it is explicitly blocked with LockExceptionInThread
///
/// - to avoid std::terminate(), when stack unwinding is current in progress in
///   this thread.
/// - to avoid std::terminate(), when stack unwinding is currently in progress
///   in this thread.
///
/// NOTE: since C++11 destructors are marked noexcept by default, which
/// means that any throw from a destructor (that is not marked with
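The wording being fixed here touches a real hazard: throwing while stack unwinding is already in progress calls std::terminate(). Since C++17, std::uncaught_exceptions() lets a destructor detect that situation; a minimal sketch of the usual guard (illustrative, not the Exception class's actual mechanism):

    #include <cstdio>
    #include <exception>
    #include <stdexcept>

    struct Guard
    {
        int exceptions_on_entry = std::uncaught_exceptions();

        ~Guard()
        {
            /// More uncaught exceptions now than at construction: this destructor
            /// runs during stack unwinding, so throwing here would terminate().
            if (std::uncaught_exceptions() > exceptions_on_entry)
                std::fputs("unwinding: suppressing throw from destructor\n", stderr);
            else
                std::fputs("normal scope exit: safe to report errors loudly\n", stderr);
        }
    };

    int main()
    {
        try
        {
            Guard g;
            throw std::runtime_error("boom");  /// g's destructor runs during unwinding
        }
        catch (const std::exception & e)
        {
            std::fputs(e.what(), stderr);
        }
    }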
@ -6,4 +6,14 @@ namespace DB
/// Used for left padding of PODArray when empty
const char empty_pod_array[empty_pod_array_size]{};

template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
template class PODArray<UInt64, 4096, Allocator<false>, 15, 16>;

template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;

}

@ -725,4 +725,16 @@ void swap(PODArray<T, initial_bytes, TAllocator, pad_right_> & lhs, PODArray<T,
}
#pragma GCC diagnostic pop

/// Prevent implicit template instantiation of PODArray for common numeric types

extern template class PODArray<UInt8, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<UInt16, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<UInt32, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<UInt64, 4096, Allocator<false>, 15, 16>;

extern template class PODArray<Int8, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int16, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int32, 4096, Allocator<false>, 15, 16>;
extern template class PODArray<Int64, 4096, Allocator<false>, 15, 16>;

}
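These two hunks form the standard explicit-instantiation split: the header's extern template declarations tell every includer not to instantiate the eight listed specializations, while PODArray.cpp instantiates each exactly once, trimming compile time and duplicate object code. A single-file sketch of the mechanism with a hypothetical Vec class:

    /// vec.h (sketch)
    template <typename T>
    class Vec
    {
    public:
        void push(T) { /* ... */ }
    };

    /// Declared extern: "this specialization is instantiated elsewhere".
    extern template class Vec<int>;

    /// vec.cpp (sketch): the one and only instantiation of Vec<int>.
    template class Vec<int>;

    int main()
    {
        Vec<int> v;   /// uses the shared instantiation; no per-TU codegen
        v.push(42);
    }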
@ -3,8 +3,8 @@
 * This file contains some using-declarations that define various kinds of
 * PODArray.
 */
#pragma once

#include <common/types.h>
#include <Common/Allocator_fwd.h>

namespace DB
@ -217,10 +217,12 @@ void StackTrace::symbolize(const StackTrace::FramePointers & frame_pointers, siz
current_frame.object = object->name;
if (std::filesystem::exists(current_frame.object.value()))
{
    auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first;
    auto dwarf_it = dwarfs.try_emplace(object->name, object->elf).first;

    DB::Dwarf::LocationInfo location;
    if (dwarf_it->second.findAddress(uintptr_t(current_frame.physical_addr), location, DB::Dwarf::LocationInfoMode::FAST))
    std::vector<DB::Dwarf::SymbolizedFrame> inline_frames;
    if (dwarf_it->second.findAddress(
        uintptr_t(current_frame.physical_addr), location, DB::Dwarf::LocationInfoMode::FAST, inline_frames))
    {
        current_frame.file = location.file.toString();
        current_frame.line = location.line;
@ -314,7 +316,11 @@ const StackTrace::FramePointers & StackTrace::getFramePointers() const
}

static void toStringEveryLineImpl(
    const StackTrace::FramePointers & frame_pointers, size_t offset, size_t size, std::function<void(const std::string &)> callback)
    bool fatal,
    const StackTrace::FramePointers & frame_pointers,
    size_t offset,
    size_t size,
    std::function<void(const std::string &)> callback)
{
    if (size == 0)
        return callback("<Empty trace>");
@ -324,11 +330,12 @@ static void toStringEveryLineImpl(
    const DB::SymbolIndex & symbol_index = *symbol_index_ptr;
    std::unordered_map<std::string, DB::Dwarf> dwarfs;

    std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    out.exceptions(std::ios::failbit);

    for (size_t i = offset; i < size; ++i)
    {
        std::vector<DB::Dwarf::SymbolizedFrame> inline_frames;
        const void * virtual_addr = frame_pointers[i];
        const auto * object = symbol_index.findObject(virtual_addr);
        uintptr_t virtual_offset = object ? uintptr_t(object->address_begin) : 0;
@ -340,10 +347,11 @@ static void toStringEveryLineImpl(
        {
            if (std::filesystem::exists(object->name))
            {
                auto dwarf_it = dwarfs.try_emplace(object->name, *object->elf).first;
                auto dwarf_it = dwarfs.try_emplace(object->name, object->elf).first;

                DB::Dwarf::LocationInfo location;
                if (dwarf_it->second.findAddress(uintptr_t(physical_addr), location, DB::Dwarf::LocationInfoMode::FAST))
                auto mode = fatal ? DB::Dwarf::LocationInfoMode::FULL_WITH_INLINE : DB::Dwarf::LocationInfoMode::FAST;
                if (dwarf_it->second.findAddress(uintptr_t(physical_addr), location, mode, inline_frames))
                    out << location.file.toString() << ":" << location.line << ": ";
            }
        }
@ -360,11 +368,20 @@ static void toStringEveryLineImpl(
        out << " @ " << physical_addr;
        out << " in " << (object ? object->name : "?");

        for (size_t j = 0; j < inline_frames.size(); ++j)
        {
            const auto & frame = inline_frames[j];
            int status = 0;
            callback(fmt::format("{}.{}. inlined from {}:{}: {}",
                i, j+1, frame.location.file.toString(), frame.location.line, demangle(frame.name, status)));
        }

        callback(out.str());
        out.str({});
    }
#else
    std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    UNUSED(fatal);
    std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    out.exceptions(std::ios::failbit);

    for (size_t i = offset; i < size; ++i)
@ -382,13 +399,13 @@ static std::string toStringImpl(const StackTrace::FramePointers & frame_pointers
{
    std::stringstream out; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
    out.exceptions(std::ios::failbit);
    toStringEveryLineImpl(frame_pointers, offset, size, [&](const std::string & str) { out << str << '\n'; });
    toStringEveryLineImpl(false, frame_pointers, offset, size, [&](const std::string & str) { out << str << '\n'; });
    return out.str();
}

void StackTrace::toStringEveryLine(std::function<void(const std::string &)> callback) const
{
    toStringEveryLineImpl(frame_pointers, offset, size, std::move(callback));
    toStringEveryLineImpl(true, frame_pointers, offset, size, std::move(callback));
}
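The demangle(frame.name, status) call above presumably wraps the Itanium ABI demangler. A minimal standalone sketch of calling abi::__cxa_demangle directly; the mangled name is just an example:

    #include <cxxabi.h>
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const char * mangled = "_ZNSt6vectorIiSaIiEE9push_backEOi";
        int status = 0;
        /// Returns a malloc'ed readable name, or nullptr on failure (status < 0).
        char * demangled = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
        std::printf("%s\n", status == 0 ? demangled : mangled);
        std::free(demangled);
    }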
@ -51,10 +51,10 @@ public:

/// Tries to capture a stack trace. Falls back to parsing the caller address
/// from the signal context if no stack trace could be captured
StackTrace(const ucontext_t & signal_context);
explicit StackTrace(const ucontext_t & signal_context);

/// Creates an empty object for deferred initialization
StackTrace(NoCapture);
explicit StackTrace(NoCapture);

size_t getSize() const;
size_t getOffset() const;
@ -65,6 +65,7 @@ public:
static void symbolize(const FramePointers & frame_pointers, size_t offset, size_t size, StackTrace::Frames & frames);

void toStringEveryLine(std::function<void(const std::string &)> callback) const;

protected:
void tryCapture();
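Marking these single-argument constructors explicit forbids accidental implicit conversions into StackTrace. A small illustration with a hypothetical type:

    struct Timeout
    {
        explicit Timeout(int seconds_) : seconds(seconds_) {}
        int seconds;
    };

    void wait(const Timeout &) {}

    int main()
    {
        // wait(30);          /// would compile if the constructor were implicit
        wait(Timeout(30));    /// with explicit, the conversion must be spelled out
    }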
@ -36,7 +36,7 @@ public:
const void * address_begin;
const void * address_end;
std::string name;
std::unique_ptr<Elf> elf;
std::shared_ptr<Elf> elf;
};

/// An address in virtual memory should be passed. These addresses include the offset at which the object is loaded in memory.
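Changing the Elf handle from unique_ptr to shared_ptr lets another component (here, the Dwarf objects cached per object name in StackTrace.cpp) co-own the parsed ELF instead of borrowing a reference that could dangle. A hedged sketch of the ownership difference, with a stand-in Elf struct:

    #include <map>
    #include <memory>
    #include <string>

    struct Elf { std::string path; };

    int main()
    {
        auto elf = std::make_shared<Elf>(Elf{"/usr/bin/clickhouse"});

        /// Both the object registry and the DWARF cache keep the ELF alive;
        /// with unique_ptr one side would have to hold a raw reference instead.
        std::map<std::string, std::shared_ptr<Elf>> objects{{elf->path, elf}};
        std::map<std::string, std::shared_ptr<Elf>> dwarf_cache{{elf->path, elf}};

        return elf.use_count() == 3 ? 0 : 1;  /// elf + two map entries
    }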
@ -331,7 +331,7 @@ public:
class IKeeper
{
public:
    virtual ~IKeeper() {}
    virtual ~IKeeper() = default;

    /// If expired, you can only destroy the object. All other methods will throw an exception.
    virtual bool isExpired() const = 0;
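`= default` is the modern spelling of the empty destructor body here and behaves identically; the load-bearing keyword is virtual, which makes deletion through a base pointer safe. A small reminder of why:

    #include <cstdio>
    #include <memory>

    struct Base
    {
        virtual ~Base() = default;  /// without `virtual`, deleting via Base* is UB
    };

    struct Derived : Base
    {
        ~Derived() override { std::puts("Derived cleaned up"); }
    };

    int main()
    {
        std::unique_ptr<Base> p = std::make_unique<Derived>();
    }   /// prints "Derived cleaned up" because ~Base is virtual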
@ -1,139 +0,0 @@
#include <Common/ZooKeeper/TestKeeperStorageDispatcher.h>
#include <Common/setThreadName.h>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
    extern const int TIMEOUT_EXCEEDED;
}

}
namespace zkutil
{

void TestKeeperStorageDispatcher::processingThread()
{
    setThreadName("TestKeeperSProc");

    while (!shutdown)
    {
        RequestInfo info;

        UInt64 max_wait = UInt64(operation_timeout.totalMilliseconds());

        if (requests_queue.tryPop(info, max_wait))
        {
            if (shutdown)
                break;

            try
            {
                auto responses = storage.processRequest(info.request, info.session_id);
                for (const auto & response_for_session : responses)
                    setResponse(response_for_session.session_id, response_for_session.response);
            }
            catch (...)
            {
                tryLogCurrentException(__PRETTY_FUNCTION__);
            }
        }
    }
}

void TestKeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response)
{
    std::lock_guard lock(session_to_response_callback_mutex);
    auto session_writer = session_to_response_callback.find(session_id);
    if (session_writer == session_to_response_callback.end())
        return;

    session_writer->second(response);
    /// Session closed, no more writes
    if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close)
        session_to_response_callback.erase(session_writer);
}

void TestKeeperStorageDispatcher::finalize()
{
    {
        std::lock_guard lock(push_request_mutex);

        if (shutdown)
            return;

        shutdown = true;

        if (processing_thread.joinable())
            processing_thread.join();
    }

    RequestInfo info;
    TestKeeperStorage::RequestsForSessions expired_requests;
    while (requests_queue.tryPop(info))
        expired_requests.push_back(TestKeeperStorage::RequestForSession{info.session_id, info.request});

    auto expired_responses = storage.finalize(expired_requests);

    for (const auto & response_for_session : expired_responses)
        setResponse(response_for_session.session_id, response_for_session.response);
}

void TestKeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id)
{
    {
        std::lock_guard lock(session_to_response_callback_mutex);
        if (session_to_response_callback.count(session_id) == 0)
            throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown session id {}", session_id);
    }

    RequestInfo request_info;
    request_info.time = clock::now();
    request_info.request = request;
    request_info.session_id = session_id;

    std::lock_guard lock(push_request_mutex);
    /// Put close requests without timeouts
    if (request->getOpNum() == Coordination::OpNum::Close)
        requests_queue.push(std::move(request_info));
    else if (!requests_queue.tryPush(std::move(request_info), operation_timeout.totalMilliseconds()))
        throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED);
}

TestKeeperStorageDispatcher::TestKeeperStorageDispatcher()
{
    processing_thread = ThreadFromGlobalPool([this] { processingThread(); });
}

TestKeeperStorageDispatcher::~TestKeeperStorageDispatcher()
{
    try
    {
        finalize();
    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__);
    }
}

void TestKeeperStorageDispatcher::registerSession(int64_t session_id, ZooKeeperResponseCallback callback)
{
    std::lock_guard lock(session_to_response_callback_mutex);
    if (!session_to_response_callback.try_emplace(session_id, callback).second)
        throw Exception(DB::ErrorCodes::LOGICAL_ERROR, "Session with id {} already registered in dispatcher", session_id);
}

void TestKeeperStorageDispatcher::finishSession(int64_t session_id)
{
    std::lock_guard lock(session_to_response_callback_mutex);
    auto session_it = session_to_response_callback.find(session_id);
    if (session_it != session_to_response_callback.end())
        session_to_response_callback.erase(session_it);
}

}
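The deleted dispatcher follows a classic single-consumer shape: one background thread pops requests from a bounded queue until a shutdown flag flips, and finalize() drains what is left. A self-contained sketch of that skeleton using only the standard library (ConcurrentBoundedQueue, ThreadFromGlobalPool, and the storage are ClickHouse types, replaced here by toy stand-ins):

    #include <atomic>
    #include <chrono>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <queue>
    #include <thread>

    struct Dispatcher
    {
        std::queue<int> queue;             /// stand-in for ConcurrentBoundedQueue<RequestInfo>
        std::mutex mutex;
        std::condition_variable cv;
        std::atomic<bool> shutdown{false};
        std::thread worker{[this] { run(); }};

        void run()                         /// mirrors processingThread()
        {
            while (!shutdown)
            {
                std::unique_lock lock(mutex);
                cv.wait_for(lock, std::chrono::milliseconds(100), [&] { return !queue.empty() || shutdown.load(); });
                while (!queue.empty())
                {
                    int request = queue.front();
                    queue.pop();
                    std::printf("processed request %d\n", request);  /// storage.processRequest(...)
                }
            }
        }

        void put(int request)              /// mirrors putRequest()
        {
            { std::lock_guard lock(mutex); queue.push(request); }
            cv.notify_one();
        }

        ~Dispatcher()                      /// mirrors finalize()
        {
            shutdown = true;
            cv.notify_one();
            worker.join();
        }
    };

    int main()
    {
        Dispatcher d;
        d.put(1);
        d.put(2);
    }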
@ -1,60 +0,0 @@
#pragma once

#include <Common/ThreadPool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/TestKeeperStorage.h>
#include <functional>

namespace zkutil
{

using ZooKeeperResponseCallback = std::function<void(const Coordination::ZooKeeperResponsePtr & response)>;

class TestKeeperStorageDispatcher
{
private:
    Poco::Timespan operation_timeout{0, Coordination::DEFAULT_OPERATION_TIMEOUT_MS * 1000};

    using clock = std::chrono::steady_clock;

    struct RequestInfo
    {
        Coordination::ZooKeeperRequestPtr request;
        clock::time_point time;
        int64_t session_id;
    };

    std::mutex push_request_mutex;

    using RequestsQueue = ConcurrentBoundedQueue<RequestInfo>;
    RequestsQueue requests_queue{1};
    std::atomic<bool> shutdown{false};
    using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;

    std::mutex session_to_response_callback_mutex;
    SessionToResponseCallback session_to_response_callback;

    ThreadFromGlobalPool processing_thread;

    TestKeeperStorage storage;

private:
    void processingThread();
    void finalize();
    void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response);

public:
    TestKeeperStorageDispatcher();
    ~TestKeeperStorageDispatcher();

    void putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id);
    int64_t getSessionID()
    {
        return storage.getSessionID();
    }
    void registerSession(int64_t session_id, ZooKeeperResponseCallback callback);
    /// Call when responses for this session are no longer needed (the session has expired)
    void finishSession(int64_t session_id);
};

}
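The header's session_to_response_callback map is a mutex-guarded registry from session id to response callback. A toy standalone version of that registry (names are illustrative):

    #include <cstdint>
    #include <cstdio>
    #include <functional>
    #include <mutex>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    using Callback = std::function<void(const std::string &)>;

    class SessionRegistry   /// toy version of the session_to_response_callback map
    {
        std::mutex mutex;
        std::unordered_map<int64_t, Callback> callbacks;

    public:
        void registerSession(int64_t id, Callback cb)
        {
            std::lock_guard lock(mutex);
            if (!callbacks.try_emplace(id, std::move(cb)).second)
                throw std::logic_error("session already registered");
        }

        void finishSession(int64_t id)
        {
            std::lock_guard lock(mutex);
            callbacks.erase(id);
        }

        void respond(int64_t id, const std::string & response)   /// mirrors setResponse()
        {
            std::lock_guard lock(mutex);
            auto it = callbacks.find(id);
            if (it != callbacks.end())
                it->second(response);
        }
    };

    int main()
    {
        SessionRegistry registry;
        registry.registerSession(1, [](const std::string & r) { std::printf("session 1: %s\n", r.c_str()); });
        registry.respond(1, "ok");
        registry.finishSession(1);
    }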
@ -37,6 +37,26 @@ void ZooKeeperRequest::write(WriteBuffer & out) const
    out.next();
}

void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const
{
    Coordination::write(path, out);
}

void ZooKeeperSyncRequest::readImpl(ReadBuffer & in)
{
    Coordination::read(path, in);
}

void ZooKeeperSyncResponse::readImpl(ReadBuffer & in)
{
    Coordination::read(path, in);
}

void ZooKeeperSyncResponse::writeImpl(WriteBuffer & out) const
{
    Coordination::write(path, out);
}

void ZooKeeperWatchResponse::readImpl(ReadBuffer & in)
{
    Coordination::read(type, in);
@ -51,6 +71,13 @@ void ZooKeeperWatchResponse::writeImpl(WriteBuffer & out) const
    Coordination::write(path, out);
}

void ZooKeeperWatchResponse::write(WriteBuffer & out) const
{
    if (error == Error::ZOK)
        ZooKeeperResponse::write(out);
    /// skip bad responses for watches
}

void ZooKeeperAuthRequest::writeImpl(WriteBuffer & out) const
{
    Coordination::write(type, out);
@ -326,6 +353,12 @@ void ZooKeeperMultiRequest::readImpl(ReadBuffer & in)
    }
}

bool ZooKeeperMultiRequest::isReadRequest() const
{
    /// Possibly we can do better
    return false;
}

void ZooKeeperMultiResponse::readImpl(ReadBuffer & in)
{
    for (auto & response : responses)
@ -410,6 +443,7 @@ void ZooKeeperMultiResponse::writeImpl(WriteBuffer & out) const
}

ZooKeeperResponsePtr ZooKeeperHeartbeatRequest::makeResponse() const { return std::make_shared<ZooKeeperHeartbeatResponse>(); }
ZooKeeperResponsePtr ZooKeeperSyncRequest::makeResponse() const { return std::make_shared<ZooKeeperSyncResponse>(); }
ZooKeeperResponsePtr ZooKeeperAuthRequest::makeResponse() const { return std::make_shared<ZooKeeperAuthResponse>(); }
ZooKeeperResponsePtr ZooKeeperCreateRequest::makeResponse() const { return std::make_shared<ZooKeeperCreateResponse>(); }
ZooKeeperResponsePtr ZooKeeperRemoveRequest::makeResponse() const { return std::make_shared<ZooKeeperRemoveResponse>(); }
@ -465,6 +499,7 @@ void registerZooKeeperRequest(ZooKeeperRequestFactory & factory)
ZooKeeperRequestFactory::ZooKeeperRequestFactory()
{
    registerZooKeeperRequest<OpNum::Heartbeat, ZooKeeperHeartbeatRequest>(*this);
    registerZooKeeperRequest<OpNum::Sync, ZooKeeperSyncRequest>(*this);
    registerZooKeeperRequest<OpNum::Auth, ZooKeeperAuthRequest>(*this);
    registerZooKeeperRequest<OpNum::Close, ZooKeeperCloseRequest>(*this);
    registerZooKeeperRequest<OpNum::Create, ZooKeeperCreateRequest>(*this);
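The Coordination::read/write helpers used by these Sync methods implement ZooKeeper's wire encoding; assuming the usual jute layout, a string travels as a big-endian 32-bit length followed by raw bytes. A standalone sketch under that assumption:

    #include <cstdint>
    #include <cstdio>
    #include <string>
    #include <vector>

    /// Append a string in length-prefixed form: big-endian int32 size, then bytes.
    /// (Assumed layout, mirroring ZooKeeper's jute encoding of strings/buffers.)
    void writeString(const std::string & s, std::vector<uint8_t> & out)
    {
        uint32_t size = uint32_t(s.size());
        out.push_back(uint8_t(size >> 24));
        out.push_back(uint8_t(size >> 16));
        out.push_back(uint8_t(size >> 8));
        out.push_back(uint8_t(size));
        out.insert(out.end(), s.begin(), s.end());
    }

    int main()
    {
        std::vector<uint8_t> buf;
        writeString("/clickhouse/tables", buf);   /// e.g. the `path` of a Sync request
        std::printf("encoded %zu bytes (4-byte length + %zu payload)\n", buf.size(), buf.size() - 4);
    }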
@ -30,7 +30,7 @@ struct ZooKeeperResponse : virtual Response
    virtual ~ZooKeeperResponse() override = default;
    virtual void readImpl(ReadBuffer &) = 0;
    virtual void writeImpl(WriteBuffer &) const = 0;
    void write(WriteBuffer & out) const;
    virtual void write(WriteBuffer & out) const;
    virtual OpNum getOpNum() const = 0;
};

@ -60,6 +60,7 @@ struct ZooKeeperRequest : virtual Request
    static std::shared_ptr<ZooKeeperRequest> read(ReadBuffer & in);

    virtual ZooKeeperResponsePtr makeResponse() const = 0;
    virtual bool isReadRequest() const = 0;
};

using ZooKeeperRequestPtr = std::shared_ptr<ZooKeeperRequest>;
@ -71,6 +72,26 @@ struct ZooKeeperHeartbeatRequest final : ZooKeeperRequest
    void writeImpl(WriteBuffer &) const override {}
    void readImpl(ReadBuffer &) override {}
    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperSyncRequest final : ZooKeeperRequest
{
    String path;
    String getPath() const override { return path; }
    OpNum getOpNum() const override { return OpNum::Sync; }
    void writeImpl(WriteBuffer & out) const override;
    void readImpl(ReadBuffer & in) override;
    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperSyncResponse final : ZooKeeperResponse
{
    String path;
    void readImpl(ReadBuffer & in) override;
    void writeImpl(WriteBuffer & out) const override;
    OpNum getOpNum() const override { return OpNum::Sync; }
};

struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse
@ -86,6 +107,8 @@ struct ZooKeeperWatchResponse final : WatchResponse, ZooKeeperResponse

    void writeImpl(WriteBuffer & out) const override;

    void write(WriteBuffer & out) const override;

    OpNum getOpNum() const override
    {
        throw Exception("OpNum for watch response doesn't exist", Error::ZRUNTIMEINCONSISTENCY);
@ -104,6 +127,7 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperAuthResponse final : ZooKeeperResponse
@ -122,6 +146,7 @@ struct ZooKeeperCloseRequest final : ZooKeeperRequest
    void readImpl(ReadBuffer &) override {}

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperCloseResponse final : ZooKeeperResponse
@ -146,6 +171,7 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse
@ -167,6 +193,7 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse
@ -183,6 +210,7 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return !has_watch; }
};

struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse
@ -199,6 +227,7 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return !has_watch; }
};

struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse
@ -217,6 +246,7 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
    void writeImpl(WriteBuffer & out) const override;
    void readImpl(ReadBuffer & in) override;
    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return false; }
};

struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse
@ -232,6 +262,7 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
    void writeImpl(WriteBuffer & out) const override;
    void readImpl(ReadBuffer & in) override;
    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return !has_watch; }
};

struct ZooKeeperSimpleListRequest final : ZooKeeperListRequest
@ -261,6 +292,7 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override { return !has_watch; }
};

struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse
@ -290,6 +322,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
    void readImpl(ReadBuffer & in) override;

    ZooKeeperResponsePtr makeResponse() const override;
    bool isReadRequest() const override;
};

struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse
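registerZooKeeperRequest<OpNum::Sync, ZooKeeperSyncRequest>(*this) above registers a creator per opcode so a request read off the wire can be materialized by opcode alone. A toy analogue of such a factory (enum values are illustrative):

    #include <functional>
    #include <memory>
    #include <stdexcept>
    #include <unordered_map>

    enum class OpNum { Sync = 9, Heartbeat = 11 };   /// values illustrative

    struct Request { virtual ~Request() = default; };
    struct SyncRequest : Request {};
    struct HeartbeatRequest : Request {};

    class RequestFactory   /// toy analogue of ZooKeeperRequestFactory
    {
        std::unordered_map<OpNum, std::function<std::shared_ptr<Request>()>> creators;

    public:
        template <OpNum num, typename T>
        void registerRequest()
        {
            creators[num] = [] { return std::make_shared<T>(); };
        }

        std::shared_ptr<Request> create(OpNum num) const
        {
            auto it = creators.find(num);
            if (it == creators.end())
                throw std::runtime_error("unknown op num");
            return it->second();
        }
    };

    int main()
    {
        RequestFactory factory;
        factory.registerRequest<OpNum::Sync, SyncRequest>();
        factory.registerRequest<OpNum::Heartbeat, HeartbeatRequest>();
        auto req = factory.create(OpNum::Sync);   /// deserialization would then call readImpl()
    }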
@ -15,6 +15,7 @@ static const std::unordered_set<int32_t> VALID_OPERATIONS =
    static_cast<int32_t>(OpNum::Get),
    static_cast<int32_t>(OpNum::Set),
    static_cast<int32_t>(OpNum::SimpleList),
    static_cast<int32_t>(OpNum::Sync),
    static_cast<int32_t>(OpNum::Heartbeat),
    static_cast<int32_t>(OpNum::List),
    static_cast<int32_t>(OpNum::Check),
@ -48,6 +49,8 @@ std::string toString(OpNum op_num)
        return "Check";
    case OpNum::Multi:
        return "Multi";
    case OpNum::Sync:
        return "Sync";
    case OpNum::Heartbeat:
        return "Heartbeat";
    case OpNum::Auth:
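VALID_OPERATIONS above acts as an allowlist when converting a raw wire int32 into OpNum. A minimal sketch of such a validated cast (enum subset and values are illustrative):

    #include <cstdint>
    #include <stdexcept>
    #include <string>
    #include <unordered_set>

    enum class OpNum : int32_t { Sync = 9, Heartbeat = 11, Check = 13, Multi = 14 };  /// subset, illustrative

    static const std::unordered_set<int32_t> VALID_OPERATIONS = {9, 11, 13, 14};

    /// Validated cast from a raw wire value to the enum.
    OpNum getOpNum(int32_t raw)
    {
        if (!VALID_OPERATIONS.count(raw))
            throw std::runtime_error("invalid op num " + std::to_string(raw));
        return static_cast<OpNum>(raw);
    }

    int main()
    {
        OpNum op = getOpNum(9);           /// OpNum::Sync
        return op == OpNum::Sync ? 0 : 1;
    }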
Some files were not shown because too many files have changed in this diff.