Merge branch 'ClickHouse:master' into typo

2024-11-19 06:01:57 +00:00 · 2022-05-31 11:28:12 +08:00 · 2022-05-31 11:28:12 +08:00 · 53020b096d
commit 53020b096d
parent ca67e67a74 c6b20cd5ed
537 changed files with 11421 additions and 11422 deletions
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@ -359,15 +359,11 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          CHECK_NAME=ClickHouse build check (actions)
          REPORTS_PATH=${{runner.temp}}/reports_dir
          TEMP_PATH=${{runner.temp}}/report_check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -382,8 +378,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
--- a/.github/workflows/jepsen.yml
+++ b/.github/workflows/jepsen.yml
@ -7,11 +7,8 @@ concurrency:
 on: # yamllint disable-line rule:truthy
  schedule:
    - cron: '0 */6 * * *'
-  workflow_run:
-    workflows: ["PullRequestCI"]
-    types:
-      - completed
  workflow_dispatch:
+  workflow_call:
 jobs:
  KeeperJepsenRelease:
    runs-on: [self-hosted, style-checker]
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -970,16 +970,12 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          CHECK_NAME=ClickHouse build check (actions)
          REPORTS_PATH=${{runner.temp}}/reports_dir
          REPORTS_PATH=${{runner.temp}}/reports_dir
          TEMP_PATH=${{runner.temp}}/report_check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -994,8 +990,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
@ -1018,15 +1017,11 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          TEMP_PATH=${{runner.temp}}/report_check
          REPORTS_PATH=${{runner.temp}}/reports_dir
          CHECK_NAME=ClickHouse special build check (actions)
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -1041,8 +1036,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -1025,15 +1025,11 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          CHECK_NAME=ClickHouse build check (actions)
          REPORTS_PATH=${{runner.temp}}/reports_dir
          TEMP_PATH=${{runner.temp}}/report_check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -1048,8 +1044,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
@ -1073,15 +1072,11 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          TEMP_PATH=${{runner.temp}}/report_check
          REPORTS_PATH=${{runner.temp}}/reports_dir
          CHECK_NAME=ClickHouse special build check (actions)
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -1096,8 +1091,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
@ -3272,6 +3270,13 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
+#############################################################################################
+###################################### JEPSEN TESTS #########################################
+#############################################################################################
+  Jepsen:
+    needs: [BuilderBinRelease]
+    uses: ./.github/workflows/jepsen.yml
+
  FinishCheck:
    needs:
      - StyleCheck
@ -3336,6 +3341,7 @@ jobs:
      - SplitBuildSmokeTest
      - CompatibilityCheck
      - IntegrationTestsFlakyCheck
+      - Jepsen
    runs-on: [self-hosted, style-checker]
    steps:
      - name: Clear repository
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@ -442,16 +442,12 @@ jobs:
    steps:
      - name: Set envs
        run: |
-          DEPENDENCIES=$(cat << 'EOF' | jq '. | length'
-          ${{ toJSON(needs) }}
-          EOF
-          )
-          echo "DEPENDENCIES=$DEPENDENCIES" >> "$GITHUB_ENV"
          cat >> "$GITHUB_ENV" << 'EOF'
          CHECK_NAME=ClickHouse build check (actions)
          REPORTS_PATH=${{runner.temp}}/reports_dir
          REPORTS_PATH=${{runner.temp}}/reports_dir
          TEMP_PATH=${{runner.temp}}/report_check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
          EOF
      - name: Download json reports
        uses: actions/download-artifact@v2
@ -466,8 +462,11 @@ jobs:
        run: |
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
          cd "$GITHUB_WORKSPACE/tests/ci"
-          python3 build_report_check.py "$CHECK_NAME" "$DEPENDENCIES"
+          python3 build_report_check.py "$CHECK_NAME"
      - name: Cleanup
        if: always()
        run: |
--- a/SECURITY.md
+++ b/SECURITY.md
@ -25,9 +25,11 @@ The following versions of ClickHouse server are currently being supported with s
 | 21.10   | :x: |
 | 21.11   | :x: |
 | 21.12   | :x: |
-| 22.1   | ✅ |
-| 22.2   | ✅ |
+| 22.1   | :x: |
+| 22.2   | :x: |
 | 22.3   | ✅ |
+| 22.4   | ✅ |
+| 22.5   | ✅ |

 ## Reporting a Vulnerability

--- a/contrib/cctz
+++ b/contrib/cctz
@ -1 +1 @@
-Subproject commit 9edd0861d8328b2ae77e8fb5f4d7dcd1cf33b42b
+Subproject commit 8c71d74bdf76c3fa401da845089ae60a6c0aeefa
--- a/docker/server/Dockerfile.alpine
+++ b/docker/server/Dockerfile.alpine
@ -63,7 +63,7 @@ RUN arch=${TARGETARCH:-amd64} \
    && chown clickhouse:clickhouse /var/lib/clickhouse \
    && chown root:clickhouse /var/log/clickhouse-server \
    && chmod +x /entrypoint.sh \
-    && apk add --no-cache su-exec bash tzdata \
+    && apk add --no-cache bash tzdata \
    && cp /usr/share/zoneinfo/UTC /etc/localtime \
    && echo "UTC" > /etc/timezone \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client
--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@ -3,8 +3,6 @@ FROM ubuntu:20.04
 # see https://github.com/moby/moby/issues/4032#issuecomment-192327844
 ARG DEBIAN_FRONTEND=noninteractive

-COPY su-exec.c /su-exec.c
-
 # ARG for quick switch to a given ubuntu mirror
 ARG apt_archive="http://archive.ubuntu.com"
 RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list \
@ -19,17 +17,11 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
        locales \
        wget \
        tzdata \
-    && apt-get install -y --no-install-recommends tcc libc-dev && \
-        tcc /su-exec.c -o /bin/su-exec && \
-        chown root:root /bin/su-exec && \
-        chmod 0755 /bin/su-exec && \
-        rm /su-exec.c && \
-        apt-get purge -y --auto-remove tcc libc-dev libc-dev-bin libc6-dev linux-libc-dev \
    && apt-get clean

 ARG REPO_CHANNEL="stable"
 ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
-ARG VERSION=22.1.1.*
+ARG VERSION=22.5.1.*
 ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 # set non-empty deb_location_url url to create a docker image
@ -51,21 +43,6 @@ ARG single_binary_location_url=""
 # installed to prevent picking those uid / gid by some unrelated software.
 # The same uid / gid (101) is used both for alpine and ubuntu.

-# To drop privileges, we need 'su' command, that simply changes uid and gid.
-# In fact, the 'su' command from Linux is not so simple, due to inherent vulnerability in Linux:
-# https://ruderich.org/simon/notes/su-sudo-from-root-tty-hijacking
-# It has to mitigate this drawback of Linux, and to do this, 'su' command is creating it's own pseudo-terminal
-# and forwarding commands. Due to some ridiculous curcumstances, it does not work in Docker (or it does)
-# and for these reasons people are using alternatives to the 'su' command in Docker,
-# that don't mess with the terminal, don't care about closing the opened files, etc...
-# but can only be safe to drop privileges inside Docker.
-# The question - what implementation of 'su' command to use.
-# It should be a simple script doing about just two syscalls.
-# Some people tend to use 'gosu' tool that is written in Go.
-# It is not used for several reasons:
-# 1. Dependency on some foreign code in yet another programming language - does not sound alright.
-# 2. Anselmo D. Adams suggested not to use it due to false positive alarms in some undisclosed security scanners.
-
 ARG TARGETARCH

 RUN arch=${TARGETARCH:-amd64} \
--- a/docker/server/entrypoint.sh
+++ b/docker/server/entrypoint.sh
@ -15,29 +15,15 @@ CLICKHOUSE_GID="${CLICKHOUSE_GID:-"$(id -g clickhouse)"}"
 if [ "$(id -u)" = "0" ]; then
    USER=$CLICKHOUSE_UID
    GROUP=$CLICKHOUSE_GID
-    if command -v gosu &> /dev/null; then
-        gosu="gosu $USER:$GROUP"
-    elif command -v su-exec &> /dev/null; then
-        gosu="su-exec $USER:$GROUP"
-    else
-        echo "No gosu/su-exec detected!"
-        exit 1
-    fi
 else
    USER="$(id -u)"
    GROUP="$(id -g)"
-    gosu=""
    DO_CHOWN=0
 fi

 # set some vars
 CLICKHOUSE_CONFIG="${CLICKHOUSE_CONFIG:-/etc/clickhouse-server/config.xml}"

-if ! $gosu test -f "$CLICKHOUSE_CONFIG" -a -r "$CLICKHOUSE_CONFIG"; then
-    echo "Configuration file '$CLICKHOUSE_CONFIG' isn't readable by user with id '$USER'"
-    exit 1
-fi
-
 # get CH directories locations
 DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=path || true)"
 TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)"
@ -65,12 +51,7 @@ do
    # check if variable not empty
    [ -z "$dir" ] && continue
    # ensure directories exist
-    if [ "$DO_CHOWN" = "1" ]; then
-      mkdir="mkdir"
-    else
-      mkdir="$gosu mkdir"
-    fi
-    if ! $mkdir -p "$dir"; then
+    if ! mkdir -p "$dir"; then
        echo "Couldn't create necessary directory: $dir"
        exit 1
    fi
@ -81,9 +62,6 @@ do
        if [ "$(stat -c %u "$dir")" != "$USER" ] || [ "$(stat -c %g "$dir")" != "$GROUP" ]; then
            chown -R "$USER:$GROUP" "$dir"
        fi
-    elif ! $gosu test -d "$dir" -a -w "$dir" -a -r "$dir"; then
-        echo "Necessary directory '$dir' isn't accessible by user with id '$USER'"
-        exit 1
    fi
 done

@ -117,7 +95,7 @@ if [ -n "$(ls /docker-entrypoint-initdb.d/)" ] || [ -n "$CLICKHOUSE_DB" ]; then
    HTTP_PORT="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=http_port)"

    # Listen only on localhost until the initialization is done
-    $gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 &
+    /usr/bin/clickhouse su "${USER}:${GROUP}" /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" -- --listen_host=127.0.0.1 &
    pid="$!"

    # check if clickhouse is ready to accept connections
@ -173,7 +151,7 @@ if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then
    # so the container can't be finished by ctrl+c
    CLICKHOUSE_WATCHDOG_ENABLE=${CLICKHOUSE_WATCHDOG_ENABLE:-0}
    export CLICKHOUSE_WATCHDOG_ENABLE
-    exec $gosu /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" "$@"
+    /usr/bin/clickhouse su "${USER}:${GROUP}" /usr/bin/clickhouse-server --config-file="$CLICKHOUSE_CONFIG" "$@"
 fi

 # Otherwise, we assume the user want to run his own process, for example a `bash` shell to explore this image
--- a/docker/server/su-exec.c
+++ b/docker/server/su-exec.c
@ -1,138 +0,0 @@
-/*
-
-https://github.com/ncopa/su-exec
-The file is copy-pasted verbatim to avoid supply chain attacks.
-
-The MIT License (MIT)
-
-Copyright (c) 2015 ncopa
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-/* set user and group id and exec */
-
-#include <sys/types.h>
-
-#include <err.h>
-#include <errno.h>
-#include <grp.h>
-#include <pwd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-
-static char *argv0;
-
-static void usage(int exitcode)
-{
-	printf("Usage: %s user-spec command [args]\n", argv0);
-	exit(exitcode);
-}
-
-int main(int argc, char *argv[])
-{
-	char *user, *group, **cmdargv;
-	char *end;
-
-	uid_t uid = getuid();
-	gid_t gid = getgid();
-
-	argv0 = argv[0];
-	if (argc < 3)
-		usage(0);
-
-	user = argv[1];
-	group = strchr(user, ':');
-	if (group)
-		*group++ = '\0';
-
-	cmdargv = &argv[2];
-
-	struct passwd *pw = NULL;
-	if (user[0] != '\0') {
-		uid_t nuid = strtol(user, &end, 10);
-		if (*end == '\0')
-			uid = nuid;
-		else {
-			pw = getpwnam(user);
-			if (pw == NULL)
-				err(1, "getpwnam(%s)", user);
-		}
-	}
-	if (pw == NULL) {
-		pw = getpwuid(uid);
-	}
-	if (pw != NULL) {
-		uid = pw->pw_uid;
-		gid = pw->pw_gid;
-	}
-
-	setenv("HOME", pw != NULL ? pw->pw_dir : "/", 1);
-
-	if (group && group[0] != '\0') {
-		/* group was specified, ignore grouplist for setgroups later */
-		pw = NULL;
-
-		gid_t ngid = strtol(group, &end, 10);
-		if (*end == '\0')
-			gid = ngid;
-		else {
-			struct group *gr = getgrnam(group);
-			if (gr == NULL)
-				err(1, "getgrnam(%s)", group);
-			gid = gr->gr_gid;
-		}
-	}
-
-	if (pw == NULL) {
-		if (setgroups(1, &gid) < 0)
-			err(1, "setgroups(%i)", gid);
-	} else {
-		int ngroups = 0;
-		gid_t *glist = NULL;
-
-		while (1) {
-			int r = getgrouplist(pw->pw_name, gid, glist, &ngroups);
-
-			if (r >= 0) {
-				if (setgroups(ngroups, glist) < 0)
-					err(1, "setgroups");
-				break;
-			}
-
-			glist = realloc(glist, ngroups * sizeof(gid_t));
-			if (glist == NULL)
-				err(1, "malloc");
-		}
-	}
-
-	if (setgid(gid) < 0)
-		err(1, "setgid(%i)", gid);
-
-	if (setuid(uid) < 0)
-		err(1, "setuid(%i)", uid);
-
-	execvp(cmdargv[0], cmdargv);
-	err(1, "%s", cmdargv[0]);
-
-	return 1;
-}
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -355,22 +355,8 @@ fi
 cat > report.html <<EOF ||:
 <!DOCTYPE html>
 <html lang="en">
-<link rel="preload" as="font" href="https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2" type="font/woff2" crossorigin="anonymous"/>
  <style>
-@font-face {
-    font-family:'Yandex Sans Display Web';
-    src:url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot);
-    src:url(https://yastatic.net/adv-www/_/H63jN0veW07XQUIA2317lr9UIm8.eot?#iefix) format('embedded-opentype'),
-            url(https://yastatic.net/adv-www/_/sUYVCPUAQE7ExrvMS7FoISoO83s.woff2) format('woff2'),
-            url(https://yastatic.net/adv-www/_/v2Sve_obH3rKm6rKrtSQpf-eB7U.woff) format('woff'),
-            url(https://yastatic.net/adv-www/_/PzD8hWLMunow5i3RfJ6WQJAL7aI.ttf) format('truetype'),
-            url(https://yastatic.net/adv-www/_/lF_KG5g4tpQNlYIgA0e77fBSZ5s.svg#YandexSansDisplayWeb-Regular) format('svg');
-    font-weight:400;
-    font-style:normal;
-    font-stretch:normal
-}
-
-body { font-family: "Yandex Sans Display Web", Arial, sans-serif; background: #EEE; }
+body { font-family: "DejaVu Sans", "Noto Sans", Arial, sans-serif; background: #EEE; }
 h1 { margin-left: 10px; }
 th, td { border: 0; padding: 5px 10px 5px 10px; text-align: left; vertical-align: top; line-height: 1.5; background-color: #FFF;
 td { white-space: pre; font-family: Monospace, Courier New; }
@ -378,7 +364,6 @@ border: 0; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0,
 a { color: #06F; text-decoration: none; }
 a:hover, a:active { color: #F40; text-decoration: underline; }
 table { border: 0; }
-.main { margin-left: 10%; }
 p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-space: nowrap; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
 th { cursor: pointer; }

--- a/docs/en/development/architecture.md
+++ b/docs/en/development/architecture.md
@ -121,7 +121,7 @@ There are ordinary functions and aggregate functions. For aggregate functions, s

 Ordinary functions do not change the number of rows – they work as if they are processing each row independently. In fact, functions are not called for individual rows, but for `Block`’s of data to implement vectorized query execution.

-There are some miscellaneous functions, like [blockSize](../sql-reference/functions/other-functions.md#function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#function-rownumberinblock), and [runningAccumulate](../sql-reference/functions/other-functions.md#runningaccumulate), that exploit block processing and violate the independence of rows.
+There are some miscellaneous functions, like [blockSize](../sql-reference/functions/other-functions.md#blocksize-function-blocksize), [rowNumberInBlock](../sql-reference/functions/other-functions.md#rownumberinblock-function-rownumberinblock), and [runningAccumulate](../sql-reference/functions/other-functions.md#runningaccumulate-runningaccumulate), that exploit block processing and violate the independence of rows.

 ClickHouse has strong typing, so there’s no implicit type conversion. If a function does not support a specific combination of types, it throws an exception. But functions can work (be overloaded) for many different combinations of types. For example, the `plus` function (to implement the `+` operator) works for any combination of numeric types: `UInt8` + `Float32`, `UInt16` + `Int8`, and so on. Also, some variadic functions can accept any number of arguments, such as the `concat` function.

--- a/docs/en/development/contrib.md
+++ b/docs/en/development/contrib.md
@ -92,16 +92,13 @@ The list of third-party libraries can be obtained by the following query:
 SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en';
 ```

-[Example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
+[Example](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)

-## Guidelines for adding new third-party libraries and maintaining custom changes in them {#adding-third-party-libraries}
+## Adding new third-party libraries and maintaining patches in third-party libraries {#adding-third-party-libraries}

-1. All external third-party code should reside in the dedicated directories under `contrib` directory of ClickHouse repo. Prefer Git submodules, when available.
-2. Fork/mirror the official repo in [Clickhouse-extras](https://github.com/ClickHouse-Extras). Prefer official GitHub repos, when available.
-3. Branch from the branch you want to integrate, e.g., `master` -> `clickhouse/master`, or `release/vX.Y.Z` -> `clickhouse/release/vX.Y.Z`.
-4. All forks in [Clickhouse-extras](https://github.com/ClickHouse-Extras) can be automatically synchronized with upstreams. `clickhouse/...` branches will remain unaffected, since virtually nobody is going to use that naming pattern in their upstream repos.
-5. Add submodules under `contrib` of ClickHouse repo that refer the above forks/mirrors. Set the submodules to track the corresponding `clickhouse/...` branches.
-6. Every time the custom changes have to be made in the library code, a dedicated branch should be created, like `clickhouse/my-fix`. Then this branch should be merged into the branch, that is tracked by the submodule, e.g., `clickhouse/master` or `clickhouse/release/vX.Y.Z`.
-7. No code should be pushed in any branch of the forks in [Clickhouse-extras](https://github.com/ClickHouse-Extras), whose names do not follow `clickhouse/...` pattern.
-8. Always write the custom changes with the official repo in mind. Once the PR is merged from (a feature/fix branch in) your personal fork into the fork in [Clickhouse-extras](https://github.com/ClickHouse-Extras), and the submodule is bumped in ClickHouse repo, consider opening another PR from (a feature/fix branch in) the fork in [Clickhouse-extras](https://github.com/ClickHouse-Extras) to the official repo of the library. This will make sure, that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers.
-9. When a submodule needs to start using a newer code from the original branch (e.g., `master`), and since the custom changes might be merged in the branch it is tracking (e.g., `clickhouse/master`) and so it may diverge from its original counterpart (i.e., `master`), a careful merge should be carried out first, i.e., `master` -> `clickhouse/master`, and only then the submodule can be bumped in ClickHouse.
+1. Each third-party library must reside in a dedicated directory under the `contrib/` directory of the ClickHouse repository. Avoid dumps/copies of external code; instead, use Git's submodule feature to pull third-party code from an external upstream repository.
+2. Submodules are listed in `.gitmodules`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. if the external library requires patching/customization, create a fork of the official repository in the [ClickHouse organization on GitHub](https://github.com/ClickHouse).
+3. In the latter case, create a branch with `clickhouse/` prefix from the branch you want to integrate, e.g. `clickhouse/master` (for `master`) or `clickhouse/release/vX.Y.Z` (for a `release/vX.Y.Z` tag). The purpose of this branch is to isolate customization of the library from upstream work. For example, pulls from the upstream repository into the fork will leave all `clickhouse/` branches unaffected. Submodules in `contrib/` must only track `clickhouse/` branches of forked third-party repositories.
+4. To patch a fork of a third-party library, create a dedicated branch with `clickhouse/` prefix in the fork, e.g. `clickhouse/fix-some-disaster`. Finally, merge the patch branch into the custom tracking branch (e.g. `clickhouse/master` or `clickhouse/release/vX.Y.Z`) using a PR.
+5. Always create patches of third-party libraries with the official repository in mind. Once a PR of a patch branch to the `clickhouse/` branch in the fork repository is done and the submodule version in ClickHouse's official repository is bumped, consider opening another PR from the patch branch to the upstream library repository. This ensures that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers.
+6. To update a submodule with changes in the upstream repository, first merge upstream `master` (or a new `versionX.Y.Z` tag) into the `clickhouse`-tracking branch in the fork repository. Conflicts with patches/customization will need to be resolved in this merge (see Step 4.). Once the merge is done, bump the submodule in ClickHouse to point to the new hash in the fork.
--- a/docs/en/getting-started/example-datasets/brown-benchmark.md
+++ b/docs/en/getting-started/example-datasets/brown-benchmark.md
@ -411,6 +411,6 @@ ORDER BY yr,
         mo;
 ```

-The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
+The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).

 [Original article](https://clickhouse.com/docs/en/getting_started/example_datasets/brown-benchmark/) <!--hide-->
--- a/docs/en/getting-started/example-datasets/cell-towers.md
+++ b/docs/en/getting-started/example-datasets/cell-towers.md
@ -126,6 +126,6 @@ SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM
 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.)
 ```

-The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.com/play?user=play), [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=).
+The data is also available for interactive queries in the [Playground](https://play.clickhouse.com/play?user=play), [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=).

-Although you cannot create temporary tables there.
+Although you cannot create temporary tables there.
--- a/docs/en/getting-started/example-datasets/menus.md
+++ b/docs/en/getting-started/example-datasets/menus.md
@ -351,4 +351,4 @@ At least they have caviar with vodka. Very nice.

 ## Online Playground {#playground}

-The data is uploaded to ClickHouse Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==).
+The data is uploaded to ClickHouse Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICByb3VuZCh0b1VJbnQzMk9yWmVybyhleHRyYWN0KG1lbnVfZGF0ZSwgJ15cXGR7NH0nKSksIC0xKSBBUyBkLAogICAgY291bnQoKSwKICAgIHJvdW5kKGF2ZyhwcmljZSksIDIpLAogICAgYmFyKGF2ZyhwcmljZSksIDAsIDUwLCAxMDApLAogICAgYW55KGRpc2hfbmFtZSkKRlJPTSBtZW51X2l0ZW1fZGVub3JtCldIRVJFIChtZW51X2N1cnJlbmN5IElOICgnRG9sbGFycycsICcnKSkgQU5EIChkID4gMCkgQU5EIChkIDwgMjAyMikgQU5EIChkaXNoX25hbWUgSUxJS0UgJyVjYXZpYXIlJykKR1JPVVAgQlkgZApPUkRFUiBCWSBkIEFTQw==).
--- a/docs/en/getting-started/example-datasets/ontime.md
+++ b/docs/en/getting-started/example-datasets/ontime.md
@ -5,20 +5,9 @@ description: Dataset containing the on-time performance of airline flights

 # OnTime 

-This dataset can be obtained in two ways:
+This dataset contains data from the Bureau of Transportation Statistics.

-   import from raw data
-   download of prepared partitions
-
-## Import from Raw Data {#import-from-raw-data}
-
-Downloading data:
-
-``` bash
-wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2021}_{1..12}.zip
-```
-
-Creating a table:
+## Creating a table

 ``` sql
 CREATE TABLE `ontime`
@ -29,140 +18,138 @@ CREATE TABLE `ontime`
    `DayofMonth`                      UInt8,
    `DayOfWeek`                       UInt8,
    `FlightDate`                      Date,
-    `Reporting_Airline`               String,
+    `Reporting_Airline`               LowCardinality(String),
    `DOT_ID_Reporting_Airline`        Int32,
-    `IATA_CODE_Reporting_Airline`     String,
-    `Tail_Number`                     String,
-    `Flight_Number_Reporting_Airline` String,
+    `IATA_CODE_Reporting_Airline`     LowCardinality(String),
+    `Tail_Number`                     LowCardinality(String),
+    `Flight_Number_Reporting_Airline` LowCardinality(String),
    `OriginAirportID`                 Int32,
    `OriginAirportSeqID`              Int32,
    `OriginCityMarketID`              Int32,
    `Origin`                          FixedString(5),
-    `OriginCityName`                  String,
+    `OriginCityName`                  LowCardinality(String),
    `OriginState`                     FixedString(2),
-    `OriginStateFips`                 String,
-    `OriginStateName`                 String,
+    `OriginStateFips`                 FixedString(2),
+    `OriginStateName`                 LowCardinality(String),
    `OriginWac`                       Int32,
    `DestAirportID`                   Int32,
    `DestAirportSeqID`                Int32,
    `DestCityMarketID`                Int32,
    `Dest`                            FixedString(5),
-    `DestCityName`                    String,
+    `DestCityName`                    LowCardinality(String),
    `DestState`                       FixedString(2),
-    `DestStateFips`                   String,
-    `DestStateName`                   String,
+    `DestStateFips`                   FixedString(2),
+    `DestStateName`                   LowCardinality(String),
    `DestWac`                         Int32,
    `CRSDepTime`                      Int32,
    `DepTime`                         Int32,
    `DepDelay`                        Int32,
    `DepDelayMinutes`                 Int32,
    `DepDel15`                        Int32,
-    `DepartureDelayGroups`            String,
-    `DepTimeBlk`                      String,
+    `DepartureDelayGroups`            LowCardinality(String),
+    `DepTimeBlk`                      LowCardinality(String),
    `TaxiOut`                         Int32,
-    `WheelsOff`                       Int32,
-    `WheelsOn`                        Int32,
+    `WheelsOff`                       LowCardinality(String),
+    `WheelsOn`                        LowCardinality(String),
    `TaxiIn`                          Int32,
    `CRSArrTime`                      Int32,
    `ArrTime`                         Int32,
    `ArrDelay`                        Int32,
    `ArrDelayMinutes`                 Int32,
    `ArrDel15`                        Int32,
-    `ArrivalDelayGroups`              Int32,
-    `ArrTimeBlk`                      String,
-    `Cancelled`                       UInt8,
+    `ArrivalDelayGroups`              LowCardinality(String),
+    `ArrTimeBlk`                      LowCardinality(String),
+    `Cancelled`                       Int8,
    `CancellationCode`                FixedString(1),
-    `Diverted`                        UInt8,
+    `Diverted`                        Int8,
    `CRSElapsedTime`                  Int32,
    `ActualElapsedTime`               Int32,
-    `AirTime`                         Nullable(Int32),
+    `AirTime`                         Int32,
    `Flights`                         Int32,
    `Distance`                        Int32,
-    `DistanceGroup`                   UInt8,
+    `DistanceGroup`                   Int8,
    `CarrierDelay`                    Int32,
    `WeatherDelay`                    Int32,
    `NASDelay`                        Int32,
    `SecurityDelay`                   Int32,
    `LateAircraftDelay`               Int32,
-    `FirstDepTime`                    String,
-    `TotalAddGTime`                   String,
-    `LongestAddGTime`                 String,
-    `DivAirportLandings`              String,
-    `DivReachedDest`                  String,
-    `DivActualElapsedTime`            String,
-    `DivArrDelay`                     String,
-    `DivDistance`                     String,
-    `Div1Airport`                     String,
+    `FirstDepTime`                    Int16,
+    `TotalAddGTime`                   Int16,
+    `LongestAddGTime`                 Int16,
+    `DivAirportLandings`              Int8,
+    `DivReachedDest`                  Int8,
+    `DivActualElapsedTime`            Int16,
+    `DivArrDelay`                     Int16,
+    `DivDistance`                     Int16,
+    `Div1Airport`                     LowCardinality(String),
    `Div1AirportID`                   Int32,
    `Div1AirportSeqID`                Int32,
-    `Div1WheelsOn`                    String,
-    `Div1TotalGTime`                  String,
-    `Div1LongestGTime`                String,
-    `Div1WheelsOff`                   String,
-    `Div1TailNum`                     String,
-    `Div2Airport`                     String,
+    `Div1WheelsOn`                    Int16,
+    `Div1TotalGTime`                  Int16,
+    `Div1LongestGTime`                Int16,
+    `Div1WheelsOff`                   Int16,
+    `Div1TailNum`                     LowCardinality(String),
+    `Div2Airport`                     LowCardinality(String),
    `Div2AirportID`                   Int32,
    `Div2AirportSeqID`                Int32,
-    `Div2WheelsOn`                    String,
-    `Div2TotalGTime`                  String,
-    `Div2LongestGTime`                String,
-    `Div2WheelsOff`                   String,
-    `Div2TailNum`                     String,
-    `Div3Airport`                     String,
+    `Div2WheelsOn`                    Int16,
+    `Div2TotalGTime`                  Int16,
+    `Div2LongestGTime`                Int16,
+    `Div2WheelsOff`                   Int16,
+    `Div2TailNum`                     LowCardinality(String),
+    `Div3Airport`                     LowCardinality(String),
    `Div3AirportID`                   Int32,
    `Div3AirportSeqID`                Int32,
-    `Div3WheelsOn`                    String,
-    `Div3TotalGTime`                  String,
-    `Div3LongestGTime`                String,
-    `Div3WheelsOff`                   String,
-    `Div3TailNum`                     String,
-    `Div4Airport`                     String,
+    `Div3WheelsOn`                    Int16,
+    `Div3TotalGTime`                  Int16,
+    `Div3LongestGTime`                Int16,
+    `Div3WheelsOff`                   Int16,
+    `Div3TailNum`                     LowCardinality(String),
+    `Div4Airport`                     LowCardinality(String),
    `Div4AirportID`                   Int32,
    `Div4AirportSeqID`                Int32,
-    `Div4WheelsOn`                    String,
-    `Div4TotalGTime`                  String,
-    `Div4LongestGTime`                String,
-    `Div4WheelsOff`                   String,
-    `Div4TailNum`                     String,
-    `Div5Airport`                     String,
+    `Div4WheelsOn`                    Int16,
+    `Div4TotalGTime`                  Int16,
+    `Div4LongestGTime`                Int16,
+    `Div4WheelsOff`                   Int16,
+    `Div4TailNum`                     LowCardinality(String),
+    `Div5Airport`                     LowCardinality(String),
    `Div5AirportID`                   Int32,
    `Div5AirportSeqID`                Int32,
-    `Div5WheelsOn`                    String,
-    `Div5TotalGTime`                  String,
-    `Div5LongestGTime`                String,
-    `Div5WheelsOff`                   String,
-    `Div5TailNum`                     String
+    `Div5WheelsOn`                    Int16,
+    `Div5TotalGTime`                  Int16,
+    `Div5LongestGTime`                Int16,
+    `Div5WheelsOff`                   Int16,
+    `Div5TailNum`                     LowCardinality(String)
 ) ENGINE = MergeTree
-      PARTITION BY Year
-      ORDER BY (IATA_CODE_Reporting_Airline, FlightDate)
-      SETTINGS index_granularity = 8192;
+  ORDER BY (Year, Quarter, Month, DayofMonth, FlightDate, IATA_CODE_Reporting_Airline);
+```
+
+## Import from Raw Data {#import-from-raw-data}
+
+Downloading data:
+
+``` bash
+wget --no-check-certificate --continue https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present_{1987..2022}_{1..12}.zip
 ```

 Loading data with multiple threads:

 ``` bash
-ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_with_names_use_header=0 --query='INSERT INTO ontime FORMAT CSVWithNames'"
+ls -1 *.zip | xargs -I{} -P $(nproc) bash -c "echo {}; unzip -cq {} '*.csv' | sed 's/\.00//g' | clickhouse-client --input_format_csv_empty_as_default 1 --query='INSERT INTO ontime FORMAT CSVWithNames'"
 ```

 (if you will have memory shortage or other issues on your server, remove the `-P $(nproc)` part)

-## Download of Prepared Partitions {#download-of-prepared-partitions}
+## Import from a saved copy

-``` bash
-$ curl -O https://datasets.clickhouse.com/ontime/partitions/ontime.tar
-$ tar xvf ontime.tar -C /var/lib/clickhouse # path to ClickHouse data directory
-$ # check permissions of unpacked data, fix if required
-$ sudo service clickhouse-server restart
-$ clickhouse-client --query "select count(*) from datasets.ontime"
+Alternatively, you can import data from a saved copy by the following query:
+
+``` sql
+INSERT INTO ontime SELECT * FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/ontime/csv_by_year/*.csv.gz', CSVWithNames) SETTINGS max_insert_threads = 40;
 ```

-:::note    
-If you will run the queries described below, you have to use the full table name, `datasets.ontime`.
-:::
-
-
-!!! info "Info"
-    If you are using the prepared partitions or the Online Playground replace any occurrence of `IATA_CODE_Reporting_Airline` or `IATA_CODE_Reporting_Airline AS Carrier` in the following queries with `Carrier` (see `describe ontime`).
+The snapshot was created on 2022-05-29.

 ## Queries {#queries}

@ -398,7 +385,7 @@ ORDER BY c DESC
 LIMIT 10;
 ```

-You can also play with the data in Playground, [example](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==).
+You can also play with the data in Playground, [example](https://play.clickhouse.com/play?user=play#U0VMRUNUIERheU9mV2VlaywgY291bnQoKikgQVMgYwpGUk9NIG9udGltZQpXSEVSRSBZZWFyPj0yMDAwIEFORCBZZWFyPD0yMDA4CkdST1VQIEJZIERheU9mV2VlawpPUkRFUiBCWSBjIERFU0M7Cg==).

 This performance test was created by Vadim Tkachenko. See:

--- a/docs/en/getting-started/example-datasets/opensky.md
+++ b/docs/en/getting-started/example-datasets/opensky.md
@ -417,4 +417,4 @@ Result:

 ### Online Playground {#playground}

-You can test other queries to this data set using the interactive resource [Online Playground](https://gh-api.clickhouse.com/play?user=play). For example, [like this](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables here.
+You can test other queries to this data set using the interactive resource [Online Playground](https://play.clickhouse.com/play?user=play). For example, [like this](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBvcmlnaW4sCiAgICBjb3VudCgpLAogICAgcm91bmQoYXZnKGdlb0Rpc3RhbmNlKGxvbmdpdHVkZV8xLCBsYXRpdHVkZV8xLCBsb25naXR1ZGVfMiwgbGF0aXR1ZGVfMikpKSBBUyBkaXN0YW5jZSwKICAgIGJhcihkaXN0YW5jZSwgMCwgMTAwMDAwMDAsIDEwMCkgQVMgYmFyCkZST00gb3BlbnNreQpXSEVSRSBvcmlnaW4gIT0gJycKR1JPVVAgQlkgb3JpZ2luCk9SREVSIEJZIGNvdW50KCkgREVTQwpMSU1JVCAxMDA=). However, please note that you cannot create temporary tables here.
--- a/docs/en/getting-started/example-datasets/recipes.md
+++ b/docs/en/getting-started/example-datasets/recipes.md
@ -334,6 +334,6 @@ Result:

 ### Online Playground

-The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
+The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).

 [Original article](https://clickhouse.com/docs/en/getting-started/example-datasets/recipes/) <!--hide-->
--- a/docs/en/getting-started/example-datasets/star-schema.md
+++ b/docs/en/getting-started/example-datasets/star-schema.md
@ -26,7 +26,6 @@ $ ./dbgen -s 1000 -T c
 $ ./dbgen -s 1000 -T l
 $ ./dbgen -s 1000 -T p
 $ ./dbgen -s 1000 -T s
-$ ./dbgen -s 1000 -T d
 ```

 Creating tables in ClickHouse:
@ -109,10 +108,8 @@ Converting “star schema” to denormalized “flat schema”:
 SET max_memory_usage = 20000000000;

 CREATE TABLE lineorder_flat
-ENGINE = MergeTree
-PARTITION BY toYear(LO_ORDERDATE)
-ORDER BY (LO_ORDERDATE, LO_ORDERKEY) AS
-SELECT
+ENGINE = MergeTree ORDER BY (LO_ORDERDATE, LO_ORDERKEY)
+AS SELECT
    l.LO_ORDERKEY AS LO_ORDERKEY,
    l.LO_LINENUMBER AS LO_LINENUMBER,
    l.LO_CUSTKEY AS LO_CUSTKEY,
--- a/docs/en/getting-started/example-datasets/uk-price-paid.md
+++ b/docs/en/getting-started/example-datasets/uk-price-paid.md
@ -646,4 +646,4 @@ no projection: 100 rows in set. Elapsed: 0.069 sec. Processed 26.32 million rows

 ### Test It in Playground {#playground}

-The dataset is also available in the [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).
+The dataset is also available in the [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUIHRvd24sIGRpc3RyaWN0LCBjb3VudCgpIEFTIGMsIHJvdW5kKGF2ZyhwcmljZSkpIEFTIHByaWNlLCBiYXIocHJpY2UsIDAsIDUwMDAwMDAsIDEwMCkgRlJPTSB1a19wcmljZV9wYWlkIFdIRVJFIGRhdGUgPj0gJzIwMjAtMDEtMDEnIEdST1VQIEJZIHRvd24sIGRpc3RyaWN0IEhBVklORyBjID49IDEwMCBPUkRFUiBCWSBwcmljZSBERVNDIExJTUlUIDEwMA==).
--- a/docs/en/interfaces/third-party/gui.md
+++ b/docs/en/interfaces/third-party/gui.md
@ -147,6 +147,16 @@ Features:

 [Zeppelin-Interpreter-for-ClickHouse](https://github.com/SiderZhang/Zeppelin-Interpreter-for-ClickHouse) is a [Zeppelin](https://zeppelin.apache.org) interpreter for ClickHouse. Compared with JDBC interpreter, it can provide better timeout control for long running queries.

+### ClickCat {#clickcat}
+
+[ClickCat](https://github.com/open-botech/ClickCat) is a friendly user interface that lets you search, explore and visualize your ClickHouse Data.
+
+Features:
+
+-   An online SQL editor which can run your SQL code without installing anything.
+-   You can observe all processes and mutations. For those unfinished processes, you can kill them in the UI.
+-   The Metrics section contains Cluster Analysis, Data Analysis, and Query Analysis.
+
 ## Commercial {#commercial}

 ### DataGrip {#datagrip}
--- a/docs/en/operations/clickhouse-keeper.md
+++ b/docs/en/operations/clickhouse-keeper.md
@ -5,15 +5,15 @@ sidebar_label: ClickHouse Keeper

 # ClickHouse Keeper {#clickHouse-keeper}

-ClickHouse server uses [ZooKeeper](https://zookeeper.apache.org/) coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is an alternative coordination system compatible with ZooKeeper.
+ClickHouse Keeper provides the coordination system for data [replication](../engines/table-engines/mergetree-family/replication.md) and [distributed DDL](../sql-reference/distributed-ddl.md) queries execution. ClickHouse Keeper is compatible with ZooKeeper.

 ## Implementation details {#implementation-details}

-ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, has quite a simple and powerful data model. ZooKeeper's coordination algorithm called ZAB (ZooKeeper Atomic Broadcast) doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper ClickHouse Keeper is written in C++ and uses [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows to have linearizability for reads and writes, has several open-source implementations in different languages.
+ZooKeeper is one of the first well-known open-source coordination systems. It's implemented in Java, and has quite a simple and powerful data model. ZooKeeper's coordination algorithm, ZooKeeper Atomic Broadcast (ZAB), doesn't provide linearizability guarantees for reads, because each ZooKeeper node serves reads locally. Unlike ZooKeeper, ClickHouse Keeper is written in C++ and uses the [RAFT algorithm](https://raft.github.io/) [implementation](https://github.com/eBay/NuRaft). This algorithm allows linearizability for reads and writes, and has several open-source implementations in different languages.

-By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but `clickhouse-keeper-converter` tool allows to convert ZooKeeper data to ClickHouse Keeper snapshot. Interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
+By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have an incompatible format with ZooKeeper, but the `clickhouse-keeper-converter` tool enables the conversion of ZooKeeper data to ClickHouse Keeper snapshots. The interserver protocol in ClickHouse Keeper is also incompatible with ZooKeeper so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.

-ClickHouse Keeper supports Access Control List (ACL) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth`, `digest`, `host` and `ip`. Digest authentication scheme uses pair `username:password`. Password is encoded in Base64.
+ClickHouse Keeper supports Access Control Lists (ACLs) the same way as [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) does. ClickHouse Keeper supports the same set of permissions and has the identical built-in schemes: `world`, `auth` and `digest`. The digest authentication scheme uses the pair `username:password`, the password is encoded in Base64.

 :::note
 External integrations are not supported.
@ -21,25 +21,25 @@ External integrations are not supported.

 ## Configuration {#configuration}

-ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server, but in both cases configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:
+ClickHouse Keeper can be used as a standalone replacement for ZooKeeper or as an internal part of the ClickHouse server. In both cases the configuration is almost the same `.xml` file. The main ClickHouse Keeper configuration tag is `<keeper_server>`. Keeper configuration has the following parameters:

 -    `tcp_port` — Port for a client to connect (default for ZooKeeper is `2181`).
 -    `tcp_port_secure` — Secure port for an SSL connection between client and keeper-server.
 -    `server_id` — Unique server id, each participant of the ClickHouse Keeper cluster must have a unique number (1, 2, 3, and so on).
-    `log_storage_path` — Path to coordination logs, better to store logs on the non-busy device (same for ZooKeeper).
+-    `log_storage_path` — Path to coordination logs, just like ZooKeeper it is best to store logs on non-busy nodes.
 -    `snapshot_storage_path` — Path to coordination snapshots.

 Other common parameters are inherited from the ClickHouse server config (`listen_host`, `logger`, and so on).

-Internal coordination settings are located in `<keeper_server>.<coordination_settings>` section:
+Internal coordination settings are located in the `<keeper_server>.<coordination_settings>` section:

 -    `operation_timeout_ms` — Timeout for a single client operation (ms) (default: 10000).
 -    `min_session_timeout_ms` — Min timeout for client session (ms) (default: 10000).
 -    `session_timeout_ms` — Max timeout for client session (ms) (default: 100000).
-    `dead_session_check_period_ms` — How often ClickHouse Keeper check dead sessions and remove them (ms) (default: 500).
+-    `dead_session_check_period_ms` — How often ClickHouse Keeper checks for dead sessions and removes them (ms) (default: 500).
 -    `heart_beat_interval_ms` — How often a ClickHouse Keeper leader will send heartbeats to followers (ms) (default: 500).
-    `election_timeout_lower_bound_ms` — If the follower didn't receive heartbeats from the leader in this interval, then it can initiate leader election (default: 1000).
-    `election_timeout_upper_bound_ms` — If the follower didn't receive heartbeats from the leader in this interval, then it must initiate leader election (default: 2000).
+-    `election_timeout_lower_bound_ms` — If the follower does not receive a heartbeat from the leader in this interval, then it can initiate leader election (default: 1000).
+-    `election_timeout_upper_bound_ms` — If the follower does not receive a heartbeat from the leader in this interval, then it must initiate leader election (default: 2000).
 -    `rotate_log_storage_interval` — How many log records to store in a single file (default: 100000).
 -    `reserved_log_items` — How many coordination log records to store before compaction (default: 100000).
 -    `snapshot_distance` — How often ClickHouse Keeper will create new snapshots (in the number of records in logs) (default: 100000).
@ -55,7 +55,7 @@ Internal coordination settings are located in `<keeper_server>.<coordination_set
 -    `startup_timeout` — If the server doesn't connect to other quorum participants in the specified timeout it will terminate (ms) (default: 30000).
 -    `four_letter_word_white_list` — White list of 4lw commands (default: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro").

-Quorum configuration is located in `<keeper_server>.<raft_configuration>` section and contain servers description.
+Quorum configuration is located in the `<keeper_server>.<raft_configuration>` section and contains server descriptions.

 The only parameter for the whole quorum is `secure`, which enables encrypted connection for communication between quorum participants. The parameter can be set `true` if SSL connection is required for internal communication between nodes, or left unspecified otherwise.

@ -66,7 +66,7 @@ The main parameters for each `<server>` are:
 -    `port` — Port where this server listens for connections.

 :::note
-In the case of a change in the topology of your ClickHouse Keeper cluster (eg. replacing a server), please make sure to keep the mapping `server_id` to `hostname` consistent and avoid shuffling or reusing an existing `server_id` for different servers (eg. it can happen if your rely on automation scripts to deploy ClickHouse Keeper)
+In the case of a change in the topology of your ClickHouse Keeper cluster (e.g., replacing a server), please make sure to keep the mapping of `server_id` to `hostname` consistent and avoid shuffling or reusing an existing `server_id` for different servers (e.g., it can happen if you rely on automation scripts to deploy ClickHouse Keeper).
 :::

 Examples of configuration for quorum with three nodes can be found in [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with `test_keeper_` prefix. Example configuration for server #1:
@ -112,7 +112,7 @@ ClickHouse Keeper is bundled into the ClickHouse server package, just add config
 clickhouse-keeper --config /etc/your_path_to_config/config.xml
 ```

-If you don't have the symlink (`clickhouse-keeper`) you can create it or specify `keeper` as argument:
+If you don't have the symlink (`clickhouse-keeper`) you can create it or specify `keeper` as an argument to `clickhouse`:

 ```bash
 clickhouse keeper --config /etc/your_path_to_config/config.xml
--- a/docs/en/operations/configuration-files.md
+++ b/docs/en/operations/configuration-files.md
@ -34,7 +34,7 @@ You can also declare attributes as coming from environment variables by using `f

 The config can also define “substitutions”. If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-include_from) element in the server config. The substitution values are specified in `/clickhouse/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](../operations/server-configuration-parameters/settings.md#macros)).

-If you want to replace an entire element with a substitution use `include` as element name.
+If you want to replace an entire element with a substitution use `include` as the element name.

 XML substitution example:

--- a/docs/en/operations/settings/merge-tree-settings.md
+++ b/docs/en/operations/settings/merge-tree-settings.md
@ -114,7 +114,7 @@ A large number of parts in a table reduces performance of ClickHouse queries and

 ## replicated_deduplication_window {#replicated-deduplication-window}

-The number of most recently inserted blocks for which Zookeeper stores hash sums to check for duplicates.
+The number of most recently inserted blocks for which ClickHouse Keeper stores hash sums to check for duplicates.

 Possible values:

@ -123,7 +123,7 @@ Possible values:

 Default value: 100.

-The `Insert` command creates one or more blocks (parts). When inserting into Replicated tables, ClickHouse for [insert deduplication](../../engines/table-engines/mergetree-family/replication/) writes the hash sums of the created parts into Zookeeper. Hash sums are stored only for the most recent `replicated_deduplication_window` blocks. The oldest hash sums are removed from Zookeeper.
+The `Insert` command creates one or more blocks (parts). For [insert deduplication](../../engines/table-engines/mergetree-family/replication/), when writing into replicated tables, ClickHouse writes the hash sums of the created parts into ClickHouse Keeper. Hash sums are stored only for the most recent `replicated_deduplication_window` blocks. The oldest hash sums are removed from ClickHouse Keeper.
 A large number of `replicated_deduplication_window` slows down `Inserts` because it needs to compare more entries.
 The hash sum is calculated from the composition of the field names and types and the data of the inserted part (stream of bytes).

@ -142,7 +142,7 @@ A deduplication mechanism is used, similar to replicated tables (see [replicated

 ## replicated_deduplication_window_seconds {#replicated-deduplication-window-seconds}

-The number of seconds after which the hash sums of the inserted blocks are removed from Zookeeper.
+The number of seconds after which the hash sums of the inserted blocks are removed from ClickHouse Keeper.

 Possible values:

@ -150,7 +150,7 @@ Possible values:

 Default value: 604800 (1 week).

-Similar to [replicated_deduplication_window](#replicated-deduplication-window), `replicated_deduplication_window_seconds` specifies how long to store hash sums of blocks for insert deduplication. Hash sums older than `replicated_deduplication_window_seconds` are removed from Zookeeper, even if they are less than ` replicated_deduplication_window`.
+Similar to [replicated_deduplication_window](#replicated-deduplication-window), `replicated_deduplication_window_seconds` specifies how long to store hash sums of blocks for insert deduplication. Hash sums older than `replicated_deduplication_window_seconds` are removed from ClickHouse Keeper, even if their number is less than `replicated_deduplication_window`.

 ## replicated_fetches_http_connection_timeout {#replicated_fetches_http_connection_timeout}

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1838,7 +1838,7 @@ Usage

 By default, deduplication is not performed for materialized views but is done upstream, in the source table.
 If an INSERTed block is skipped due to deduplication in the source table, there will be no insertion into attached materialized views. This behaviour exists to enable the insertion of highly aggregated data into materialized views, for cases where inserted blocks are the same after materialized view aggregation but derived from different INSERTs into the source table.
-At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with Zookeeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. On retry, a materialized view will receive the repeat insert and will perform a deduplication check by itself,
+At the same time, this behaviour “breaks” `INSERT` idempotency. If an `INSERT` into the main table was successful and `INSERT` into a materialized view failed (e.g. because of communication failure with ClickHouse Keeper) a client will get an error and can retry the operation. However, the materialized view won’t receive the second insert because it will be discarded by deduplication in the main (source) table. The setting `deduplicate_blocks_in_dependent_materialized_views` allows for changing this behaviour. On retry, a materialized view will receive the repeat insert and will perform a deduplication check by itself,
 ignoring check result for the source table, and will insert rows lost because of the first failure.

 ## insert_deduplication_token {#insert_deduplication_token}
@ -2459,7 +2459,7 @@ Default value: 0.

 ## merge_selecting_sleep_ms {#merge_selecting_sleep_ms}

-Sleep time for merge selecting when no part is selected. A lower setting triggers selecting tasks in `background_schedule_pool` frequently, which results in a large number of requests to Zookeeper in large-scale clusters.
+Sleep time for merge selecting when no part is selected. A lower setting triggers selecting tasks in `background_schedule_pool` frequently, which results in a large number of requests to ClickHouse Keeper in large-scale clusters.

 Possible values:

@ -2607,7 +2607,7 @@ Default value: 128.

 ## background_fetches_pool_size {#background_fetches_pool_size}

-Sets the number of threads performing background fetches for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. For production usage with frequent small insertions or slow ZooKeeper cluster is recommended to use default value.
+Sets the number of threads performing background fetches for [replicated](../../engines/table-engines/mergetree-family/replication.md) tables. This setting is applied at the ClickHouse server start and can’t be changed in a user session. For production usage with frequent small insertions or a slow ZooKeeper cluster, it is recommended to use the default value.

 Possible values:

--- a/docs/en/operations/system-tables/distributed_ddl_queue.md
+++ b/docs/en/operations/system-tables/distributed_ddl_queue.md
@ -15,7 +15,7 @@ Columns:
 -   `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query start time.
 -   `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query finish time.
 -   `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration of query execution (in milliseconds).
-   `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ZooKeeper](../../operations/tips.md#zookeeper).
+-   `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ClickHouse Keeper](../../operations/tips.md#zookeeper).

 **Example**

--- a/docs/en/operations/system-tables/mutations.md
+++ b/docs/en/operations/system-tables/mutations.md
@ -8,7 +8,7 @@ Columns:

 -   `table` ([String](../../sql-reference/data-types/string.md)) — The name of the table to which the mutation was applied.

-   `mutation_id` ([String](../../sql-reference/data-types/string.md)) — The ID of the mutation. For replicated tables these IDs correspond to znode names in the `<table_path_in_zookeeper>/mutations/` directory in ZooKeeper. For non-replicated tables the IDs correspond to file names in the data directory of the table.
+-   `mutation_id` ([String](../../sql-reference/data-types/string.md)) — The ID of the mutation. For replicated tables these IDs correspond to znode names in the `<table_path_in_clickhouse_keeper>/mutations/` directory in ClickHouse Keeper. For non-replicated tables the IDs correspond to file names in the data directory of the table.

 -   `command` ([String](../../sql-reference/data-types/string.md)) — The mutation command string (the part of the query after `ALTER TABLE [db.]table`).

--- a/docs/en/operations/system-tables/replicas.md
+++ b/docs/en/operations/system-tables/replicas.md
@ -62,13 +62,13 @@ Columns:
    Note that writes can be performed to any replica that is available and has a session in ZK, regardless of whether it is a leader.
 -   `can_become_leader` (`UInt8`) - Whether the replica can be a leader.
 -   `is_readonly` (`UInt8`) - Whether the replica is in read-only mode.
-    This mode is turned on if the config does not have sections with ZooKeeper, if an unknown error occurred when reinitializing sessions in ZooKeeper, and during session reinitialization in ZooKeeper.
-   `is_session_expired` (`UInt8`) - the session with ZooKeeper has expired. Basically the same as `is_readonly`.
+    This mode is turned on if the config does not have sections with ClickHouse Keeper, if an unknown error occurred when reinitializing sessions in ClickHouse Keeper, and during session reinitialization in ClickHouse Keeper.
+-   `is_session_expired` (`UInt8`) - the session with ClickHouse Keeper has expired. Basically the same as `is_readonly`.
 -   `future_parts` (`UInt32`) - The number of data parts that will appear as the result of INSERTs or merges that haven’t been done yet.
 -   `parts_to_check` (`UInt32`) - The number of data parts in the queue for verification. A part is put in the verification queue if there is suspicion that it might be damaged.
-   `zookeeper_path` (`String`) - Path to table data in ZooKeeper.
-   `replica_name` (`String`) - Replica name in ZooKeeper. Different replicas of the same table have different names.
-   `replica_path` (`String`) - Path to replica data in ZooKeeper. The same as concatenating ‘zookeeper_path/replicas/replica_path’.
+-   `zookeeper_path` (`String`) - Path to table data in ClickHouse Keeper.
+-   `replica_name` (`String`) - Replica name in ClickHouse Keeper. Different replicas of the same table have different names.
+-   `replica_path` (`String`) - Path to replica data in ClickHouse Keeper. The same as concatenating ‘zookeeper_path/replicas/replica_path’.
 -   `columns_version` (`Int32`) - Version number of the table structure. Indicates how many times ALTER was performed. If replicas have different versions, it means some replicas haven’t made all of the ALTERs yet.
 -   `queue_size` (`UInt32`) - Size of the queue for operations waiting to be performed. Operations include inserting blocks of data, merges, and certain other actions. It usually coincides with `future_parts`.
 -   `inserts_in_queue` (`UInt32`) - Number of inserts of blocks of data that need to be made. Insertions are usually replicated fairly quickly. If this number is large, it means something is wrong.
@ -86,12 +86,12 @@ The next 4 columns have a non-zero value only where there is an active session w
 -   `last_queue_update` (`DateTime`) - When the queue was updated last time.
 -   `absolute_delay` (`UInt64`) - How big lag in seconds the current replica has.
 -   `total_replicas` (`UInt8`) - The total number of known replicas of this table.
-   `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ZooKeeper (i.e., the number of functioning replicas).
+-   `active_replicas` (`UInt8`) - The number of replicas of this table that have a session in ClickHouse Keeper (i.e., the number of functioning replicas).
 -   `last_queue_update_exception` (`String`) - When the queue contains broken entries. Especially important when ClickHouse breaks backward compatibility between versions and log entries written by newer versions aren't parseable by old versions.
-   `zookeeper_exception` (`String`) - The last exception message, got if the error happened when fetching the info from ZooKeeper. 
+-   `zookeeper_exception` (`String`) - The last exception message, got if the error happened when fetching the info from ClickHouse Keeper. 
 -   `replica_is_active` ([Map(String, UInt8)](../../sql-reference/data-types/map.md)) — Map between replica name and is replica active.

-If you request all the columns, the table may work a bit slowly, since several reads from ZooKeeper are made for each row.
+If you request all the columns, the table may work a bit slowly, since several reads from ClickHouse Keeper are made for each row.
 If you do not request the last 4 columns (log_max_index, log_pointer, total_replicas, active_replicas), the table works quickly.

 For example, you can check that everything is working correctly like this:
--- a/docs/en/operations/system-tables/replication_queue.md
+++ b/docs/en/operations/system-tables/replication_queue.md
@ -1,6 +1,6 @@
 # replication_queue {#system_tables-replication_queue}

-Contains information about tasks from replication queues stored in ZooKeeper for tables in the `ReplicatedMergeTree` family.
+Contains information about tasks from replication queues stored in ClickHouse Keeper, or ZooKeeper, for tables in the `ReplicatedMergeTree` family.

 Columns:

@ -8,11 +8,11 @@ Columns:

 -   `table` ([String](../../sql-reference/data-types/string.md)) — Name of the table.

-   `replica_name` ([String](../../sql-reference/data-types/string.md)) — Replica name in ZooKeeper. Different replicas of the same table have different names.
+-   `replica_name` ([String](../../sql-reference/data-types/string.md)) — Replica name in ClickHouse Keeper. Different replicas of the same table have different names.

 -   `position` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Position of the task in the queue.

-   `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ZooKeeper.
+-   `node_name` ([String](../../sql-reference/data-types/string.md)) — Node name in ClickHouse Keeper.

 -   `type` ([String](../../sql-reference/data-types/string.md)) — Type of the task in the queue, one of:

--- a/docs/en/operations/system-tables/zookeeper.md
+++ b/docs/en/operations/system-tables/zookeeper.md
@ -1,7 +1,7 @@
 # zookeeper {#system-zookeeper}

-The table does not exist if ZooKeeper is not configured. Allows reading data from the ZooKeeper cluster defined in the config.
-The query must either have a ‘path =’   condition or a `path IN`  condition set with the `WHERE` clause as shown below. This corresponds to the path of the children in ZooKeeper that you want to get data for.
+The table does not exist unless ClickHouse Keeper or ZooKeeper is configured. The `system.zookeeper` table exposes data from the Keeper cluster defined in the config.
+The query must either have a ‘path =’   condition or a `path IN`  condition set with the `WHERE` clause as shown below. This corresponds to the path of the children that you want to get data for.

 The query `SELECT * FROM system.zookeeper WHERE path = '/clickhouse'` outputs data for all children on the `/clickhouse` node.
 To output data for all root nodes, write path = ‘/’.
@ -9,7 +9,7 @@ If the path specified in ‘path’ does not exist, an exception will be thrown.

 The query `SELECT * FROM system.zookeeper WHERE path IN ('/', '/clickhouse')` outputs data for all children on the `/` and `/clickhouse` node.
 If any path in the specified ‘path’ collection does not exist, an exception will be thrown.
-It can be used to do a batch of ZooKeeper path queries.
+It can be used to do a batch of Keeper path queries.

 Columns:

--- a/docs/en/operations/tips.md
+++ b/docs/en/operations/tips.md
@ -118,17 +118,17 @@ in XML configuration.
 This is important for ClickHouse to be able to get correct information with `cpuid` instruction.
 Otherwise you may get `Illegal instruction` crashes when hypervisor is run on old CPU models.

-## ZooKeeper {#zookeeper}
+## ClickHouse Keeper and ZooKeeper {#zookeeper}

-You are probably already using ZooKeeper for other purposes. You can use the same installation of ZooKeeper, if it isn’t already overloaded.
+ClickHouse Keeper is recommended to replace ZooKeeper for ClickHouse clusters. See the documentation for [ClickHouse Keeper](clickhouse-keeper.md).

-It’s best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated.
+If you would like to continue using ZooKeeper then it is best to use a fresh version of ZooKeeper – 3.4.9 or later. The version in stable Linux distributions may be outdated.

 You should never use manually written scripts to transfer data between different ZooKeeper clusters, because the result will be incorrect for sequential nodes. Never use the “zkcopy” utility for the same reason: https://github.com/ksprojects/zkcopy/issues/15

 If you want to divide an existing ZooKeeper cluster into two, the correct way is to increase the number of its replicas and then reconfigure it as two independent clusters.

-Do not run ZooKeeper on the same servers as ClickHouse. Because ZooKeeper is very sensitive for latency and ClickHouse may utilize all available system resources.
+You can run ClickHouse Keeper on the same server as ClickHouse, but do not run ZooKeeper on the same servers as ClickHouse, because ZooKeeper is very sensitive to latency and ClickHouse may utilize all available system resources.

 You can have ZooKeeper observers in an ensemble but ClickHouse servers should not interact with observers.

--- a/docs/en/operations/utilities/clickhouse-copier.md
+++ b/docs/en/operations/utilities/clickhouse-copier.md
@ -11,11 +11,11 @@ Copies data from the tables in one cluster to tables in another (or the same) cl
 To get a consistent copy, the data in the source tables and partitions should not change during the entire process.
 :::

-You can run multiple `clickhouse-copier` instances on different servers to perform the same job. ZooKeeper is used for syncing the processes.
+You can run multiple `clickhouse-copier` instances on different servers to perform the same job. ClickHouse Keeper, or ZooKeeper, is used for syncing the processes.

 After starting, `clickhouse-copier`:

-   Connects to ZooKeeper and receives:
+-   Connects to ClickHouse Keeper and receives:

    -   Copying jobs.
    -   The state of the copying jobs.
@ -24,7 +24,7 @@ After starting, `clickhouse-copier`:

    Each running process chooses the “closest” shard of the source cluster and copies the data into the destination cluster, resharding the data if necessary.

-`clickhouse-copier` tracks the changes in ZooKeeper and applies them on the fly.
+`clickhouse-copier` tracks the changes in ClickHouse Keeper and applies them on the fly.

 To reduce network traffic, we recommend running `clickhouse-copier` on the same server where the source data is located.

@ -33,19 +33,19 @@ To reduce network traffic, we recommend running `clickhouse-copier` on the same
 The utility should be run manually:

 ``` bash
-$ clickhouse-copier --daemon --config zookeeper.xml --task-path /task/path --base-dir /path/to/dir
+$ clickhouse-copier --daemon --config keeper.xml --task-path /task/path --base-dir /path/to/dir
 ```

 Parameters:

 -   `daemon` — Starts `clickhouse-copier` in daemon mode.
-   `config` — The path to the `zookeeper.xml` file with the parameters for the connection to ZooKeeper.
-   `task-path` — The path to the ZooKeeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`.
-   `task-file` — Optional path to file with task configuration for initial upload to ZooKeeper.
+-   `config` — The path to the `keeper.xml` file with the parameters for the connection to ClickHouse Keeper.
+-   `task-path` — The path to the ClickHouse Keeper node. This node is used for syncing `clickhouse-copier` processes and storing tasks. Tasks are stored in `$task-path/description`.
+-   `task-file` — Optional path to file with task configuration for initial upload to ClickHouse Keeper.
 -   `task-upload-force` — Force upload `task-file` even if node already exists.
 -   `base-dir` — The path to logs and auxiliary files. When it starts, `clickhouse-copier` creates `clickhouse-copier_YYYYMMHHSS_<PID>` subdirectories in `$base-dir`. If this parameter is omitted, the directories are created in the directory where `clickhouse-copier` was launched.

-## Format of Zookeeper.xml {#format-of-zookeeper-xml}
+## Format of keeper.xml {#format-of-zookeeper-xml}

 ``` xml
 <clickhouse>
--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@ -21,7 +21,7 @@ ClickHouse generates an exception for errors with dictionaries. Examples of erro
 -   The dictionary being accessed could not be loaded.
 -   Error querying a `cached` dictionary.

-You can view the list of external dictionaries and their statuses in the `system.dictionaries` table.
+You can view the list of external dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table.

 The configuration looks like this:

@ -48,6 +48,35 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings
 ...
 ```

+Dictionaries without the word `complex-key*` in the layout name have a key of the [UInt64](../../../sql-reference/data-types/int-uint.md) type; `complex-key*` dictionaries have a composite key (complex, with arbitrary types).
+
+[UInt64](../../../sql-reference/data-types/int-uint.md) keys in XML dictionaries are defined with the `<id>` tag.
+
+Configuration example (column key_column has UInt64 type):
+```xml
+...
+<structure>
+    <id>
+        <name>key_column</name>
+    </id>
+...
+```
+
+Composite `complex` keys in XML dictionaries are defined with the `<key>` tag.
+
+Configuration example of a composite key (key has one element with [String](../../../sql-reference/data-types/string.md) type):
+```xml
+...
+<structure>
+    <key>
+        <attribute>
+            <name>country_code</name>
+            <type>String</type>
+        </attribute>
+    </key>
+...
+```
+
 ## Ways to Store Dictionaries in Memory {#ways-to-store-dictionaries-in-memory}

 -   [flat](#flat)
@ -98,6 +127,8 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))

 The dictionary is completely stored in memory in the form of a hash table. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
+
 If `preallocate` is `true` (default is `false`) the hash table will be preallocated (this will make the dictionary load faster). But note that you should use it only if:

 - The source supports an approximate number of elements (for now it is supported only by the `ClickHouse` source).
@ -125,6 +156,8 @@ LAYOUT(HASHED(PREALLOCATE 0))

 Similar to `hashed`, but uses less memory in favor of more CPU usage.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
+
 It will be also preallocated so as `hashed` (with `preallocate` set to `true`), and note that it is even more significant for `sparse_hashed`.

 Configuration example:
@ -181,6 +214,8 @@ LAYOUT(COMPLEX_KEY_SPARSE_HASHED())

 The dictionary is completely stored in memory. Each attribute is stored in an array. The key attribute is stored in the form of a hashed table where value is an index in the attributes array. The dictionary can contain any number of elements with any identifiers. In practice, the number of keys can reach tens of millions of items.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
+
 All types of sources are supported. When updating, data (from a file or from a table) is read in its entirety.

 Configuration example:
@ -220,6 +255,7 @@ LAYOUT(COMPLEX_KEY_HASHED_ARRAY())

 The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
 This storage method works the same way as hashed and allows using date/time (arbitrary numeric type) ranges in addition to the key.

 Example: The table contains discounts for each advertiser in the format:
@ -360,6 +396,8 @@ RANGE(MIN StartDate MAX EndDate);

 The dictionary is stored in a cache that has a fixed number of cells. These cells contain frequently used elements.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
+
 When searching for a dictionary, the cache is searched first. For each block of data, all keys that are not found in the cache or are outdated are requested from the source using `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. The received data is then written to the cache.

 If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`.
@ -420,6 +458,8 @@ This type of storage is for use with composite [keys](../../../sql-reference/dic

 Similar to `cache`, but stores data on SSD and index in RAM. All cache dictionary settings related to update queue can also be applied to SSD cache dictionaries.

+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.
+
 ``` xml
 <layout>
    <ssd_cache>
@ -452,7 +492,7 @@ This type of storage is for use with composite [keys](../../../sql-reference/dic

 The dictionary is not stored in memory and directly goes to the source during the processing of a request.

-The dictionary key has the `UInt64` type.
+The dictionary key has the [UInt64](../../../sql-reference/data-types/int-uint.md) type.

 All types of [sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), except local files, are supported.

--- a/docs/en/sql-reference/functions/geo/h3.md
+++ b/docs/en/sql-reference/functions/geo/h3.md
@ -1141,4 +1141,261 @@ Result:
 │ [590080815153545215,590080471556161535,590080677714591743,590077585338138623,590077447899185151,590079509483487231] │
 └─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
 ```
-[Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) <!--hide-->
+
+## h3GetUnidirectionalEdge {#h3getunidirectionaledge}
+
+Returns a unidirectional edge H3 index based on the provided origin and destination and returns 0 on error.
+
+**Syntax**
+
+``` sql
+h3GetUnidirectionalEdge(originIndex, destinationIndex)
+```
+
+**Parameters**
+
+-   `originIndex` — Origin Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+-   `destinationIndex` — Destination Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-  Unidirectional Edge Hexagon Index number.
+
+Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetUnidirectionalEdge(599686042433355775, 599686043507097599) as edge;
+```
+
+Result:
+
+``` text
+┌────────────────edge─┐
+│ 1248204388774707199 │
+└─────────────────────┘
+```
+
+## h3UnidirectionalEdgeIsValid {#h3unidirectionaledgeisvalid}
+
+Determines if the provided H3Index is a valid unidirectional edge index. Returns 1 if it's a unidirectional edge and 0 otherwise.
+
+**Syntax**
+
+``` sql
+h3UnidirectionalEdgeIsValid(index)
+```
+
+**Parameter**
+
+-   `index` — Hexagon index number. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-   1 — The H3 index is a valid unidirectional edge.
+-   0 — The H3 index is not a valid unidirectional edge.
+
+Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3UnidirectionalEdgeIsValid(1248204388774707199) as validOrNot;
+```
+
+Result:
+
+``` text
+┌─validOrNot─┐
+│          1 │
+└────────────┘
+```
+
+## h3GetOriginIndexFromUnidirectionalEdge {#h3getoriginindexfromunidirectionaledge}
+
+Returns the origin hexagon index from the unidirectional edge H3Index.
+
+**Syntax**
+
+``` sql
+h3GetOriginIndexFromUnidirectionalEdge(edge)
+```
+
+**Parameter**
+
+-   `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-  Origin Hexagon Index number.
+
+Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetOriginIndexFromUnidirectionalEdge(1248204388774707197) as origin;
+```
+
+Result:
+
+``` text
+┌─────────────origin─┐
+│ 599686042433355773 │
+└────────────────────┘
+```
+
+## h3GetDestinationIndexFromUnidirectionalEdge {#h3getdestinationindexfromunidirectionaledge}
+
+Returns the destination hexagon index from the unidirectional edge H3Index.
+
+**Syntax**
+
+``` sql
+h3GetDestinationIndexFromUnidirectionalEdge(edge)
+```
+
+**Parameter**
+
+-   `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-  Destination Hexagon Index number.
+
+Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetDestinationIndexFromUnidirectionalEdge(1248204388774707197) as destination;
+```
+
+Result:
+
+``` text
+┌────────destination─┐
+│ 599686043507097597 │
+└────────────────────┘
+```
+
+## h3GetIndexesFromUnidirectionalEdge {#h3getindexesfromunidirectionaledge}
+
+Returns the origin and destination hexagon indexes from the given unidirectional edge H3Index.
+
+**Syntax**
+
+``` sql
+h3GetIndexesFromUnidirectionalEdge(edge)
+```
+
+**Parameter**
+
+-   `edge` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+A tuple consisting of two values `tuple(origin,destination)`:
+
+- `origin` — Origin Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `destination` — Destination Hexagon index number. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+Returns `(0,0)` if the provided input is not valid.
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetIndexesFromUnidirectionalEdge(1248204388774707199) as indexes;
+```
+
+Result:
+
+``` text
+┌─indexes─────────────────────────────────┐
+│ (599686042433355775,599686043507097599) │
+└─────────────────────────────────────────┘
+```
+
+## h3GetUnidirectionalEdgesFromHexagon {#h3getunidirectionaledgesfromhexagon}
+
+Provides all of the unidirectional edges from the provided H3Index.
+
+**Syntax**
+
+``` sql
+h3GetUnidirectionalEdgesFromHexagon(index)
+```
+
+**Parameter**
+
+-   `index` — Hexagon index number whose unidirectional edges are returned. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+Array of H3 indexes representing each unidirectional edge.
+
+Type: [Array](../../../sql-reference/data-types/array.md)([UInt64](../../../sql-reference/data-types/int-uint.md)).
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetUnidirectionalEdgesFromHexagon(1248204388774707199) as edges;
+```
+
+Result:
+
+``` text
+┌─edges─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
+│ [1248204388774707199,1320261982812635135,1392319576850563071,1464377170888491007,1536434764926418943,1608492358964346879] │
+└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+## h3GetUnidirectionalEdgeBoundary {#h3getunidirectionaledgeboundary}
+
+Returns the coordinates defining the unidirectional edge.
+
+**Syntax**
+
+``` sql
+h3GetUnidirectionalEdgeBoundary(index)
+```
+
+**Parameter**
+
+-   `index` — Hexagon index number that represents a unidirectional edge. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-    Array of pairs '(lat, lon)'.
+     Type: [Array](../../../sql-reference/data-types/array.md)([Tuple](../../../sql-reference/data-types/tuple.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md))).
+
+
+**Example**
+
+Query:
+
+``` sql
+ SELECT h3GetUnidirectionalEdgeBoundary(1248204388774707199) as boundary;
+```
+
+Result:
+
+``` text
+┌─boundary────────────────────────────────────────────────────────────────────────┐
+│ [(37.42012867767779,-122.03773496427027),(37.33755608435299,-122.090428929044)] │
+└─────────────────────────────────────────────────────────────────────────────────┘
+```
+[Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) <!--hide-->
--- a/docs/ru/development/contrib.md
+++ b/docs/ru/development/contrib.md
@ -92,7 +92,7 @@ sidebar_label: "Используемые сторонние библиотеки
 SELECT library_name, license_type, license_path FROM system.licenses ORDER BY library_name COLLATE 'en';
 ```

-[Пример](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)
+[Пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIGxpYnJhcnlfbmFtZSwgbGljZW5zZV90eXBlLCBsaWNlbnNlX3BhdGggRlJPTSBzeXN0ZW0ubGljZW5zZXMgT1JERVIgQlkgbGlicmFyeV9uYW1lIENPTExBVEUgJ2VuJw==)

 ## Рекомендации по добавлению сторонних библиотек и поддержанию в них пользовательских изменений {#adding-third-party-libraries}

--- a/docs/ru/getting-started/example-datasets/brown-benchmark.md
+++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md
@ -411,5 +411,4 @@ ORDER BY yr,
         mo;
 ```

-Данные также доступны для работы с интерактивными запросами через [Playground](https://gh-api.clickhouse.com/play?user=play), [пример](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
-
+Данные также доступны для работы с интерактивными запросами через [Playground](https://play.clickhouse.com/play?user=play), [пример](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==).
--- a/docs/ru/getting-started/example-datasets/cell-towers.md
+++ b/docs/ru/getting-started/example-datasets/cell-towers.md
@ -125,4 +125,4 @@ SELECT count() FROM cell_towers WHERE pointInPolygon((lon, lat), (SELECT * FROM
 1 rows in set. Elapsed: 0.067 sec. Processed 43.28 million rows, 692.42 MB (645.83 million rows/s., 10.33 GB/s.)
 ```

-Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://gh-api.clickhouse.com/play?user=play). Например, [вот так](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы.
+Вы можете протестировать другие запросы с помощью интерактивного ресурса [Playground](https://play.clickhouse.com/play?user=play). Например, [вот так](https://play.clickhouse.com/play?user=play#U0VMRUNUIG1jYywgY291bnQoKSBGUk9NIGNlbGxfdG93ZXJzIEdST1VQIEJZIG1jYyBPUkRFUiBCWSBjb3VudCgpIERFU0M=). Однако, обратите внимание, что здесь нельзя создавать временные таблицы.
--- a/docs/ru/getting-started/example-datasets/recipes.md
+++ b/docs/ru/getting-started/example-datasets/recipes.md
@ -337,6 +337,6 @@ WHERE title = 'Chocolate-Strawberry-Orange Wedding Cake';

 ### Online Playground

-Этот набор данных доступен в [Online Playground](https://gh-api.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).
+Этот набор данных доступен в [Online Playground](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhcnJheUpvaW4oTkVSKSBBUyBrLAogICAgY291bnQoKSBBUyBjCkZST00gcmVjaXBlcwpHUk9VUCBCWSBrCk9SREVSIEJZIGMgREVTQwpMSU1JVCA1MA==).

 [Оригинальная статья](https://clickhouse.com/docs/ru/getting-started/example-datasets/recipes/) <!--hide-->
--- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@ -21,7 +21,7 @@ sidebar_label: "Хранение словарей в памяти"
 -   При обращении к словарю, который не удалось загрузить.
 -   При ошибке запроса к `cached`-словарю.

-Список внешних словарей и их статус можно посмотреть в таблице `system.dictionaries`.
+Список внешних словарей и их статус можно посмотреть в таблице [system.dictionaries](../../../operations/system-tables/dictionaries.md).

 Общий вид конфигурации:

@ -48,6 +48,36 @@ LAYOUT(LAYOUT_TYPE(param value)) -- layout settings
 ...
 ```

+Ключ словарей, не имеющих слово `complex-key*` в названии, имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md); `complex-key*` словари позволяют использовать произвольный тип ключа (составной, в том числе из разных типов).
+
+[UInt64](../../../sql-reference/data-types/int-uint.md) ключи в XML словарях задаются тегом `<id>`.
+
+Пример конфигурации (поле key_column имеет тип UInt64):
+```xml
+...
+<structure>
+    <id>
+        <name>key_column</name>
+    </id>
+...
+```
+
+Составные `complex` ключи в XML словарях задаются тегом `<key>`.
+
+Пример конфигурации составного ключа (ключ состоит из одного элемента с типом [String](../../../sql-reference/data-types/string.md)):
+```xml
+...
+<structure>
+    <key>
+        <attribute>
+            <name>country_code</name>
+            <type>String</type>
+        </attribute>
+    </key>
+...
+```
+
+
 ## Способы размещения словарей в памяти {#ways-to-store-dictionaries-in-memory}

 -   [flat](#flat)
@ -98,6 +128,8 @@ LAYOUT(FLAT(INITIAL_ARRAY_SIZE 50000 MAX_ARRAY_SIZE 5000000))

 Словарь полностью хранится в оперативной памяти в виде хэш-таблиц. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике количество ключей может достигать десятков миллионов элементов.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
+
 Если `preallocate` имеет значение `true` (по умолчанию `false`), хеш-таблица будет предварительно определена (это ускорит загрузку словаря). Используйте этот метод только в случае, если:

 - Источник поддерживает произвольное количество элементов (пока поддерживается только источником `ClickHouse`).
@ -125,6 +157,8 @@ LAYOUT(HASHED(PREALLOCATE 0))

 Аналогичен `hashed`, но при этом занимает меньше места в памяти и генерирует более высокую загрузку CPU.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
+
 Для этого типа размещения также можно задать `preallocate` в значении `true`. В данном случае это более важно, чем для типа `hashed`.

 Пример конфигурации:
@ -181,6 +215,8 @@ LAYOUT(COMPLEX_KEY_SPARSE_HASHED())

 Словарь полностью хранится в оперативной памяти. Каждый атрибут хранится в массиве. Ключевой атрибут хранится в виде хеш-таблицы, где его значение является индексом в массиве атрибутов. Словарь может содержать произвольное количество элементов с произвольными идентификаторами. На практике количество ключей может достигать десятков миллионов элементов.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
+
 Поддерживаются все виды источников. При обновлении данные (из файла, из таблицы) считываются целиком.

 Пример конфигурации:
@ -220,6 +256,7 @@ LAYOUT(COMPLEX_KEY_HASHED_ARRAY())

 Словарь хранится в оперативной памяти в виде хэш-таблицы с упорядоченным массивом диапазонов и соответствующих им значений.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
 Этот способ размещения работает также как и hashed и позволяет дополнительно к ключу использовать дипазоны по дате/времени (произвольному числовому типу).

 Пример: таблица содержит скидки для каждого рекламодателя в виде:
@ -355,6 +392,8 @@ RANGE(MIN StartDate MAX EndDate);

 Словарь хранится в кэше, состоящем из фиксированного количества ячеек. Ячейки содержат часто используемые элементы.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
+
 При поиске в словаре сначала просматривается кэш. На каждый блок данных, все не найденные в кэше или устаревшие ключи запрашиваются у источника с помощью `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Затем, полученные данные записываются в кэш.

 Если ключи не были найдены в словаре, то для обновления кэша создается задание и добавляется в очередь обновлений. Параметры очереди обновлений можно устанавливать настройками `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`
@ -414,6 +453,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000))

 Похож на `cache`, но хранит данные на SSD, а индекс в оперативной памяти. Все параметры, относящиеся к очереди обновлений, могут также быть применены к SSD-кэш словарям.

+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).
+
 ``` xml
 <layout>
    <ssd_cache>
@ -446,7 +487,7 @@ LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576

 Словарь не хранит данные локально и взаимодействует с источником непосредственно в момент запроса.

-Ключ словаря имеет тип `UInt64`.
+Ключ словаря имеет тип [UInt64](../../../sql-reference/data-types/int-uint.md).

 Поддерживаются все виды [источников](external-dicts-dict-sources.md), кроме локальных файлов.

--- a/programs/CMakeLists.txt
+++ b/programs/CMakeLists.txt
@ -62,6 +62,8 @@ option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_

 option (ENABLE_CLICKHOUSE_KEEPER_CONVERTER "Util allows to convert ZooKeeper logs and snapshots into clickhouse-keeper snapshot" ${ENABLE_CLICKHOUSE_ALL})

+option (ENABLE_CLICKHOUSE_SU "A tool similar to 'su'" ${ENABLE_CLICKHOUSE_ALL})
+
 if (NOT ENABLE_NURAFT)
    # RECONFIGURE_MESSAGE_LEVEL should not be used here,
    # since ENABLE_NURAFT is set to OFF for FreeBSD and Darwin.
@ -237,6 +239,7 @@ add_subdirectory (install)
 add_subdirectory (git-import)
 add_subdirectory (bash-completion)
 add_subdirectory (static-files-disk-uploader)
+add_subdirectory (su)

 if (ENABLE_CLICKHOUSE_KEEPER)
    add_subdirectory (keeper)
@ -269,7 +272,8 @@ if (CLICKHOUSE_ONE_SHARED)
    ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}
    ${CLICKHOUSE_KEEPER_SOURCES}
    ${CLICKHOUSE_KEEPER_CONVERTER_SOURCES}
-    ${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_SOURCES})
+    ${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_SOURCES}
+    ${CLICKHOUSE_SU_SOURCES})

    target_link_libraries(clickhouse-lib
      ${CLICKHOUSE_SERVER_LINK}
@ -285,7 +289,8 @@ if (CLICKHOUSE_ONE_SHARED)
      ${CLICKHOUSE_ODBC_BRIDGE_LINK}
      ${CLICKHOUSE_KEEPER_LINK}
      ${CLICKHOUSE_KEEPER_CONVERTER_LINK}
-      ${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_LINK})
+      ${CLICKHOUSE_STATIC_FILES_DISK_UPLOADER_LINK}
+      ${CLICKHOUSE_SU_LINK})

    target_include_directories(clickhouse-lib
      ${CLICKHOUSE_SERVER_INCLUDE}
@ -318,8 +323,7 @@ if (CLICKHOUSE_SPLIT_BINARY)
        clickhouse-obfuscator
        clickhouse-git-import
        clickhouse-copier
-        clickhouse-static-files-disk-uploader
-    )
+        clickhouse-static-files-disk-uploader)

    if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
        list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-odbc-bridge)
@ -387,6 +391,9 @@ else ()
    if (ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER)
        clickhouse_target_link_split_lib(clickhouse static-files-disk-uploader)
    endif ()
+    if (ENABLE_CLICKHOUSE_SU)
+        clickhouse_target_link_split_lib(clickhouse su)
+    endif ()
    if (ENABLE_CLICKHOUSE_KEEPER)
        clickhouse_target_link_split_lib(clickhouse keeper)
    endif()
@ -453,6 +460,11 @@ else ()
        install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-static-files-disk-uploader" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
        list(APPEND CLICKHOUSE_BUNDLE clickhouse-static-files-disk-uploader)
    endif ()
+    if (ENABLE_CLICKHOUSE_SU)
+        add_custom_target (clickhouse-su ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-su DEPENDS clickhouse)
+        install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-su" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
+        list(APPEND CLICKHOUSE_BUNDLE clickhouse-su)
+    endif ()

    if (ENABLE_CLICKHOUSE_KEEPER)
        if (NOT BUILD_STANDALONE_KEEPER AND CREATE_KEEPER_SYMLINK)
--- a/programs/config_tools.h.in
+++ b/programs/config_tools.h.in
@ -19,3 +19,4 @@
 #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER
 #cmakedefine01 ENABLE_CLICKHOUSE_KEEPER_CONVERTER
 #cmakedefine01 ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
+#cmakedefine01 ENABLE_CLICKHOUSE_SU
--- a/programs/install/Install.cpp
+++ b/programs/install/Install.cpp
@ -925,24 +925,7 @@ namespace
            executable.string(), config.string(), pid_file.string());

        if (!user.empty())
-        {
-#if defined(OS_FREEBSD)
-            command = fmt::format("su -m '{}' -c '{}'", user, command);
-#else
-            bool may_need_sudo = geteuid() != 0;
-            if (may_need_sudo)
-            {
-                struct passwd *p = getpwuid(geteuid());
-                // Only use sudo when we are not the given user
-                if (p == nullptr || std::string(p->pw_name) != user)
-                    command = fmt::format("sudo -u '{}' {}", user, command);
-            }
-            else
-            {
-                command = fmt::format("su -s /bin/sh '{}' -c '{}'", user, command);
-            }
-#endif
-        }
+            command = fmt::format("clickhouse su '{}' {}", user, command);

        fmt::print("Will run {}\n", command);
        executeScript(command, true);
--- a/programs/main.cpp
+++ b/programs/main.cpp
@ -65,6 +65,9 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv);
 #if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
 int mainEntryClickHouseStaticFilesDiskUploader(int argc, char ** argv);
 #endif
+#if ENABLE_CLICKHOUSE_SU
+int mainEntryClickHouseSU(int argc, char ** argv);
+#endif
 #if ENABLE_CLICKHOUSE_INSTALL
 int mainEntryClickHouseInstall(int argc, char ** argv);
 int mainEntryClickHouseStart(int argc, char ** argv);
@ -81,8 +84,6 @@ int mainEntryClickHouseHashBinary(int, char **)
    return 0;
 }

-#define ARRAY_SIZE(a) (sizeof(a)/sizeof((a)[0]))
-
 namespace
 {

@ -138,6 +139,9 @@ std::pair<const char *, MainFunc> clickhouse_applications[] =
 #endif
 #if ENABLE_CLICKHOUSE_STATIC_FILES_DISK_UPLOADER
    {"static-files-disk-uploader", mainEntryClickHouseStaticFilesDiskUploader},
+#endif
+#if ENABLE_CLICKHOUSE_SU
+    {"su", mainEntryClickHouseSU},
 #endif
    {"hash-binary", mainEntryClickHouseHashBinary},
 };
@ -189,7 +193,7 @@ auto instructionFailToString(InstructionFail fail)
 {
    switch (fail)
    {
-#define ret(x) return std::make_tuple(STDERR_FILENO, x, ARRAY_SIZE(x) - 1)
+#define ret(x) return std::make_tuple(STDERR_FILENO, x, sizeof(x) - 1)
        case InstructionFail::NONE:
            ret("NONE");
        case InstructionFail::SSE3:
@ -277,7 +281,7 @@ void checkRequiredInstructionsImpl(volatile InstructionFail & fail)
 #define writeError(data) do \
    { \
        static_assert(__builtin_constant_p(data)); \
-        if (!writeRetry(STDERR_FILENO, data, ARRAY_SIZE(data) - 1)) \
+        if (!writeRetry(STDERR_FILENO, data, sizeof(data) - 1)) \
            _Exit(1); \
    } while (false)

@ -334,6 +338,7 @@ struct Checker
 #endif
 ;

+
 /// NOTE: We will migrate to full static linking or our own dynamic loader to make this code obsolete.
 void checkHarmfulEnvironmentVariables(char ** argv)
 {
@ -406,17 +411,17 @@ int main(int argc_, char ** argv_)
    inside_main = true;
    SCOPE_EXIT({ inside_main = false; });

+    /// PHDR cache is required for query profiler to work reliably
+    /// It also speeds up exception handling, but exceptions from dynamically loaded libraries (dlopen)
+    ///  will work only after additional call of this function.
+    updatePHDRCache();
+
    checkHarmfulEnvironmentVariables(argv_);

    /// Reset new handler to default (that throws std::bad_alloc)
    /// It is needed because LLVM library clobbers it.
    std::set_new_handler(nullptr);

-    /// PHDR cache is required for query profiler to work reliably
-    /// It also speed up exception handling, but exceptions from dynamically loaded libraries (dlopen)
-    ///  will work only after additional call of this function.
-    updatePHDRCache();
-
    std::vector<char *> argv(argv_, argv_ + argc_);

    /// Print a basic help if nothing was matched
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -1395,8 +1395,11 @@ int Server::main(const std::vector<std::string> & /*args*/)
    fs::create_directories(format_schema_path);

    /// Check sanity of MergeTreeSettings on server startup
-    global_context->getMergeTreeSettings().sanityCheck(settings);
-    global_context->getReplicatedMergeTreeSettings().sanityCheck(settings);
+    {
+        size_t background_pool_tasks = global_context->getMergeMutateExecutor()->getMaxTasksCount();
+        global_context->getMergeTreeSettings().sanityCheck(background_pool_tasks);
+        global_context->getReplicatedMergeTreeSettings().sanityCheck(background_pool_tasks);
+    }

    /// try set up encryption. There are some errors in config, error will be printed and server wouldn't start.
    CompressionCodecEncrypted::Configuration::instance().load(config(), "encryption_codecs");
--- a/programs/static-files-disk-uploader/static-files-disk-uploader.cpp
+++ b/programs/static-files-disk-uploader/static-files-disk-uploader.cpp
@ -200,6 +200,6 @@ try
 }
 catch (...)
 {
-    std::cerr << DB::getCurrentExceptionMessage(false);
+    std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
    return 1;
 }
--- a/programs/su/CMakeLists.txt
+++ b/programs/su/CMakeLists.txt
@ -0,0 +1,3 @@
+# Build description of the `clickhouse su` tool: a single translation unit linked against `dbms`.
+set (CLICKHOUSE_SU_SOURCES clickhouse-su.cpp)
+set (CLICKHOUSE_SU_LINK PRIVATE dbms)
+# NOTE(review): clickhouse_program_add is defined elsewhere; presumably it consumes the two variables above - confirm.
+clickhouse_program_add(su)
--- a/programs/su/clickhouse-su.cpp
+++ b/programs/su/clickhouse-su.cpp
@ -0,0 +1,145 @@
+#include <Common/Exception.h>
+#include <IO/ReadHelpers.h>
+#include <fmt/format.h>
+#include <vector>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <pwd.h>
+#include <grp.h>
+
+
+/// "su" means "set user"
+/// In fact, this program can set Unix user and group.
+///
+/// Usage:
+/// clickhouse su user[:group] args...
+///
+/// - will set user and, optionally, group and exec the remaining args.
+///   user and group can be numeric identifiers or strings.
+///
+/// The motivation for this tool is very obscure and idiosyncratic. It is needed for Docker.
+/// People want to run programs inside Docker with dropped privileges (less than root).
+/// But the standard Linux "su" program is not suitable for usage inside Docker,
+/// because it is creating pseudoterminals to avoid hijacking input from the terminal, for security,
+/// but Docker is also doing something with the terminal and it is incompatible.
+/// For this reason, people use alternative and less "secure" versions of "su" tools like "gosu" or "su-exec".
+/// But it would be very strange to use 3rd-party software only to do two-three syscalls.
+/// That's why we provide this tool.
+///
+/// Note: ClickHouse does not need Docker at all and works better without Docker.
+/// ClickHouse has no dependencies, it is packaged and distributed in single binary.
+/// There is no reason to use Docker unless you are already running all your software in Docker.
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+    extern const int SYSTEM_ERROR;
+}
+
+/// Switches the current process to the given Unix group and user.
+///
+/// Each argument is either a numeric id or a name; an empty string means "do not change".
+/// A value that does not parse as a number (or parses as 0) is looked up by name.
+/// Throws BAD_ARGUMENTS if the name is unknown or if the target id is 0 while the
+/// current real id is not 0, and SYSTEM_ERROR if a lookup or the set*id call fails.
+///
+/// NOTE(review): supplementary groups are not touched here (no setgroups/initgroups call),
+/// so the process keeps the supplementary groups it was started with - confirm this is intended.
+void setUserAndGroup(std::string arg_uid, std::string arg_gid)
+{
+    static constexpr size_t buf_size = 16384; /// Linux man page says it is enough. Nevertheless, we will check if it's not enough and throw.
+    std::unique_ptr<char[]> buf(new char[buf_size]);
+
+    /// Set the group first, because if we set user, the privileges will be already dropped and we will not be able to set the group later.
+
+    if (!arg_gid.empty())
+    {
+        /// Is it numeric id or name?
+        gid_t gid = 0;
+        if (!tryParse(gid, arg_gid) || gid == 0)
+        {
+            group entry{};
+            group * result{};
+
+            /// NOTE(review): getgrnam_r reports failures through its return value; errno may not be set - confirm throwFromErrno reports the right code.
+            if (0 != getgrnam_r(arg_gid.data(), &entry, buf.get(), buf_size, &result))
+                throwFromErrno(fmt::format("Cannot do 'getgrnam_r' to obtain gid from group name ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
+
+            /// A zero return code with a null result means "no such group".
+            if (!result)
+                throw Exception(fmt::format("Group {} is not found in the system", arg_gid), ErrorCodes::BAD_ARGUMENTS);
+
+            gid = entry.gr_gid;
+        }
+
+        if (gid == 0 && getgid() != 0)
+            throw Exception("Group has id 0, but dropping privileges to gid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
+
+        if (0 != setgid(gid))
+            throwFromErrno(fmt::format("Cannot do 'setgid' to group ({})", arg_gid), ErrorCodes::SYSTEM_ERROR);
+    }
+
+    if (!arg_uid.empty())
+    {
+        /// Is it numeric id or name?
+        uid_t uid = 0;
+        if (!tryParse(uid, arg_uid) || uid == 0)
+        {
+            passwd entry{};
+            passwd * result{};
+
+            /// NOTE(review): same remark as for getgrnam_r - the error is in the return value, errno may be stale.
+            if (0 != getpwnam_r(arg_uid.data(), &entry, buf.get(), buf_size, &result))
+                throwFromErrno(fmt::format("Cannot do 'getpwnam_r' to obtain uid from user name ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
+
+            /// A zero return code with a null result means "no such user".
+            if (!result)
+                throw Exception(fmt::format("User {} is not found in the system", arg_uid), ErrorCodes::BAD_ARGUMENTS);
+
+            uid = entry.pw_uid;
+        }
+
+        if (uid == 0 && getuid() != 0)
+            throw Exception("User has id 0, but dropping privileges to uid 0 does not make sense", ErrorCodes::BAD_ARGUMENTS);
+
+        if (0 != setuid(uid))
+            throwFromErrno(fmt::format("Cannot do 'setuid' to user ({})", arg_uid), ErrorCodes::SYSTEM_ERROR);
+    }
+}
+
+}
+
+
+/// Entry point of `clickhouse su user[:group] args...`.
+/// Switches to the requested user (and optionally group), then replaces the current
+/// process with the command given by the remaining arguments.
+/// Returns 1 (after printing the message to stderr) if anything throws.
+int mainEntryClickHouseSU(int argc, char ** argv)
+try
+{
+    using namespace DB;
+
+    /// We need at least the tool name, the user[:group] spec and a command to run.
+    if (argc < 3)
+    {
+        std::cout << "Usage: ./clickhouse su user:group ..." << std::endl;
+        exit(0);
+    }
+
+    /// Split the spec at the first ':'. Without a colon the whole spec is the user and the group stays empty.
+    const std::string_view spec = argv[1];
+    const auto colon = spec.find(':');
+
+    std::string user(spec.substr(0, colon));
+    std::string group;
+    if (colon != std::string_view::npos)
+        group = std::string(spec.substr(colon + 1));
+
+    setUserAndGroup(std::move(user), std::move(group));
+
+    /// Arguments of the target command: everything after the spec, followed by the null terminator execvp expects.
+    std::vector<char *> new_argv(argv + 2, argv + argc);
+    new_argv.push_back(nullptr);
+
+    execvp(new_argv.front(), new_argv.data());
+
+    /// execvp returns only if it failed.
+    throwFromErrno("Cannot execvp", ErrorCodes::SYSTEM_ERROR);
+}
+catch (...)
+{
+    std::cerr << DB::getCurrentExceptionMessage(false) << '\n';
+    return 1;
+}
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -159,6 +159,7 @@ enum class AccessType
    M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
    M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
    M(SYSTEM_SYNC_DATABASE_REPLICA, "SYNC DATABASE REPLICA", DATABASE, SYSTEM) \
+    M(SYSTEM_SYNC_TRANSACTION_LOG, "SYNC TRANSACTION LOG", GLOBAL, SYSTEM) \
    M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \
    M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \
    M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \
--- a/src/AggregateFunctions/AggregateFunctionAvg.h
+++ b/src/AggregateFunctions/AggregateFunctionAvg.h
@ -224,8 +224,16 @@ public:
        ++this->data(place).denominator;
    }

-    void
-    addBatchSinglePlace(
+    /// Accounts for `length` default-valued rows at once: only the denominator grows by `length`,
+    /// the numerator is not modified (presumably because the default numeric value is zero - confirm).
+    void addManyDefaults(
+        AggregateDataPtr __restrict place,
+        const IColumn ** /*columns*/,
+        size_t length,
+        Arena * /*arena*/) const override
+    {
+        this->data(place).denominator += length;
+    }
+
+    void addBatchSinglePlace(
        size_t row_begin,
        size_t row_end,
        AggregateDataPtr place,
--- a/src/AggregateFunctions/AggregateFunctionCount.h
+++ b/src/AggregateFunctions/AggregateFunctionCount.h
@ -53,6 +53,15 @@ public:
        ++data(place).count;
    }

+    /// Accounts for `length` default-valued rows at once by adding `length` to the counter;
+    /// the column contents are not inspected.
+    void addManyDefaults(
+        AggregateDataPtr __restrict place,
+        const IColumn ** /*columns*/,
+        size_t length,
+        Arena * /*arena*/) const override
+    {
+        data(place).count += length;
+    }
+
    void addBatchSinglePlace(
        size_t row_begin,
        size_t row_end,
--- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
+++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h
@ -880,8 +880,9 @@ struct AggregateFunctionMinData : Data
 {
    using Self = AggregateFunctionMinData;

-    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfLess(column, row_num, arena); }
-    bool changeIfBetter(const Self & to, Arena * arena)                        { return this->changeIfLess(to, arena); }
+    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena)     { return this->changeIfLess(column, row_num, arena); }
+    bool changeIfBetter(const Self & to, Arena * arena)                            { return this->changeIfLess(to, arena); }
+    void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfLess(column, 0, arena); }

    static const char * name() { return "min"; }

@ -907,8 +908,9 @@ struct AggregateFunctionMaxData : Data
 {
    using Self = AggregateFunctionMaxData;

-    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeIfGreater(column, row_num, arena); }
-    bool changeIfBetter(const Self & to, Arena * arena)                        { return this->changeIfGreater(to, arena); }
+    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena)     { return this->changeIfGreater(column, row_num, arena); }
+    bool changeIfBetter(const Self & to, Arena * arena)                            { return this->changeIfGreater(to, arena); }
+    void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfGreater(column, 0, arena); }

    static const char * name() { return "max"; }

@ -935,8 +937,9 @@ struct AggregateFunctionAnyData : Data
    using Self = AggregateFunctionAnyData;
    static constexpr bool is_any = true;

-    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeFirstTime(column, row_num, arena); }
-    bool changeIfBetter(const Self & to, Arena * arena)                        { return this->changeFirstTime(to, arena); }
+    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena)     { return this->changeFirstTime(column, row_num, arena); }
+    bool changeIfBetter(const Self & to, Arena * arena)                            { return this->changeFirstTime(to, arena); }
+    void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeFirstTime(column, 0, arena); }

    static const char * name() { return "any"; }

@ -962,8 +965,9 @@ struct AggregateFunctionAnyLastData : Data
 {
    using Self = AggregateFunctionAnyLastData;

-    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena) { return this->changeEveryTime(column, row_num, arena); }
-    bool changeIfBetter(const Self & to, Arena * arena)                        { return this->changeEveryTime(to, arena); }
+    bool changeIfBetter(const IColumn & column, size_t row_num, Arena * arena)     { return this->changeEveryTime(column, row_num, arena); }
+    bool changeIfBetter(const Self & to, Arena * arena)                            { return this->changeEveryTime(to, arena); }
+    void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeEveryTime(column, 0, arena); }

    static const char * name() { return "anyLast"; }

@ -1024,6 +1028,8 @@ struct AggregateFunctionSingleValueOrNullData : Data
        return false;
    }

+    void addManyDefaults(const IColumn & column, size_t /*length*/, Arena * arena) { this->changeIfBetter(column, 0, arena); }
+
    void insertResultInto(IColumn & to) const
    {
        if (is_null || first_value)
@ -1098,6 +1104,12 @@ struct AggregateFunctionAnyHeavyData : Data
        return false;
    }

+    /// Feeds the default value (row 0 of `column`) into the state `length` times.
+    /// Unlike the sibling data types, the repetition count is honoured here, one call per default row.
+    void addManyDefaults(const IColumn & column, size_t length, Arena * arena)
+    {
+        size_t remaining = length;
+        while (remaining-- > 0)
+            changeIfBetter(column, 0, arena);
+    }
+
    void write(WriteBuffer & buf, const ISerialization & serialization) const
    {
        Data::write(buf, serialization);
@ -1158,6 +1170,15 @@ public:
        this->data(place).changeIfBetter(*columns[0], row_num, arena);
    }

+    /// Adds `length` default values (row 0 of columns[0]) to the state by delegating
+    /// to the data type's own addManyDefaults.
+    ///
+    /// A zero `length` is a no-op. Most data-level implementations ignore `length` and apply
+    /// the default value exactly once, so without this guard a call made for a range that
+    /// contains no default rows would still inject the default into min/max/any-like states.
+    void addManyDefaults(
+        AggregateDataPtr __restrict place,
+        const IColumn ** columns,
+        size_t length,
+        Arena * arena) const override
+    {
+        if (length == 0)
+            return;
+
+        this->data(place).addManyDefaults(*columns[0], length, arena);
+    }
+
    void addBatchSinglePlace(
        size_t row_begin,
        size_t row_end,
--- a/src/AggregateFunctions/AggregateFunctionSum.h
+++ b/src/AggregateFunctions/AggregateFunctionSum.h
@ -489,6 +489,33 @@ public:
        }
    }

+    /// Intentionally does nothing: the state is left untouched for default-valued rows
+    /// (presumably because the default value is zero and cannot change a sum - confirm).
+    void addManyDefaults(
+        AggregateDataPtr __restrict /*place*/,
+        const IColumn ** /*columns*/,
+        size_t /*length*/,
+        Arena * /*arena*/) const override
+    {
+    }
+
+    /// Batch add for a sparse column: only the explicitly stored values whose row lies in
+    /// [row_begin, row_end) are added, each to the place of its own row. Default rows are skipped.
+    void addBatchSparse(
+        size_t row_begin,
+        size_t row_end,
+        AggregateDataPtr * places,
+        size_t place_offset,
+        const IColumn ** columns,
+        Arena * arena) const override
+    {
+        const auto & sparse = assert_cast<const ColumnSparse &>(*columns[0]);
+        const auto & offsets = sparse.getOffsetsData();
+        const auto * values = &sparse.getValuesColumn();
+
+        /// Positions in `offsets` of the first stored row >= row_begin and of the first stored row >= row_end.
+        const size_t first = std::lower_bound(offsets.begin(), offsets.end(), row_begin) - offsets.begin();
+        const size_t last = std::lower_bound(offsets.begin(), offsets.end(), row_end) - offsets.begin();
+
+        /// The stored value number `idx` is read from position `idx + 1` of the values column
+        /// (position 0 is presumably reserved for the default value - confirm against ColumnSparse).
+        for (size_t idx = first; idx < last; ++idx)
+            add(places[offsets[idx]] + place_offset, &values, idx + 1, arena);
+    }
+
    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
    {
        this->data(place).merge(this->data(rhs));
--- a/src/AggregateFunctions/AggregateFunctionUniq.h
+++ b/src/AggregateFunctions/AggregateFunctionUniq.h
@ -237,6 +237,15 @@ public:
        detail::OneAdder<T, Data>::add(this->data(place), *columns[0], row_num);
    }

+    /// Adds the default value (row 0 of columns[0]) to the set of seen values.
+    /// One insertion is enough regardless of how many default rows there are,
+    /// but nothing must be inserted when there are none - hence the zero-length guard.
+    void addManyDefaults(
+        AggregateDataPtr __restrict place,
+        const IColumn ** columns,
+        size_t length,
+        Arena * /*arena*/) const override
+    {
+        if (length == 0)
+            return;
+
+        detail::OneAdder<T, Data>::add(this->data(place), *columns[0], 0);
+    }
+
    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
    {
        this->data(place).set.merge(this->data(rhs).set);
--- a/src/AggregateFunctions/IAggregateFunction.h
+++ b/src/AggregateFunctions/IAggregateFunction.h
@ -123,6 +123,10 @@ public:
     */
    virtual void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const = 0;

+    /// Adds several default values of arguments into aggregation data on which place points to.
+    /// Default values must be at the 0-th positions in columns.
+    virtual void addManyDefaults(AggregateDataPtr __restrict place, const IColumn ** columns, size_t length, Arena * arena) const = 0;
+
    /// Merges state (on which place points to) with other state of current aggregation function.
    virtual void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const = 0;

@ -377,6 +381,16 @@ public:

    AddFunc getAddressOfAddFunction() const override { return &addFree; }

+    /// Generic fallback: calls the derived class's `add` with row 0 of `columns`
+    /// (where the default value is expected to be) once per default row, `length` times in total.
+    void addManyDefaults(
+        AggregateDataPtr __restrict place,
+        const IColumn ** columns,
+        size_t length,
+        Arena * arena) const override
+    {
+        const auto * derived = static_cast<const Derived *>(this);
+        for (size_t remaining = length; remaining > 0; --remaining)
+            derived->add(place, columns, 0, arena);
+    }
+
    void addBatch( /// NOLINT
        size_t row_begin,
        size_t row_end,
@ -413,13 +427,9 @@ public:
    {
        const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
        const auto * values = &column_sparse.getValuesColumn();
-        auto offset_it = column_sparse.begin();
+        auto offset_it = column_sparse.getIterator(row_begin);

-        /// FIXME: make it more optimal
-        for (size_t i = 0; i < row_begin; ++i, ++offset_it)
-            ;
-
-        for (size_t i = 0; i < row_end; ++i, ++offset_it)
+        for (size_t i = row_begin; i < row_end; ++i, ++offset_it)
            static_cast<const Derived *>(this)->add(places[offset_it.getCurrentRow()] + place_offset,
                                                    &values, offset_it.getValueIndex(), arena);
    }
@ -468,17 +478,16 @@ public:
        const IColumn ** columns,
        Arena * arena) const override
    {
-        /// TODO: add values and defaults separately if order of adding isn't important.
        const auto & column_sparse = assert_cast<const ColumnSparse &>(*columns[0]);
        const auto * values = &column_sparse.getValuesColumn();
-        auto offset_it = column_sparse.begin();
+        const auto & offsets = column_sparse.getOffsetsData();

-        /// FIXME: make it more optimal
-        for (size_t i = 0; i < row_begin; ++i, ++offset_it)
-            ;
+        auto from = std::lower_bound(offsets.begin(), offsets.end(), row_begin) - offsets.begin() + 1;
+        auto to = std::lower_bound(offsets.begin(), offsets.end(), row_end) - offsets.begin() + 1;

-        for (size_t i = 0; i < row_end; ++i, ++offset_it)
-            static_cast<const Derived *>(this)->add(place, &values, offset_it.getValueIndex(), arena);
+        size_t num_defaults = (row_end - row_begin) - (to - from);
+        static_cast<const Derived *>(this)->addBatchSinglePlace(from, to, place, &values, arena, -1);
+        static_cast<const Derived *>(this)->addManyDefaults(place, &values, num_defaults, arena);
    }

    void addBatchSinglePlaceNotNull( /// NOLINT
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -87,6 +87,7 @@ add_headers_and_sources(clickhouse_common_io IO/S3)
 list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)

 add_headers_and_sources(dbms Disks/IO)
+add_headers_and_sources(dbms Disks/ObjectStorages)
 if (TARGET ch_contrib::sqlite)
    add_headers_and_sources(dbms Databases/SQLite)
 endif()
@ -113,16 +114,16 @@ endif()

 if (TARGET ch_contrib::aws_s3)
    add_headers_and_sources(dbms Common/S3)
-    add_headers_and_sources(dbms Disks/S3)
+    add_headers_and_sources(dbms Disks/ObjectStorages/S3)
 endif()

 if (TARGET ch_contrib::azure_sdk)
-    add_headers_and_sources(dbms Disks/AzureBlobStorage)
+    add_headers_and_sources(dbms Disks/ObjectStorages/AzureBlobStorage)
 endif()

 if (TARGET ch_contrib::hdfs)
    add_headers_and_sources(dbms Storages/HDFS)
-    add_headers_and_sources(dbms Disks/HDFS)
+    add_headers_and_sources(dbms Disks/ObjectStorages/HDFS)
 endif()

 add_headers_and_sources(dbms Storages/Cache)
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -285,11 +285,11 @@ void ClientBase::setupSignalHandler()
    sigemptyset(&new_act.sa_mask);
 #else
    if (sigemptyset(&new_act.sa_mask))
-        throw Exception(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler.");
+        throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
 #endif

    if (sigaction(SIGINT, &new_act, nullptr))
-        throw Exception(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler.");
+        throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
 }


@ -492,7 +492,8 @@ try
        String pager = config().getString("pager", "");
        if (!pager.empty())
        {
-            signal(SIGPIPE, SIG_IGN);
+            if (SIG_ERR == signal(SIGPIPE, SIG_IGN))
+                throwFromErrno("Cannot set signal handler.", ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);

            ShellCommand::Config config(pager);
            config.pipe_stdin_only = true;
--- a/src/Columns/ColumnSparse.cpp
+++ b/src/Columns/ColumnSparse.cpp
@ -772,6 +772,14 @@ size_t ColumnSparse::getValueIndex(size_t n) const
    return it - offsets_data.begin() + 1;
 }

+/// Builds an iterator positioned at row `n`.
+/// The offset index handed to the iterator is the position of the first stored offset
+/// that is not less than `n` (found with a binary search over the offsets array).
+ColumnSparse::Iterator ColumnSparse::getIterator(size_t n) const
+{
+    const auto & offsets = getOffsetsData();
+    const auto * first_not_less = std::lower_bound(offsets.begin(), offsets.end(), n);
+    size_t offset_index = first_not_less - offsets.begin();
+
+    return Iterator(offsets, _size, offset_index, n);
+}
+
 ColumnPtr recursiveRemoveSparse(const ColumnPtr & column)
 {
    if (!column)
--- a/src/Columns/ColumnSparse.h
+++ b/src/Columns/ColumnSparse.h
@ -215,6 +215,7 @@ public:

    Iterator begin() const { return Iterator(getOffsetsData(), _size, 0, 0); }
    Iterator end() const { return Iterator(getOffsetsData(), _size, getOffsetsData().size(), _size); }
+    Iterator getIterator(size_t n) const;

 private:
    using Inserter = std::function<void(IColumn &)>;
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -628,6 +628,7 @@
    M(657, UNSUPPORTED_MEILISEARCH_TYPE) \
    M(658, MEILISEARCH_MISSING_SOME_COLUMNS) \
    M(659, UNKNOWN_STATUS_OF_TRANSACTION) \
+    M(660, HDFS_ERROR) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/FileCache.cpp
+++ b/src/Common/FileCache.cpp
@ -72,6 +72,8 @@ void IFileCache::assertInitialized() const

 LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_)
    : IFileCache(cache_base_path_, cache_settings_)
+    /// The stash queue capacity is taken from the same `max_elements` setting of the cache settings.
+    , max_stash_element_size(cache_settings_.max_elements)
+    /// A zero threshold makes the hits-threshold check in addCell evaluate to false.
+    , enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold)
    , log(&Poco::Logger::get("LRUFileCache"))
 {
 }
@ -404,9 +406,42 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
            "Cache already exists for key: `{}`, offset: {}, size: {}.\nCurrent cache structure: {}",
            keyToStr(key), offset, size, dumpStructureUnlocked(key, cache_lock));

-    auto file_segment = std::make_shared<FileSegment>(offset, size, key, this, state);
-    FileSegmentCell cell(std::move(file_segment), this, cache_lock);
+    /// Creates the file segment for the new cell and picks its initial state.
+    /// When the requested state is EMPTY and the hits threshold is non-zero, requests for each
+    /// (key, offset) pair are counted in `records` / `stash_queue`, and the segment is created
+    /// as SKIP_CACHE until the counter reaches the threshold.
+    /// In every other case the segment is created with the requested state unchanged.
+    auto skip_or_download = [&]() -> FileSegmentPtr
+    {
+        if (state == FileSegment::State::EMPTY && enable_cache_hits_threshold)
+        {
+            auto record = records.find({key, offset});

+            if (record == records.end())
+            {
+                /// First request for this (key, offset): start tracking it.
+                /// The stash entry is added with size 0; its hit counter starts at zero.
+                auto queue_iter = stash_queue.add(key, offset, 0, cache_lock);
+                records.insert({{key, offset}, queue_iter});
+
+                /// Keep the stash bounded: when it grows past the limit, drop the entry at the
+                /// front of the stash queue together with its access record.
+                if (stash_queue.getElementsNum(cache_lock) > max_stash_element_size)
+                {
+                    auto remove_queue_iter = stash_queue.begin();
+                    records.erase({remove_queue_iter->key, remove_queue_iter->offset});
+                    stash_queue.remove(remove_queue_iter, cache_lock);
+                }
+
+                /// Segments that have not reached the hits threshold are not downloaded into the cache; they are read directly.
+                return std::make_shared<FileSegment>(offset, size, key, this, FileSegment::State::SKIP_CACHE);
+            }
+            else
+            {
+                /// Already tracked: count this request and move the entry to the end of the stash queue.
+                auto queue_iter = record->second;
+                queue_iter->hits++;
+                stash_queue.moveToEnd(queue_iter, cache_lock);
+
+                /// NOTE: `state` is captured by reference, so this assignment is also visible
+                /// to the code that runs after the lambda returns.
+                state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE;
+                return std::make_shared<FileSegment>(offset, size, key, this, state);
+            }
+        }
+        else
+            /// Threshold disabled or a non-EMPTY state was requested: keep the requested state.
+            return std::make_shared<FileSegment>(offset, size, key, this, state);
+    };
+
+    FileSegmentCell cell(skip_or_download(), this, cache_lock);
    auto & offsets = files[key];

    if (offsets.empty())
@ -471,7 +506,7 @@ bool LRUFileCache::tryReserve(
    std::vector<FileSegmentCell *> to_evict;
    std::vector<FileSegmentCell *> trash;

-    for (const auto & [entry_key, entry_offset, entry_size] : queue)
+    for (const auto & [entry_key, entry_offset, entry_size, _] : queue)
    {
        if (!is_overflow())
            break;
@ -603,10 +638,6 @@ void LRUFileCache::remove(const Key & key)

    if (fs::exists(key_path))
        fs::remove(key_path);
-
-#ifndef NDEBUG
-    assertCacheCorrectness(cache_lock);
-#endif
 }

 void LRUFileCache::remove()
@ -619,7 +650,7 @@ void LRUFileCache::remove()
    std::vector<FileSegment *> to_remove;
    for (auto it = queue.begin(); it != queue.end();)
    {
-        const auto & [key, offset, size] = *it++;
+        const auto & [key, offset, size, _] = *it++;
        auto * cell = getCell(key, offset, cache_lock);
        if (!cell)
            throw Exception(
@ -637,6 +668,10 @@ void LRUFileCache::remove()
            }
        }
    }
+
+    /// Remove all access information.
+    records.clear();
+    stash_queue.removeAll(cache_lock);
 }

 void LRUFileCache::remove(
@ -882,6 +917,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(
            queue_iterator = cache->queue.add(file_segment->key(), file_segment->offset(), file_segment->range().size(), cache_lock);
            break;
        }
+        case FileSegment::State::SKIP_CACHE:
        case FileSegment::State::EMPTY:
        case FileSegment::State::DOWNLOADING:
        {
@ -898,7 +934,7 @@ LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
    const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & /* cache_lock */)
 {
 #ifndef NDEBUG
-    for (const auto & [entry_key, entry_offset, _] : queue)
+    for (const auto & [entry_key, entry_offset, entry_size, entry_hits] : queue)
    {
        if (entry_key == key && entry_offset == offset)
            throw Exception(
@ -918,6 +954,12 @@ void LRUFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mute
    queue.erase(queue_it);
 }

+/// Drops every entry from the queue and resets the accounted size to zero.
+/// The lock argument is unused in the body; it is only part of the signature.
+void LRUFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
+{
+    cache_size = 0;
+    queue.clear();
+}
+
 void LRUFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
 {
    queue.splice(queue.end(), queue, queue_it);
@ -934,7 +976,7 @@ bool LRUFileCache::LRUQueue::contains(
 {
    /// This method is used for assertions in debug mode.
    /// So we do not care about complexity here.
-    for (const auto & [entry_key, entry_offset, size] : queue)
+    for (const auto & [entry_key, entry_offset, size, _] : queue)
    {
        if (key == entry_key && offset == entry_offset)
            return true;
@ -947,7 +989,7 @@ void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_g
    [[maybe_unused]] size_t total_size = 0;
    for (auto it = queue.begin(); it != queue.end();)
    {
-        auto & [key, offset, size] = *it++;
+        auto & [key, offset, size, _] = *it++;

        auto * cell = cache->getCell(key, offset, cache_lock);
        if (!cell)
@ -969,7 +1011,7 @@ void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_g
 String LRUFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
 {
    String result;
-    for (const auto & [key, offset, size] : queue)
+    for (const auto & [key, offset, size, _] : queue)
    {
        if (!result.empty())
            result += ", ";
--- a/src/Common/FileCache.h
+++ b/src/Common/FileCache.h
@ -7,6 +7,7 @@
 #include <mutex>
 #include <unordered_map>
 #include <unordered_set>
+#include <boost/functional/hash.hpp>
 #include <boost/noncopyable.hpp>
 #include <map>

@ -165,6 +166,7 @@ private:
            Key key;
            size_t offset;
            size_t size;
+            size_t hits = 0;

            FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
        };
@ -194,6 +196,8 @@ private:

        Iterator end() { return queue.end(); }

+        void removeAll(std::lock_guard<std::mutex> & cache_lock);
+
    private:
        std::list<FileKeyAndOffset> queue;
        size_t cache_size = 0;
@ -223,8 +227,26 @@ private:
    using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
    using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;

+    using AccessKeyAndOffset = std::pair<Key, size_t>;
+
+    /// Hasher for (key, offset) pairs used by the access records map:
+    /// the result is the XOR of the standard hashes of the two pair members.
+    struct KeyAndOffsetHash
+    {
+        std::size_t operator()(const AccessKeyAndOffset & key) const
+        {
+            const std::size_t key_hash = std::hash<UInt128>()(key.first);
+            const std::size_t offset_hash = std::hash<UInt64>()(key.second);
+            return key_hash ^ offset_hash;
+        }
+    };
+
+    using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
+
    CachedFiles files;
    LRUQueue queue;
+
+    LRUQueue stash_queue;
+    AccessRecord records;
+    size_t max_stash_element_size;
+    size_t enable_cache_hits_threshold;
+
    Poco::Logger * log;

    FileSegments getImpl(
--- a/src/Common/FileCacheSettings.cpp
+++ b/src/Common/FileCacheSettings.cpp
@ -11,6 +11,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
    max_elements = config.getUInt64(config_prefix + ".data_cache_max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
    max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
    cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
+    enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
 }

 }
--- a/src/Common/FileCacheSettings.h
+++ b/src/Common/FileCacheSettings.h
@ -14,6 +14,8 @@ struct FileCacheSettings
    size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
    bool cache_on_write_operations = false;

+    size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;
+
    void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
 };

--- a/src/Common/FileCache_fwd.h
+++ b/src/Common/FileCache_fwd.h
@ -7,6 +7,7 @@ namespace DB
 static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_CACHE_SIZE = 1024 * 1024 * 1024;
 static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE = 100 * 1024 * 1024;
 static constexpr int REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS = 1024 * 1024;
+static constexpr int REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD = 0;

 class IFileCache;
 using FileCachePtr = std::shared_ptr<IFileCache>;
--- a/src/Common/FileSegment.cpp
+++ b/src/Common/FileSegment.cpp
@ -59,6 +59,10 @@ FileSegment::FileSegment(
            downloader_id = getCallerId();
            break;
        }
+        case (State::SKIP_CACHE):
+        {
+            break;
+        }
        default:
        {
            throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Can create cell with either EMPTY, DOWNLOADED, DOWNLOADING state");
@ -525,6 +529,14 @@ void FileSegment::complete(std::lock_guard<std::mutex> & cache_lock)

 void FileSegment::completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock)
 {
+    bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
+
+    if (is_last_holder && download_state == State::SKIP_CACHE)
+    {
+        cache->remove(key(), offset(), cache_lock, segment_lock);
+        return;
+    }
+
    if (download_state == State::SKIP_CACHE || is_detached)
        return;

@ -542,8 +554,7 @@ void FileSegment::completeUnlocked(std::lock_guard<std::mutex> & cache_lock, std
        /// Segment state can be changed from DOWNLOADING or EMPTY only if the caller is the
        /// downloader or the only owner of the segment.

-        bool can_update_segment_state = isDownloaderImpl(segment_lock)
-            || cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
+        bool can_update_segment_state = isDownloaderImpl(segment_lock) || is_last_holder;

        if (can_update_segment_state)
            download_state = State::PARTIALLY_DOWNLOADED;
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@ -17,7 +17,7 @@ namespace DB

 template <bool thread_safe>
 void OptimizedRegularExpressionImpl<thread_safe>::analyze(
-    const std::string & regexp,
+    std::string_view regexp,
    std::string & required_substring,
    bool & is_trivial,
    bool & required_substring_is_prefix)
--- a/src/Common/OptimizedRegularExpression.h
+++ b/src/Common/OptimizedRegularExpression.h
@ -86,8 +86,6 @@ public:
    /// Get the regexp re2 or nullptr if the pattern is trivial (for output to the log).
    const std::unique_ptr<RegexType> & getRE2() const { return re2; }

-    static void analyze(const std::string & regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
-
    void getAnalyzeResult(std::string & out_required_substring, bool & out_is_trivial, bool & out_required_substring_is_prefix) const
    {
        out_required_substring = required_substring;
@ -104,6 +102,8 @@ private:
    std::optional<DB::StringSearcher<false, true>> case_insensitive_substring_searcher;
    std::unique_ptr<RegexType> re2;
    unsigned number_of_subpatterns;
+
+    static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
 };

 using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -144,6 +144,13 @@
    M(MergeTreeDataWriterBlocks, "Number of blocks INSERTed to MergeTree tables. Each block forms a data part of level zero.") \
    M(MergeTreeDataWriterBlocksAlreadySorted, "Number of blocks INSERTed to MergeTree tables that appeared to be already sorted.") \
    \
+    M(InsertedWideParts, "Number of parts inserted in Wide format.") \
+    M(InsertedCompactParts, "Number of parts inserted in Compact format.") \
+    M(InsertedInMemoryParts, "Number of parts inserted in InMemory format.") \
+    M(MergedIntoWideParts, "Number of parts merged into Wide format.") \
+    M(MergedIntoCompactParts, "Number of parts merged into Compact format.") \
+    M(MergedIntoInMemoryParts, "Number of parts merged into InMemory format.") \
+    \
    M(MergeTreeDataProjectionWriterRows, "Number of rows INSERTed to MergeTree tables projection.") \
    M(MergeTreeDataProjectionWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables projection.") \
    M(MergeTreeDataProjectionWriterCompressedBytes, "Bytes written to filesystem for data INSERTed to MergeTree tables projection.") \
--- a/src/Common/ThreadFuzzer.cpp
+++ b/src/Common/ThreadFuzzer.cpp
@ -1,3 +1,5 @@
+// NOLINTBEGIN(readability-inconsistent-declaration-parameter-name)
+
 #include <csignal>
 #include <sys/time.h>
 #if defined(OS_LINUX)
@ -317,3 +319,5 @@ FOR_EACH_WRAPPED_FUNCTION(MAKE_WRAPPER)
 #    undef MAKE_WRAPPER
 #endif
 }
+
+// NOLINTEND(readability-inconsistent-declaration-parameter-name)
--- a/src/Common/filesystemHelpers.cpp
+++ b/src/Common/filesystemHelpers.cpp
@ -1,10 +1,8 @@
 #include "filesystemHelpers.h"

-#include <sys/stat.h>
 #if defined(__linux__)
 #    include <cstdio>
 #    include <mntent.h>
-#    include <sys/stat.h>
 #    include <sys/sysmacros.h>
 #endif
 #include <cerrno>
@ -13,6 +11,7 @@
 #include <filesystem>
 #include <fcntl.h>
 #include <unistd.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <utime.h>
 #include <IO/ReadBufferFromFile.h>
--- a/src/Coordination/KeeperServer.cpp
+++ b/src/Coordination/KeeperServer.cpp
@ -15,7 +15,6 @@
 #include <IO/WriteHelpers.h>
 #include <boost/algorithm/string.hpp>
 #include <libnuraft/cluster_config.hxx>
-#include <libnuraft/log_val_type.hxx>
 #include <libnuraft/raft_server.hxx>
 #include <Poco/Util/AbstractConfiguration.h>
 #include <Poco/Util/Application.h>
@ -316,22 +315,6 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo

    state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items);

-    auto log_store = state_manager->load_log_store();
-    auto next_log_idx = log_store->next_slot();
-    if (next_log_idx > 0 && next_log_idx > state_machine->last_commit_index())
-    {
-        auto log_entries = log_store->log_entries(state_machine->last_commit_index() + 1, next_log_idx);
-
-        auto idx = state_machine->last_commit_index() + 1;
-        for (const auto & entry : *log_entries)
-        {
-            if (entry && entry->get_val_type() == nuraft::log_val_type::app_log)
-                state_machine->preprocess(idx, entry->get_buf());
-
-            ++idx;
-        }
-    }
-
    loadLatestConfig();

    last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config;
--- a/src/Coordination/KeeperStateMachine.cpp
+++ b/src/Coordination/KeeperStateMachine.cpp
@ -44,6 +44,7 @@ namespace
        else /// backward compatibility
            request_for_session.time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

+
        return request_for_session;
    }
 }
@ -113,21 +114,6 @@ void KeeperStateMachine::init()
        storage = std::make_unique<KeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest);
 }

-nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
-{
-    preprocess(log_idx, data);
-    return nullptr;
-}
-
-void KeeperStateMachine::preprocess(const uint64_t log_idx, nuraft::buffer & data)
-{
-    auto request_for_session = parseRequest(data);
-    if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
-        return;
-    std::lock_guard lock(storage_and_responses_lock);
-    storage->preprocessRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, log_idx);
-}
-
 nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
 {
    auto request_for_session = parseRequest(data);
@ -196,12 +182,6 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
    cluster_config = ClusterConfig::deserialize(*tmp);
 }

-void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & /*data*/)
-{
-    std::lock_guard lock(storage_and_responses_lock);
-    storage->rollbackRequest(log_idx);
-}
-
 nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
 {
    /// Just return the latest snapshot.
@ -363,7 +343,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
 {
    /// Pure local request, just process it with storage
    std::lock_guard lock(storage_and_responses_lock);
-    auto responses = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, std::nullopt, true /*check_acl*/, true /*is_local*/);
+    auto responses = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, std::nullopt);
    for (const auto & response : responses)
        if (!responses_queue.push(response))
            throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", response.session_id);
--- a/src/Coordination/KeeperStateMachine.h
+++ b/src/Coordination/KeeperStateMachine.h
@ -27,16 +27,16 @@ public:
    /// Read state from the latest snapshot
    void init();

-    void preprocess(uint64_t log_idx, nuraft::buffer & data);
-
-    nuraft::ptr<nuraft::buffer> pre_commit(uint64_t log_idx, nuraft::buffer & data) override;
+    /// Pre-commit is currently not supported: the arguments are ignored and nullptr is returned.
+    nuraft::ptr<nuraft::buffer> pre_commit(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }

    nuraft::ptr<nuraft::buffer> commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT

    /// Save new cluster config to our snapshot (copy of the config stored in StateManager)
    void commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf) override; /// NOLINT

-    void rollback(uint64_t log_idx, nuraft::buffer & data) override;
+    /// Rollback is currently not supported: the arguments are ignored and nothing is done.
+    void rollback(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override {}

    uint64_t last_commit_index() override { return last_committed_idx; }

--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
--- a/src/Coordination/KeeperStorage.h
+++ b/src/Coordination/KeeperStorage.h
@ -1,14 +1,14 @@
 #pragma once

-#include <unordered_map>
-#include <vector>
-#include <Coordination/ACLMap.h>
+#include <Common/ZooKeeper/IKeeper.h>
+#include <Common/ConcurrentBoundedQueue.h>
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
 #include <Coordination/SessionExpiryQueue.h>
+#include <Coordination/ACLMap.h>
 #include <Coordination/SnapshotableHashTable.h>
 #include <IO/WriteBufferFromString.h>
-#include <Common/ConcurrentBoundedQueue.h>
-#include <Common/ZooKeeper/IKeeper.h>
-#include <Common/ZooKeeper/ZooKeeperCommon.h>
+#include <unordered_map>
+#include <vector>

 #include <absl/container/flat_hash_set.h>

@ -29,6 +29,7 @@ struct KeeperStorageSnapshot;
 class KeeperStorage
 {
 public:
+
    struct Node
    {
        uint64_t acl_id = 0; /// 0 -- no ACL by default
@ -40,18 +41,26 @@ public:
        Node() : size_bytes(sizeof(Node)) { }

        /// Object memory size
-        uint64_t sizeInBytes() const { return size_bytes; }
+        uint64_t sizeInBytes() const
+        {
+            return size_bytes;
+        }

        void setData(String new_data);

-        const auto & getData() const noexcept { return data; }
+        const auto & getData() const noexcept
+        {
+            return data;
+        }

        void addChild(StringRef child_path);

        void removeChild(StringRef child_path);

-        const auto & getChildren() const noexcept { return children; }
-
+        const auto & getChildren() const noexcept
+        {
+            return children;
+        }
    private:
        String data;
        ChildrenSet children{};
@ -76,7 +85,10 @@ public:
        std::string scheme;
        std::string id;

-        bool operator==(const AuthID & other) const { return scheme == other.scheme && id == other.id; }
+        bool operator==(const AuthID & other) const
+        {
+            return scheme == other.scheme && id == other.id;
+        }
    };

    using RequestsForSessions = std::vector<RequestForSession>;
@ -84,7 +96,7 @@ public:
    using Container = SnapshotableHashTable<Node>;
    using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;
    using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<std::string>>;
-    using SessionIDs = std::vector<int64_t>;
+    using SessionIDs = std::unordered_set<int64_t>;

    /// Just vector of SHA1 from user:password
    using AuthIDs = std::vector<AuthID>;
@ -100,146 +112,6 @@ public:
    /// container.
    Container container;

-    // Applying ZooKeeper request to storage consists of two steps:
-    //  - preprocessing which, instead of applying the changes directly to storage,
-    //    generates deltas with those changes, denoted with the request ZXID
-    //  - processing which applies deltas with the correct ZXID to the storage
-    //
-    // Delta objects allow us two things:
-    //  - fetch the latest, uncommitted state of an object by getting the committed
-    //    state of that same object from the storage and applying the deltas
-    //    in the same order as they are defined
-    //  - quickly commit the changes to the storage
-    struct CreateNodeDelta
-    {
-        Coordination::Stat stat;
-        bool is_ephemeral;
-        bool is_sequental;
-        Coordination::ACLs acls;
-        String data;
-    };
-
-    struct RemoveNodeDelta
-    {
-        int32_t version{-1};
-    };
-
-    struct UpdateNodeDelta
-    {
-        std::function<void(Node &)> update_fn;
-        int32_t version{-1};
-    };
-
-    struct SetACLDelta
-    {
-        Coordination::ACLs acls;
-        int32_t version{-1};
-    };
-
-    struct ErrorDelta
-    {
-        Coordination::Error error;
-    };
-
-    struct FailedMultiDelta
-    {
-        std::vector<Coordination::Error> error_codes;
-    };
-
-    // Denotes end of a subrequest in multi request
-    struct SubDeltaEnd
-    {
-    };
-
-    struct AddAuthDelta
-    {
-        int64_t session_id;
-        AuthID auth_id;
-    };
-
-    using Operation
-        = std::variant<CreateNodeDelta, RemoveNodeDelta, UpdateNodeDelta, SetACLDelta, AddAuthDelta, ErrorDelta, SubDeltaEnd, FailedMultiDelta>;
-
-    struct Delta
-    {
-        Delta(String path_, int64_t zxid_, Operation operation_) : path(std::move(path_)), zxid(zxid_), operation(std::move(operation_)) { }
-
-        Delta(int64_t zxid_, Coordination::Error error) : Delta("", zxid_, ErrorDelta{error}) { }
-
-        Delta(int64_t zxid_, Operation subdelta) : Delta("", zxid_, subdelta) { }
-
-        String path;
-        int64_t zxid;
-        Operation operation;
-    };
-
-    struct UncommittedState
-    {
-        explicit UncommittedState(KeeperStorage & storage_) : storage(storage_) { }
-
-        template <typename Visitor>
-        void applyDeltas(StringRef path, const Visitor & visitor) const
-        {
-            for (const auto & delta : deltas)
-            {
-                if (path.empty() || delta.path == path)
-                    std::visit(visitor, delta.operation);
-            }
-        }
-
-        bool hasACL(int64_t session_id, bool is_local, std::function<bool(const AuthID &)> predicate)
-        {
-            for (const auto & session_auth : storage.session_and_auth[session_id])
-            {
-                if (predicate(session_auth))
-                    return true;
-            }
-
-            if (is_local)
-                return false;
-
-
-            for (const auto & delta : deltas)
-            {
-                if (const auto * auth_delta = std::get_if<KeeperStorage::AddAuthDelta>(&delta.operation);
-                    auth_delta && auth_delta->session_id == session_id && predicate(auth_delta->auth_id))
-                    return true;
-            }
-
-            return false;
-        }
-
-        std::shared_ptr<Node> getNode(StringRef path);
-        bool hasNode(StringRef path) const;
-        Coordination::ACLs getACLs(StringRef path) const;
-
-        std::deque<Delta> deltas;
-        KeeperStorage & storage;
-    };
-
-    UncommittedState uncommitted_state{*this};
-
-    Coordination::Error commit(int64_t zxid, int64_t session_id);
-
-    // Create node in the storage
-    // Returns false if it failed to create the node, true otherwise
-    // We don't care about the exact failure because we should've caught it during preprocessing
-    bool createNode(
-        const std::string & path,
-        String data,
-        const Coordination::Stat & stat,
-        bool is_sequental,
-        bool is_ephemeral,
-        Coordination::ACLs node_acls,
-        int64_t session_id);
-
-    // Remove node in the storage
-    // Returns false if it failed to remove the node, true otherwise
-    // We don't care about the exact failure because we should've caught it during preprocessing
-    bool removeNode(const std::string & path, int32_t version);
-
-    bool checkACL(StringRef path, int32_t permissions, int64_t session_id, bool is_local);
-
    /// Mapping session_id -> set of ephemeral nodes paths
    Ephemerals ephemerals;
    /// Mapping session_id -> set of watched nodes paths
@ -258,12 +130,15 @@ public:

    /// Currently active watches (node_path -> subscribed sessions)
    Watches watches;
-    Watches list_watches; /// Watches for 'list' request (watches on children).
+    Watches list_watches;   /// Watches for 'list' request (watches on children).

    void clearDeadWatches(int64_t session_id);

    /// Get current zxid
-    int64_t getZXID() const { return zxid; }
+    int64_t getZXID() const
+    {
+        return zxid;
+    }

    const String superdigest;

@ -287,53 +162,78 @@ public:

    /// Process user request and return response.
    /// check_acl = false only when converting data from ZooKeeper.
-    ResponsesForSessions processRequest(
-        const Coordination::ZooKeeperRequestPtr & request,
-        int64_t session_id,
-        int64_t time,
-        std::optional<int64_t> new_last_zxid,
-        bool check_acl = true,
-        bool is_local = false);
-    void preprocessRequest(
-        const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, int64_t time, int64_t new_last_zxid, bool check_acl = true);
-    void rollbackRequest(int64_t rollback_zxid);
+    ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, int64_t time, std::optional<int64_t> new_last_zxid, bool check_acl = true);

    void finalize();

    /// Set of methods for creating snapshots

    /// Turn on snapshot mode, so data inside Container is not deleted, but replaced with new version.
-    void enableSnapshotMode(size_t up_to_version) { container.enableSnapshotMode(up_to_version); }
+    void enableSnapshotMode(size_t up_to_version)
+    {
+        container.enableSnapshotMode(up_to_version);
+    }

    /// Turn off snapshot mode.
-    void disableSnapshotMode() { container.disableSnapshotMode(); }
+    void disableSnapshotMode()
+    {
+        container.disableSnapshotMode();
+    }

-    Container::const_iterator getSnapshotIteratorBegin() const { return container.begin(); }
+    Container::const_iterator getSnapshotIteratorBegin() const
+    {
+        return container.begin();
+    }

    /// Clear outdated data from internal container.
-    void clearGarbageAfterSnapshot() { container.clearOutdatedNodes(); }
+    void clearGarbageAfterSnapshot()
+    {
+        container.clearOutdatedNodes();
+    }

    /// Get all active sessions
-    const SessionAndTimeout & getActiveSessions() const { return session_and_timeout; }
+    const SessionAndTimeout & getActiveSessions() const
+    {
+        return session_and_timeout;
+    }

    /// Get all dead sessions
-    std::vector<int64_t> getDeadSessions() const { return session_expiry_queue.getExpiredSessions(); }
+    std::vector<int64_t> getDeadSessions() const
+    {
+        return session_expiry_queue.getExpiredSessions();
+    }

    /// Introspection functions mostly used in 4-letter commands
-    uint64_t getNodesCount() const { return container.size(); }
+    uint64_t getNodesCount() const
+    {
+        return container.size();
+    }

-    uint64_t getApproximateDataSize() const { return container.getApproximateDataSize(); }
+    uint64_t getApproximateDataSize() const
+    {
+        return container.getApproximateDataSize();
+    }

-    uint64_t getArenaDataSize() const { return container.keyArenaSize(); }
+    uint64_t getArenaDataSize() const
+    {
+        return container.keyArenaSize();
+    }


    uint64_t getTotalWatchesCount() const;

-    uint64_t getWatchedPathsCount() const { return watches.size() + list_watches.size(); }
+    uint64_t getWatchedPathsCount() const
+    {
+        return watches.size() + list_watches.size();
+    }

    uint64_t getSessionsWithWatchesCount() const;

-    uint64_t getSessionWithEphemeralNodesCount() const { return ephemerals.size(); }
+    uint64_t getSessionWithEphemeralNodesCount() const
+    {
+        return ephemerals.size();
+    }
    uint64_t getTotalEphemeralNodesCount() const;

    void dumpWatches(WriteBufferFromOwnString & buf) const;
--- a/src/Coordination/WriteBufferFromNuraftBuffer.h
+++ b/src/Coordination/WriteBufferFromNuraftBuffer.h
@ -12,6 +12,7 @@ public:
    WriteBufferFromNuraftBuffer();

    nuraft::ptr<nuraft::buffer> getBuffer();
+    bool isFinished() const { return finalized; }

    ~WriteBufferFromNuraftBuffer() override;

--- a/src/Coordination/ZooKeeperDataReader.cpp
+++ b/src/Coordination/ZooKeeperDataReader.cpp
@ -520,7 +520,6 @@ bool deserializeTxn(KeeperStorage & storage, ReadBuffer & in, Poco::Logger * /*l
            if (request->getOpNum() == Coordination::OpNum::Multi && hasErrorsInMultiRequest(request))
                return true;

-            storage.preprocessRequest(request, session_id, time, zxid, /* check_acl = */ false);
            storage.processRequest(request, session_id, time, zxid, /* check_acl = */ false);
        }
    }
--- a/src/Coordination/tests/gtest_coordination.cpp
+++ b/src/Coordination/tests/gtest_coordination.cpp
@ -1,8 +1,6 @@
 #include <chrono>
 #include <gtest/gtest.h>
-#include "Common/ZooKeeper/IKeeper.h"

-#include "Coordination/KeeperStorage.h"
 #include "config_core.h"

 #if USE_NURAFT
@ -1263,7 +1261,6 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
        changelog.append(entry);
        changelog.end_of_append_batch(0, 0);

-        state_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
        state_machine->commit(i, changelog.entry_at(i)->get_buf());
        bool snapshot_created = false;
        if (i % settings->snapshot_distance == 0)
@ -1308,7 +1305,6 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint

    for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i)
    {
-        restore_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
        restore_machine->commit(i, changelog.entry_at(i)->get_buf());
    }

@ -1411,7 +1407,6 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
    request_c->path = "/hello";
    request_c->is_ephemeral = true;
    auto entry_c = getLogEntryFromZKRequest(0, 1, request_c);
-    state_machine->pre_commit(1, entry_c->get_buf());
    state_machine->commit(1, entry_c->get_buf());
    const auto & storage = state_machine->getStorage();

@ -1420,7 +1415,6 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
    request_d->path = "/hello";
    /// Delete from other session
    auto entry_d = getLogEntryFromZKRequest(0, 2, request_d);
-    state_machine->pre_commit(2, entry_d->get_buf());
    state_machine->commit(2, entry_d->get_buf());

    EXPECT_EQ(storage.ephemerals.size(), 0);
@ -1783,130 +1777,6 @@ TEST_P(CoordinationTest, TestLogGap)
    EXPECT_EQ(changelog1.next_slot(), 61);
 }

-template <typename ResponseType>
-ResponseType getSingleResponse(const auto & responses)
-{
-    EXPECT_FALSE(responses.empty());
-    return dynamic_cast<ResponseType &>(*responses[0].response);
-}
-
-TEST_P(CoordinationTest, TestUncommittedStateBasicCrud)
-{
-    using namespace DB;
-    using namespace Coordination;
-
-    DB::KeeperStorage storage{500, ""};
-
-    constexpr std::string_view path = "/test";
-
-    const auto get_committed_data = [&]() -> std::optional<String>
-    {
-        auto request = std::make_shared<ZooKeeperGetRequest>();
-        request->path = path;
-        auto responses = storage.processRequest(request, 0, 0, std::nullopt, true, true);
-        const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
-
-        if (get_response.error != Error::ZOK)
-            return std::nullopt;
-
-        return get_response.data;
-    };
-
-    const auto preprocess_get = [&](int64_t zxid)
-    {
-        auto get_request = std::make_shared<ZooKeeperGetRequest>();
-        get_request->path = path;
-        storage.preprocessRequest(get_request, 0, 0, zxid);
-        return get_request;
-    };
-
-    const auto create_request = std::make_shared<ZooKeeperCreateRequest>();
-    create_request->path = path;
-    create_request->data = "initial_data";
-    storage.preprocessRequest(create_request, 0, 0, 1);
-    storage.preprocessRequest(create_request, 0, 0, 2);
-
-    ASSERT_FALSE(get_committed_data());
-
-    const auto after_create_get = preprocess_get(3);
-
-    ASSERT_FALSE(get_committed_data());
-
-    const auto set_request = std::make_shared<ZooKeeperSetRequest>();
-    set_request->path = path;
-    set_request->data = "new_data";
-    storage.preprocessRequest(set_request, 0, 0, 4);
-
-    const auto after_set_get = preprocess_get(5);
-
-    ASSERT_FALSE(get_committed_data());
-
-    const auto remove_request = std::make_shared<ZooKeeperRemoveRequest>();
-    remove_request->path = path;
-    storage.preprocessRequest(remove_request, 0, 0, 6);
-    storage.preprocessRequest(remove_request, 0, 0, 7);
-
-    const auto after_remove_get = preprocess_get(8);
-
-    ASSERT_FALSE(get_committed_data());
-
-    {
-        const auto responses = storage.processRequest(create_request, 0, 0, 1);
-        const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
-        ASSERT_EQ(create_response.error, Error::ZOK);
-    }
-
-    {
-        const auto responses = storage.processRequest(create_request, 0, 0, 2);
-        const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
-        ASSERT_EQ(create_response.error, Error::ZNODEEXISTS);
-    }
-
-    {
-        const auto responses = storage.processRequest(after_create_get, 0, 0, 3);
-        const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
-        ASSERT_EQ(get_response.error, Error::ZOK);
-        ASSERT_EQ(get_response.data, "initial_data");
-    }
-
-    ASSERT_EQ(get_committed_data(), "initial_data");
-
-    {
-        const auto responses = storage.processRequest(set_request, 0, 0, 4);
-        const auto & create_response = getSingleResponse<ZooKeeperSetResponse>(responses);
-        ASSERT_EQ(create_response.error, Error::ZOK);
-    }
-
-    {
-        const auto responses = storage.processRequest(after_set_get, 0, 0, 5);
-        const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
-        ASSERT_EQ(get_response.error, Error::ZOK);
-        ASSERT_EQ(get_response.data, "new_data");
-    }
-
-    ASSERT_EQ(get_committed_data(), "new_data");
-
-    {
-        const auto responses = storage.processRequest(remove_request, 0, 0, 6);
-        const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
-        ASSERT_EQ(create_response.error, Error::ZOK);
-    }
-
-    {
-        const auto responses = storage.processRequest(remove_request, 0, 0, 7);
-        const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
-        ASSERT_EQ(create_response.error, Error::ZNONODE);
-    }
-
-    {
-        const auto responses = storage.processRequest(after_remove_get, 0, 0, 8);
-        const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
-        ASSERT_EQ(get_response.error, Error::ZNONODE);
-    }
-
-    ASSERT_FALSE(get_committed_data());
-}
-

 INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
    CoordinationTest,
--- a/src/Core/ColumnNumbers.h
+++ b/src/Core/ColumnNumbers.h
@ -1,5 +1,6 @@
 #pragma once

+#include <unordered_set>
 #include <vector>


@ -7,6 +8,8 @@ namespace DB
 {

 using ColumnNumbers = std::vector<size_t>;
+using ColumnNumbersSet = std::unordered_set<size_t>;
 using ColumnNumbersList = std::vector<ColumnNumbers>;
+using ColumnNumbersSetList = std::vector<ColumnNumbersSet>;

 }
--- a/src/Core/Names.h
+++ b/src/Core/Names.h
@ -16,6 +16,7 @@ using NameOrderedSet = std::set<std::string>;
 using NameToNameMap = std::unordered_map<std::string, std::string>;
 using NameToNameSetMap = std::unordered_map<std::string, NameSet>;
 using NameToNameVector = std::vector<std::pair<std::string, std::string>>;
+using NameToIndexMap = std::unordered_map<std::string, size_t>;

 using NameWithAlias = std::pair<std::string, std::string>;
 using NamesWithAliases = std::vector<NameWithAlias>;
--- a/src/Core/NamesAndTypes.cpp
+++ b/src/Core/NamesAndTypes.cpp
@ -1,3 +1,4 @@
+#include <cstddef>
 #include <Core/NamesAndTypes.h>

 #include <base/sort.h>
@ -214,4 +215,17 @@ std::optional<NameAndTypePair> NamesAndTypesList::tryGetByName(const std::string
    }
    return {};
 }
+
+/// Linear scan: yields the zero-based position of the first column called `name`,
+/// or the total number of columns when no column carries that name.
+size_t NamesAndTypesList::getPosByName(const std::string & name) const noexcept
+{
+    size_t index = 0;
+    for (auto it = begin(); it != end(); ++it, ++index)
+        if (it->name == name)
+            return index;
+
+    return index;
+}
+
 }
--- a/src/Core/NamesAndTypes.h
+++ b/src/Core/NamesAndTypes.h
@ -105,8 +105,11 @@ public:
    /// Check that column contains in list
    bool contains(const String & name) const;

-    /// Try to get column by name, return empty optional if column not found
+    /// Try to get column by name, returns empty optional if column not found
    std::optional<NameAndTypePair> tryGetByName(const std::string & name) const;
+
+    /// Try to get column position by name, returns number of columns if column isn't found
+    size_t getPosByName(const std::string & name) const noexcept;
 };

 using NamesAndTypesLists = std::vector<NamesAndTypesList>;
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -86,6 +86,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
    M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \
    M(Bool, s3_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in s3 engine tables", 0) \
+    M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
    M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
    M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \
    M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \
@ -566,7 +567,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    \
    M(UInt64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \
    M(UInt64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \
-    M(Bool, enable_filesystem_cache, true, "Use cache for remote filesystem. This setting does not turn on/off cache for disks (must me done via disk config), but allows to bypass cache for some queries if intended", 0) \
+    M(Bool, enable_filesystem_cache, true, "Use cache for remote filesystem. This setting does not turn on/off cache for disks (must be done via disk config), but allows to bypass cache for some queries if intended", 0) \
    M(UInt64, filesystem_cache_max_wait_sec, 5, "Allow to wait at most this number of seconds for download of current remote_fs_buffer_size bytes, and skip cache if exceeded", 0) \
    M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \
    M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \
--- a/src/Core/SettingsFields.cpp
+++ b/src/Core/SettingsFields.cpp
@ -9,6 +9,8 @@
 #include <IO/WriteHelpers.h>
 #include <boost/algorithm/string/predicate.hpp>

+#include <cmath>
+

 namespace DB
 {
@ -16,6 +18,7 @@ namespace ErrorCodes
 {
    extern const int SIZE_OF_FIXED_STRING_DOESNT_MATCH;
    extern const int CANNOT_PARSE_BOOL;
+    extern const int CANNOT_PARSE_NUMBER;
 }


@ -176,27 +179,75 @@ UInt64 SettingFieldMaxThreads::getAuto()
    return getNumberOfPhysicalCPUCores();
 }

+namespace
+{
+    Poco::Timespan::TimeDiff float64AsSecondsToTimespan(Float64 d)
+    {
+        if (d != 0.0 && !std::isnormal(d))
+            throw Exception(
+                ErrorCodes::CANNOT_PARSE_NUMBER, "A setting's value in seconds must be a normal floating point number or zero. Got {}", d);
+        return static_cast<Poco::Timespan::TimeDiff>(d * 1000000);
+    }

-template <SettingFieldTimespanUnit unit_>
-SettingFieldTimespan<unit_>::SettingFieldTimespan(const Field & f) : SettingFieldTimespan(fieldToNumber<UInt64>(f))
+}
+
+template <>
+SettingFieldSeconds::SettingFieldTimespan(const Field & f) : SettingFieldTimespan(float64AsSecondsToTimespan(fieldToNumber<Float64>(f)))
 {
 }

-template <SettingFieldTimespanUnit unit_>
-SettingFieldTimespan<unit_> & SettingFieldTimespan<unit_>::operator=(const Field & f)
+template <>
+SettingFieldMilliseconds::SettingFieldTimespan(const Field & f) : SettingFieldTimespan(fieldToNumber<UInt64>(f))
+{
+}
+
+template <>
+SettingFieldTimespan<SettingFieldTimespanUnit::Second> & SettingFieldSeconds::operator=(const Field & f)
+{
+    *this = Poco::Timespan{float64AsSecondsToTimespan(fieldToNumber<Float64>(f))};
+    return *this;
+}
+
+template <>
+SettingFieldTimespan<SettingFieldTimespanUnit::Millisecond> & SettingFieldMilliseconds::operator=(const Field & f)
 {
    *this = fieldToNumber<UInt64>(f);
    return *this;
 }

-template <SettingFieldTimespanUnit unit_>
-String SettingFieldTimespan<unit_>::toString() const
+template <>
+String SettingFieldSeconds::toString() const
+{
+    return ::DB::toString(static_cast<Float64>(value.totalMicroseconds()) / microseconds_per_unit);
+}
+
+template <>
+String SettingFieldMilliseconds::toString() const
 {
    return ::DB::toString(operator UInt64());
 }

-template <SettingFieldTimespanUnit unit_>
-void SettingFieldTimespan<unit_>::parseFromString(const String & str)
+template <>
+SettingFieldSeconds::operator Field() const
+{
+    return static_cast<Float64>(value.totalMicroseconds()) / microseconds_per_unit;
+}
+
+template <>
+SettingFieldMilliseconds::operator Field() const
+{
+    return operator UInt64();
+}
+
+template <>
+void SettingFieldSeconds::parseFromString(const String & str)
+{
+    /// Convert via float64AsSecondsToTimespan (as the Field constructor and operator= do) so NaN/inf/subnormal input throws instead of hitting a raw float-to-integer cast.
+    *this = Poco::Timespan{float64AsSecondsToTimespan(parse<Float64>(str.data(), str.size()))};
+}
+
+template <>
+void SettingFieldMilliseconds::parseFromString(const String & str)
 {
    *this = stringToNumber<UInt64>(str);
 }
@ -204,6 +255,13 @@ void SettingFieldTimespan<unit_>::parseFromString(const String & str)
 template <SettingFieldTimespanUnit unit_>
 void SettingFieldTimespan<unit_>::writeBinary(WriteBuffer & out) const
 {
+    /// Note that this writes a UInt64 (for both seconds and milliseconds units) for compatibility reasons, as the value
+    /// for seconds used to be an integer (now a Float64).
+    /// This method is only used to communicate with clients or servers older than DBMS_MIN_REVISION_WITH_SETTINGS_SERIALIZED_AS_STRINGS,
+    /// in which the value was passed as binary (as a UInt64).
+    /// Later versions pass the setting values as String (using toString() and parseFromString()), and there passing "1.2" will
+    /// lead to `1` on releases with integer seconds or `1.2` on more recent releases.
+    /// See https://github.com/ClickHouse/ClickHouse/issues/36940 for more details.
    auto num_units = operator UInt64();
    writeVarUInt(num_units, out);
 }
--- a/src/Core/SettingsFields.h
+++ b/src/Core/SettingsFields.h
@ -124,7 +124,7 @@ struct SettingFieldTimespan
    operator std::chrono::duration<Rep, Period>() const { return std::chrono::duration_cast<std::chrono::duration<Rep, Period>>(std::chrono::microseconds(value.totalMicroseconds())); } /// NOLINT

    explicit operator UInt64() const { return value.totalMicroseconds() / microseconds_per_unit; }
-    explicit operator Field() const { return operator UInt64(); }
+    explicit operator Field() const;

    Poco::Timespan::TimeDiff totalMicroseconds() const { return value.totalMicroseconds(); }
    Poco::Timespan::TimeDiff totalMilliseconds() const { return value.totalMilliseconds(); }
--- a/src/Daemon/BaseDaemon.cpp
+++ b/src/Daemon/BaseDaemon.cpp
@ -68,6 +68,14 @@

 namespace fs = std::filesystem;

+namespace DB
+{
+    namespace ErrorCodes
+    {
+        extern const int CANNOT_SET_SIGNAL_HANDLER;
+    }
+}
+
 DB::PipeFDs signal_pipe;


@ -76,7 +84,8 @@ DB::PipeFDs signal_pipe;
  */
 static void call_default_signal_handler(int sig)
 {
-    signal(sig, SIG_DFL);
+    if (SIG_ERR == signal(sig, SIG_DFL))
+        DB::throwFromErrno("Cannot set signal handler.", DB::ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
    raise(sig);
 }

@ -498,9 +507,8 @@ BaseDaemon::~BaseDaemon()
    signal_listener_thread.join();
    /// Reset signals to SIG_DFL to avoid trying to write to the signal_pipe that will be closed after.
    for (int sig : handled_signals)
-    {
-        signal(sig, SIG_DFL);
-    }
+        if (SIG_ERR == signal(sig, SIG_DFL))
+            DB::throwFromErrno("Cannot set signal handler.", DB::ErrorCodes::CANNOT_SET_SIGNAL_HANDLER);
    signal_pipe.close();
 }

--- a/src/DataTypes/Serializations/SerializationNothing.h
+++ b/src/DataTypes/Serializations/SerializationNothing.h
@ -16,7 +16,7 @@ class SerializationNothing : public SimpleTextSerialization
 private:
    [[noreturn]] static void throwNoSerialization()
    {
-        throw Exception("Serialization is not implemented", ErrorCodes::NOT_IMPLEMENTED);
+        throw Exception("Serialization is not implemented for type Nothing", ErrorCodes::NOT_IMPLEMENTED);
    }
 public:
    void serializeBinary(const Field &, WriteBuffer &) const override                       { throwNoSerialization(); }
--- a/src/Databases/MySQL/DatabaseMySQL.cpp
+++ b/src/Databases/MySQL/DatabaseMySQL.cpp
@ -89,7 +89,7 @@ bool DatabaseMySQL::empty() const
        return true;

    for (const auto & [table_name, storage_info] : local_tables_cache)
-        if (!remove_or_detach_tables.count(table_name))
+        if (!remove_or_detach_tables.contains(table_name))
            return false;

    return true;
@ -103,7 +103,7 @@ DatabaseTablesIteratorPtr DatabaseMySQL::getTablesIterator(ContextPtr local_cont
    fetchTablesIntoLocalCache(local_context);

    for (const auto & [table_name, modify_time_and_storage] : local_tables_cache)
-        if (!remove_or_detach_tables.count(table_name) && (!filter_by_table_name || filter_by_table_name(table_name)))
+        if (!remove_or_detach_tables.contains(table_name) && (!filter_by_table_name || filter_by_table_name(table_name)))
            tables[table_name] = modify_time_and_storage.second;

    return std::make_unique<DatabaseTablesSnapshotIterator>(tables, database_name);
@ -120,7 +120,7 @@ StoragePtr DatabaseMySQL::tryGetTable(const String & mysql_table_name, ContextPt

    fetchTablesIntoLocalCache(local_context);

-    if (!remove_or_detach_tables.count(mysql_table_name) && local_tables_cache.find(mysql_table_name) != local_tables_cache.end())
+    if (!remove_or_detach_tables.contains(mysql_table_name) && local_tables_cache.find(mysql_table_name) != local_tables_cache.end())
        return local_tables_cache[mysql_table_name].second;

    return StoragePtr{};
@ -349,11 +349,11 @@ void DatabaseMySQL::attachTable(ContextPtr /* context_ */, const String & table_
 {
    std::lock_guard<std::mutex> lock{mutex};

-    if (!local_tables_cache.count(table_name))
+    if (!local_tables_cache.contains(table_name))
        throw Exception("Cannot attach table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) +
            " because it does not exist.", ErrorCodes::UNKNOWN_TABLE);

-    if (!remove_or_detach_tables.count(table_name))
+    if (!remove_or_detach_tables.contains(table_name))
        throw Exception("Cannot attach table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) +
            " because it already exists.", ErrorCodes::TABLE_ALREADY_EXISTS);

@ -372,11 +372,11 @@ StoragePtr DatabaseMySQL::detachTable(ContextPtr /* context */, const String & t
 {
    std::lock_guard<std::mutex> lock{mutex};

-    if (remove_or_detach_tables.count(table_name))
+    if (remove_or_detach_tables.contains(table_name))
        throw Exception("Table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) + " is dropped",
            ErrorCodes::TABLE_IS_DROPPED);

-    if (!local_tables_cache.count(table_name))
+    if (!local_tables_cache.contains(table_name))
        throw Exception("Table " + backQuoteIfNeed(database_name) + "." + backQuoteIfNeed(table_name) + " doesn't exist.",
            ErrorCodes::UNKNOWN_TABLE);

@ -412,7 +412,7 @@ void DatabaseMySQL::detachTablePermanently(ContextPtr, const String & table_name

    fs::path remove_flag = fs::path(getMetadataPath()) / (escapeForFileName(table_name) + suffix);

-    if (remove_or_detach_tables.count(table_name))
+    if (remove_or_detach_tables.contains(table_name))
        throw Exception(ErrorCodes::TABLE_IS_DROPPED, "Table {}.{} is dropped", backQuoteIfNeed(database_name), backQuoteIfNeed(table_name));

    if (fs::exists(remove_flag))
--- a/src/Dictionaries/DictionaryStructure.cpp
+++ b/src/Dictionaries/DictionaryStructure.cpp
@ -252,7 +252,7 @@ Strings DictionaryStructure::getKeysNames() const
 static void checkAttributeKeys(const Poco::Util::AbstractConfiguration::Keys & keys)
 {
    static const std::unordered_set<std::string_view> valid_keys
-        = {"name", "type", "expression", "null_value", "hierarchical", "injective", "is_object_id"};
+        = {"name", "type", "expression", "null_value", "hierarchical", "bidirectional", "injective", "is_object_id"};

    for (const auto & key : keys)
    {
@ -350,6 +350,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
        }

        const auto hierarchical = config.getBool(prefix + "hierarchical", false);
+        const auto bidirectional = config.getBool(prefix + "bidirectional", false);
        const auto injective = config.getBool(prefix + "injective", false);
        const auto is_object_id = config.getBool(prefix + "is_object_id", false);

@ -362,6 +363,9 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
        if (has_hierarchy && hierarchical)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only one hierarchical attribute supported");

+        if (bidirectional && !hierarchical)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bidirectional can only be applied to hierarchical attributes");
+
        has_hierarchy = has_hierarchy || hierarchical;

        res_attributes.emplace_back(DictionaryAttribute{
@ -372,6 +376,7 @@ std::vector<DictionaryAttribute> DictionaryStructure::getAttributes(
            expression,
            null_value,
            hierarchical,
+            bidirectional,
            injective,
            is_object_id,
            is_nullable});
--- a/src/Dictionaries/DictionaryStructure.h
+++ b/src/Dictionaries/DictionaryStructure.h
@ -67,6 +67,7 @@ struct DictionaryAttribute final
    const std::string expression;
    const Field null_value;
    const bool hierarchical;
+    const bool bidirectional;
    const bool injective;
    const bool is_object_id;
    const bool is_nullable;
--- a/src/Dictionaries/FlatDictionary.cpp
+++ b/src/Dictionaries/FlatDictionary.cpp
@ -43,6 +43,7 @@ FlatDictionary::FlatDictionary(
 {
    createAttributes();
    loadData();
+    buildHierarchyParentToChildIndexIfNeeded();
    calculateBytesAllocated();
 }

@ -244,30 +245,43 @@ ColumnUInt8::Ptr FlatDictionary::isInHierarchy(
    return result;
 }

-ColumnPtr FlatDictionary::getDescendants(
-    ColumnPtr key_column,
-    const DataTypePtr &,
-    size_t level) const
+DictionaryHierarchyParentToChildIndexPtr FlatDictionary::getHierarchicalIndex() const
 {
-    PaddedPODArray<UInt64> keys_backup;
-    const auto & keys = getColumnVectorData(this, key_column, keys_backup);
+    if (hierarhical_index)
+        return hierarhical_index;

    size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
    const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
    const ContainerType<UInt64> & parent_keys = std::get<ContainerType<UInt64>>(hierarchical_attribute.container);

    HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
+    parent_to_child.reserve(element_count);

-    for (size_t i = 0; i < parent_keys.size(); ++i)
+    UInt64 child_keys_size = static_cast<UInt64>(parent_keys.size());
+
+    for (UInt64 child_key = 0; child_key < child_keys_size; ++child_key)
    {
-        auto parent_key = parent_keys[i];
+        if (!loaded_keys[child_key])
+            continue;

-        if (loaded_keys[i])
-            parent_to_child[parent_key].emplace_back(static_cast<UInt64>(i));
+        auto parent_key = parent_keys[child_key];
+        parent_to_child[parent_key].emplace_back(child_key);
    }

+    return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
+}
+
+ColumnPtr FlatDictionary::getDescendants(
+    ColumnPtr key_column,
+    const DataTypePtr &,
+    size_t level,
+    DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const
+{
+    PaddedPODArray<UInt64> keys_backup;
+    const auto & keys = getColumnVectorData(this, key_column, keys_backup);
+
    size_t keys_found;
-    auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
+    auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);

    query_count.fetch_add(keys.size(), std::memory_order_relaxed);
    found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -400,6 +414,15 @@ void FlatDictionary::loadData()
        throw Exception(ErrorCodes::DICTIONARY_IS_EMPTY, "{}: dictionary source is empty and 'require_nonempty' property is set.", getFullName());
 }

+/// Builds and caches the parent-to-child index when the hierarchical attribute is marked bidirectional; otherwise does nothing.
+void FlatDictionary::buildHierarchyParentToChildIndexIfNeeded()
+{
+    const auto & hierarchical_position = dict_struct.hierarchical_attribute_index;
+    if (!hierarchical_position || !dict_struct.attributes[*hierarchical_position].bidirectional)
+        return;
+    hierarhical_index = getHierarchicalIndex();
+}
+
 void FlatDictionary::calculateBytesAllocated()
 {
    bytes_allocated += attributes.size() * sizeof(attributes.front());
@ -439,6 +462,12 @@ void FlatDictionary::calculateBytesAllocated()
    if (update_field_loaded_block)
        bytes_allocated += update_field_loaded_block->allocatedBytes();

+    if (hierarhical_index)
+    {
+        hierarchical_index_bytes_allocated = hierarhical_index->getSizeInBytes();
+        bytes_allocated += hierarchical_index_bytes_allocated;
+    }
+
    bytes_allocated += string_arena.size();
 }

@ -614,7 +643,7 @@ void registerDictionaryFlat(DictionaryFactory & factory)

        const auto dict_id = StorageID::fromDictionaryConfig(config, config_prefix);

-        return std::make_unique<FlatDictionary>(dict_id, dict_struct, std::move(source_ptr), std::move(configuration));
+        return std::make_unique<FlatDictionary>(dict_id, dict_struct, std::move(source_ptr), configuration);
    };

    factory.registerLayout("flat", create_layout, false);
--- a/src/Dictionaries/FlatDictionary.h
+++ b/src/Dictionaries/FlatDictionary.h
@ -92,10 +92,15 @@ public:
        ColumnPtr in_key_column,
        const DataTypePtr & key_type) const override;

+    DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
+
+    size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
+
    ColumnPtr getDescendants(
        ColumnPtr key_column,
        const DataTypePtr & key_type,
-        size_t level) const override;
+        size_t level,
+        DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;

    Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;

@ -137,10 +142,15 @@ private:
    };

    void createAttributes();
+
    void blockToAttributes(const Block & block);
+
    void updateData();
+
    void loadData();

+    void buildHierarchyParentToChildIndexIfNeeded();
+
    void calculateBytesAllocated();

    Attribute createAttribute(const DictionaryAttribute & attribute);
@ -165,6 +175,7 @@ private:
    std::vector<bool> loaded_keys;

    size_t bytes_allocated = 0;
+    size_t hierarchical_index_bytes_allocated = 0;
    size_t element_count = 0;
    size_t bucket_count = 0;
    mutable std::atomic<size_t> query_count{0};
@ -172,6 +183,7 @@ private:

    BlockPtr update_field_loaded_block;
    Arena string_arena;
+    DictionaryHierarchicalParentToChildIndexPtr hierarhical_index;
 };

 }
--- a/src/Dictionaries/HashedArrayDictionary.cpp
+++ b/src/Dictionaries/HashedArrayDictionary.cpp
@ -37,6 +37,7 @@ HashedArrayDictionary<dictionary_key_type>::HashedArrayDictionary(
 {
    createAttributes();
    loadData();
+    buildHierarchyParentToChildIndexIfNeeded();
    calculateBytesAllocated();
 }

@ -282,18 +283,14 @@ ColumnUInt8::Ptr HashedArrayDictionary<dictionary_key_type>::isInHierarchy(
 }

 template <DictionaryKeyType dictionary_key_type>
-ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
-    ColumnPtr key_column [[maybe_unused]],
-    const DataTypePtr &,
-    size_t level [[maybe_unused]]) const
+DictionaryHierarchicalParentToChildIndexPtr HashedArrayDictionary<dictionary_key_type>::getHierarchicalIndex() const
 {
    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
    {
-        PaddedPODArray<UInt64> keys_backup;
-        const auto & keys = getColumnVectorData(this, key_column, keys_backup);
+        if (hierarchical_index)
+            return hierarchical_index;

        size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
-
        const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
        const AttributeContainerType<UInt64> & parent_keys_container = std::get<AttributeContainerType<UInt64>>(hierarchical_attribute.container);

@ -306,6 +303,7 @@ ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
            index_to_key[value] = key;

        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
+        parent_to_child.reserve(index_to_key.size());

        for (size_t i = 0; i < parent_keys_container.size(); ++i)
        {
@ -313,13 +311,33 @@ ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
            if (it == index_to_key.end())
                continue;

-            auto parent_key = it->getMapped();
-            auto child_key = parent_keys_container[i];
+            auto child_key = it->getMapped();
+            auto parent_key = parent_keys_container[i];
            parent_to_child[parent_key].emplace_back(child_key);
        }

+        return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
+template <DictionaryKeyType dictionary_key_type>
+ColumnPtr HashedArrayDictionary<dictionary_key_type>::getDescendants(
+    ColumnPtr key_column [[maybe_unused]],
+    const DataTypePtr &,
+    size_t level [[maybe_unused]],
+    DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index [[maybe_unused]]) const
+{
+    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
+    {
+        PaddedPODArray<UInt64> keys_backup;
+        const auto & keys = getColumnVectorData(this, key_column, keys_backup);
+
        size_t keys_found = 0;
-        auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
+        auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);

        query_count.fetch_add(keys.size(), std::memory_order_relaxed);
        found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -693,6 +711,16 @@ void HashedArrayDictionary<dictionary_key_type>::loadData()
            getFullName());
 }

+/// Stores the parent-to-child hierarchy index in hierarchical_index when the dictionary
+/// structure has a hierarchical attribute that is marked as bidirectional.
+/// In every other case hierarchical_index is left untouched.
+template <DictionaryKeyType dictionary_key_type>
+void HashedArrayDictionary<dictionary_key_type>::buildHierarchyParentToChildIndexIfNeeded()
+{
+    /// Nothing to build when the structure declares no hierarchical attribute.
+    if (!dict_struct.hierarchical_attribute_index)
+        return;
+
+    /// Only a bidirectional hierarchical attribute gets its index kept inside the dictionary.
+    if (dict_struct.attributes[*dict_struct.hierarchical_attribute_index].bidirectional)
+        hierarchical_index = getHierarchicalIndex();
+}
+
 template <DictionaryKeyType dictionary_key_type>
 void HashedArrayDictionary<dictionary_key_type>::calculateBytesAllocated()
 {
@ -730,10 +758,16 @@ void HashedArrayDictionary<dictionary_key_type>::calculateBytesAllocated()
            bytes_allocated += (*attribute.is_index_null).size();
    }

-    bytes_allocated += string_arena.size();
-
    if (update_field_loaded_block)
        bytes_allocated += update_field_loaded_block->allocatedBytes();
+
+    if (hierarchical_index)
+    {
+        hierarchical_index_bytes_allocated = hierarchical_index->getSizeInBytes();
+        bytes_allocated += hierarchical_index_bytes_allocated;
+    }
+
+    bytes_allocated += string_arena.size();
 }

 template <DictionaryKeyType dictionary_key_type>
--- a/src/Dictionaries/HashedArrayDictionary.h
+++ b/src/Dictionaries/HashedArrayDictionary.h
@ -109,10 +109,15 @@ public:
        ColumnPtr in_key_column,
        const DataTypePtr & key_type) const override;

+    DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
+
+    size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
+
    ColumnPtr getDescendants(
        ColumnPtr key_column,
        const DataTypePtr & key_type,
-        size_t level) const override;
+        size_t level,
+        DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;

    Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;

@ -173,6 +178,8 @@ private:

    void loadData();

+    void buildHierarchyParentToChildIndexIfNeeded();
+
    void calculateBytesAllocated();

    template <typename KeysProvider>
@ -214,6 +221,7 @@ private:
    KeyAttribute key_attribute;

    size_t bytes_allocated = 0;
+    size_t hierarchical_index_bytes_allocated = 0;
    size_t element_count = 0;
    size_t bucket_count = 0;
    mutable std::atomic<size_t> query_count{0};
@ -221,6 +229,7 @@ private:

    BlockPtr update_field_loaded_block;
    Arena string_arena;
+    DictionaryHierarchicalParentToChildIndexPtr hierarchical_index;
 };

 extern template class HashedArrayDictionary<DictionaryKeyType::Simple>;
--- a/src/Dictionaries/HashedDictionary.cpp
+++ b/src/Dictionaries/HashedDictionary.cpp
@ -54,6 +54,7 @@ HashedDictionary<dictionary_key_type, sparse>::HashedDictionary(
 {
    createAttributes();
    loadData();
+    buildHierarchyParentToChildIndexIfNeeded();
    calculateBytesAllocated();
 }

@ -317,29 +318,46 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
        return nullptr;
 }

+/// Returns the parent-to-child hierarchy index for dictionaries with a simple key.
+/// The stored hierarchical_index is returned when it is already set; otherwise a new index
+/// is built from the container of the hierarchical attribute.
+/// Returns nullptr for any other dictionary key type.
+template <DictionaryKeyType dictionary_key_type, bool sparse>
+DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchicalIndex() const
+{
+    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
+    {
+        /// Reuse the index kept in the dictionary instead of rebuilding it.
+        if (hierarchical_index)
+            return hierarchical_index;
+
+        /// NOTE(review): hierarchical_attribute_index is dereferenced without a check here;
+        /// presumably callers guarantee that the dictionary is hierarchical - confirm.
+        size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
+        const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
+        const CollectionType<UInt64> & parent_keys = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
+
+        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
+        parent_to_child.reserve(parent_keys.size());
+
+        /// Each container entry is (key, value of the hierarchical attribute).
+        /// Grouping the keys by that value produces the parent -> children mapping.
+        for (const auto & [key, value] : parent_keys)
+            parent_to_child[value].emplace_back(key);
+
+        return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
+    }
+    else
+    {
+        return nullptr;
+    }
+}
+
 template <DictionaryKeyType dictionary_key_type, bool sparse>
 ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
    ColumnPtr key_column [[maybe_unused]],
    const DataTypePtr &,
-    size_t level [[maybe_unused]]) const
+    size_t level [[maybe_unused]],
+    DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index [[maybe_unused]]) const
 {
    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
    {
        PaddedPODArray<UInt64> keys_backup;
        const auto & keys = getColumnVectorData(this, key_column, keys_backup);

-        size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
-
-        const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
-        const CollectionType<UInt64> & parent_keys = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
-
-        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
-
-        for (const auto & [key, value] : parent_keys)
-            parent_to_child[value].emplace_back(key);
-
        size_t keys_found;
-        auto result = getKeysDescendantsArray(keys, parent_to_child, level, keys_found);
+        auto result = getKeysDescendantsArray(keys, *parent_to_child_index, level, keys_found);

        query_count.fetch_add(keys.size(), std::memory_order_relaxed);
        found_count.fetch_add(keys_found, std::memory_order_relaxed);
@ -347,7 +365,9 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
        return result;
    }
    else
+    {
        return nullptr;
+    }
 }

 template <DictionaryKeyType dictionary_key_type, bool sparse>
@ -631,6 +651,16 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()
            getFullName());
 }

+/// Stores the parent-to-child hierarchy index in hierarchical_index when the dictionary
+/// structure has a hierarchical attribute that is marked as bidirectional.
+/// The constructor calls it after loadData() and before calculateBytesAllocated().
+template <DictionaryKeyType dictionary_key_type, bool sparse>
+void HashedDictionary<dictionary_key_type, sparse>::buildHierarchyParentToChildIndexIfNeeded()
+{
+    /// Nothing to build when the structure declares no hierarchical attribute.
+    if (!dict_struct.hierarchical_attribute_index)
+        return;
+
+    /// Only a bidirectional hierarchical attribute gets its index kept inside the dictionary.
+    if (dict_struct.attributes[*dict_struct.hierarchical_attribute_index].bidirectional)
+        hierarchical_index = getHierarchicalIndex();
+}
+
 template <DictionaryKeyType dictionary_key_type, bool sparse>
 void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
 {
@ -684,10 +714,16 @@ void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
        }
    }

-    bytes_allocated += string_arena.size();
-
    if (update_field_loaded_block)
        bytes_allocated += update_field_loaded_block->allocatedBytes();
+
+    if (hierarchical_index)
+    {
+        hierarchical_index_bytes_allocated = hierarchical_index->getSizeInBytes();
+        bytes_allocated += hierarchical_index_bytes_allocated;
+    }
+
+    bytes_allocated += string_arena.size();
 }

 template <DictionaryKeyType dictionary_key_type, bool sparse>
--- a/src/Dictionaries/HashedDictionary.h
+++ b/src/Dictionaries/HashedDictionary.h
@ -110,10 +110,15 @@ public:
        ColumnPtr in_key_column,
        const DataTypePtr & key_type) const override;

+    DictionaryHierarchicalParentToChildIndexPtr getHierarchicalIndex() const override;
+
+    size_t getHierarchicalIndexBytesAllocated() const override { return hierarchical_index_bytes_allocated; }
+
    ColumnPtr getDescendants(
        ColumnPtr key_column,
        const DataTypePtr & key_type,
-        size_t level) const override;
+        size_t level,
+        DictionaryHierarchicalParentToChildIndexPtr parent_to_child_index) const override;

    Pipe read(const Names & column_names, size_t max_block_size, size_t num_streams) const override;

@ -194,6 +199,8 @@ private:

    void loadData();

+    void buildHierarchyParentToChildIndexIfNeeded();
+
    void calculateBytesAllocated();

    template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
@ -218,6 +225,7 @@ private:
    std::vector<Attribute> attributes;

    size_t bytes_allocated = 0;
+    size_t hierarchical_index_bytes_allocated = 0;
    size_t element_count = 0;
    size_t bucket_count = 0;
    mutable std::atomic<size_t> query_count{0};
@ -226,6 +234,7 @@ private:
    BlockPtr update_field_loaded_block;
    Arena string_arena;
    NoAttributesCollectionType no_attributes_container;
+    DictionaryHierarchicalParentToChildIndexPtr hierarchical_index;
 };

 extern template class HashedDictionary<DictionaryKeyType::Simple, false>;
--- a/src/Dictionaries/HierarchyDictionariesUtils.cpp
+++ b/src/Dictionaries/HierarchyDictionariesUtils.cpp
@ -8,6 +8,22 @@ namespace ErrorCodes
    extern const int UNSUPPORTED_METHOD;
 }

+namespace detail
+{
+    /// Converts ElementsAndOffsets structure into a ColumnArray.
+    /// The elements and offsets arrays are moved out of the argument into newly created
+    /// column vectors, which then become the data and offsets of the resulting array column.
+    ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets)
+    {
+        auto elements_column = ColumnVector<UInt64>::create();
+        elements_column->getData() = std::move(elements_and_offsets.elements);
+
+        auto offsets_column = ColumnVector<IColumn::Offset>::create();
+        offsets_column->getData() = std::move(elements_and_offsets.offsets);
+
+        auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
+
+        return column_array;
+    }
+}
+
 namespace
 {
    /** In case of cache or direct dictionary we do not have a structure with child to parent representation.
@ -84,6 +100,26 @@ namespace
    }
 }

+/// Returns descendants array column for requested keys.
+///
+/// @param requested_keys - keys to collect descendants for
+/// @param parent_to_child_index - index that is passed to detail::getDescendants for the lookup
+/// @param level - 0 selects GetAllDescendantsStrategy, any other value selects
+///                GetDescendantsAtSpecificLevelStrategy with that level
+/// @param valid_keys - output parameter that is filled by detail::getDescendants
+ColumnPtr getKeysDescendantsArray(
+    const PaddedPODArray<UInt64> & requested_keys,
+    const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
+    size_t level,
+    size_t & valid_keys)
+{
+    /// The strategy is a template argument of detail::getDescendants,
+    /// so each branch instantiates and calls its own specialization.
+    if (level == 0)
+    {
+        detail::GetAllDescendantsStrategy strategy { .level = level };
+        auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child_index, strategy, valid_keys);
+        return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
+    }
+    else
+    {
+        detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
+        auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child_index, strategy, valid_keys);
+        return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
+    }
+}
+
 ColumnPtr getKeysHierarchyDefaultImplementation(
    const IDictionary * dictionary,
    ColumnPtr key_column,
--- a/src/Dictionaries/HierarchyDictionariesUtils.h
+++ b/src/Dictionaries/HierarchyDictionariesUtils.h
@ -14,25 +14,65 @@
 namespace DB
 {

+class DictionaryHierarchicalParentToChildIndex;
+using DictionaryHierarchyParentToChildIndexPtr = std::shared_ptr<DictionaryHierarchicalParentToChildIndex>;
+
+/// Flattened parent-to-children index of a hierarchical dictionary.
+/// The children of all parents are stored one after another in a single keys array,
+/// and every parent key is mapped to the range of positions that its children occupy there.
+class DictionaryHierarchicalParentToChildIndex
+{
+public:
+    /// Half-open range [start_index, end_index) of positions inside the keys array.
+    struct KeysRange
+    {
+        UInt32 start_index;
+        UInt32 end_index;
+    };
+
+    /// Builds the flattened index from a map of parent key to array of child keys.
+    explicit DictionaryHierarchicalParentToChildIndex(const HashMap<UInt64, PaddedPODArray<UInt64>> & parent_to_children_map_)
+    {
+        size_t parent_to_children_map_size = parent_to_children_map_.size();
+
+        /// Both containers are reserved by the number of parents;
+        /// keys keeps growing while children are appended below.
+        keys.reserve(parent_to_children_map_size);
+        parent_to_children_keys_range.reserve(parent_to_children_map_size);
+
+        for (auto & [parent, children] : parent_to_children_map_)
+        {
+            /// Children of this parent are appended at the current end of keys.
+            /// NOTE(review): positions are narrowed to UInt32, which presumably assumes
+            /// that the total number of keys fits into 32 bits - confirm.
+            size_t keys_size = keys.size();
+            UInt32 start_index = static_cast<UInt32>(keys_size);
+            UInt32 end_index = start_index + static_cast<UInt32>(children.size());
+
+            keys.insert(children.begin(), children.end());
+
+            parent_to_children_keys_range[parent] = KeysRange{start_index, end_index};
+        }
+    }
+
+    /// Returns the buffer size of the ranges map plus keys.size() elements of UInt64.
+    size_t getSizeInBytes() const
+    {
+        return parent_to_children_keys_range.getBufferSizeInBytes() + (keys.size() * sizeof(UInt64));
+    }
+
+    /// Map parent key to range of children from keys array
+    HashMap<UInt64, KeysRange> parent_to_children_keys_range;
+
+    /// Array of keys in hierarchy
+    PaddedPODArray<UInt64> keys;
+};
+
 namespace detail
 {
-    template <typename KeyType>
    struct ElementsAndOffsets
    {
-        PaddedPODArray<KeyType> elements;
+        PaddedPODArray<UInt64> elements;
        PaddedPODArray<IColumn::Offset> offsets;
    };

-    template <typename T>
    struct IsKeyValidFuncInterface
    {
-        bool operator()(T key [[maybe_unused]]) { return false; }
+        bool operator()(UInt64 key [[maybe_unused]]) { return false; }
    };

-    template <typename T>
    struct GetParentKeyFuncInterface
    {
-        std::optional<T> operator()(T key [[maybe_unused]]) { return {}; }
+        std::optional<UInt64> operator()(UInt64 key [[maybe_unused]]) { return {}; }
    };

    /** Calculate hierarchy for keys iterating the hierarchy from child to parent using get_parent_key_func provided by client.
@ -54,16 +94,16 @@ namespace detail
      * Elements: [1, 2, 1, 3, 1, 4, 2, 1]
      * Offsets: [1, 3, 5, 8, 8]
      */
-    template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
-    ElementsAndOffsets<KeyType> getHierarchy(
-        const PaddedPODArray<KeyType> & keys,
-        const KeyType & hierarchy_null_value,
+    template <typename IsKeyValidFunc, typename GetParentKeyFunc>
+    ElementsAndOffsets getHierarchy(
+        const PaddedPODArray<UInt64> & keys,
+        const UInt64 & hierarchy_null_value,
        IsKeyValidFunc && is_key_valid_func,
        GetParentKeyFunc && get_parent_key_func)
    {
        size_t hierarchy_keys_size = keys.size();

-        PaddedPODArray<KeyType> elements;
+        PaddedPODArray<UInt64> elements;
        elements.reserve(hierarchy_keys_size);

        PaddedPODArray<IColumn::Offset> offsets;
@ -75,7 +115,7 @@ namespace detail
            size_t array_element_offset;
        };

-        HashMap<KeyType, OffsetInArray> already_processes_keys_to_offset;
+        HashMap<UInt64, OffsetInArray> already_processes_keys_to_offset;
        already_processes_keys_to_offset.reserve(hierarchy_keys_size);

        for (size_t i = 0; i < hierarchy_keys_size; ++i)
@ -123,7 +163,7 @@ namespace detail
                elements.emplace_back(hierarchy_key);
                ++current_hierarchy_depth;

-                std::optional<KeyType> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);
+                std::optional<UInt64> parent_key = std::forward<GetParentKeyFunc>(get_parent_key_func)(hierarchy_key);

                if (!parent_key.has_value())
                    break;
@ -134,7 +174,7 @@ namespace detail
            offsets.emplace_back(elements.size());
        }

-        ElementsAndOffsets<KeyType> result = {std::move(elements), std::move(offsets)};
+        ElementsAndOffsets result = {std::move(elements), std::move(offsets)};

        return result;
    }
@ -146,11 +186,11 @@ namespace detail
      *
      * Note: keys size must be equal to in_keys_size.
      */
-    template <typename KeyType, typename IsKeyValidFunc, typename GetParentKeyFunc>
+    template <typename IsKeyValidFunc, typename GetParentKeyFunc>
    PaddedPODArray<UInt8> getIsInHierarchy(
-        const PaddedPODArray<KeyType> & keys,
-        const PaddedPODArray<KeyType> & in_keys,
-        const KeyType & hierarchy_null_value,
+        const PaddedPODArray<UInt64> & keys,
+        const PaddedPODArray<UInt64> & in_keys,
+        const UInt64 & hierarchy_null_value,
        IsKeyValidFunc && is_key_valid_func,
        GetParentKeyFunc && get_parent_func)
    {
@ -159,7 +199,7 @@ namespace detail
        PaddedPODArray<UInt8> result;
        result.resize_fill(keys.size());

-        detail::ElementsAndOffsets<KeyType> hierarchy = detail::getHierarchy(
+        detail::ElementsAndOffsets hierarchy = detail::getHierarchy(
            keys,
            hierarchy_null_value,
            std::forward<IsKeyValidFunc>(is_key_valid_func),
@ -216,19 +256,22 @@ namespace detail
      * Result: [1], [2, 3], [4], [], [];
      * Offsets: [1, 3, 4, 4, 4];
      */
-    template <typename KeyType, typename Strategy>
-    ElementsAndOffsets<KeyType> getDescendants(
-        const PaddedPODArray<KeyType> & keys,
-        const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
+    template <typename Strategy>
+    ElementsAndOffsets getDescendants(
+        const PaddedPODArray<UInt64> & keys,
+        const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
        Strategy strategy,
        size_t & valid_keys)
    {
+        auto & parent_to_children_keys_range = parent_to_child_index.parent_to_children_keys_range;
+        auto & children_keys = parent_to_child_index.keys;
+
        /// If strategy is GetAllDescendantsStrategy we try to cache and later reuse previously calculated descendants.
        /// If strategy is GetDescendantsAtSpecificLevelStrategy we do not use the cache.
        size_t keys_size = keys.size();
        valid_keys = 0;

-        PaddedPODArray<KeyType> descendants;
+        PaddedPODArray<UInt64> descendants;
        descendants.reserve(keys_size);

        PaddedPODArray<IColumn::Offset> descendants_offsets;
@ -241,18 +284,18 @@ namespace detail
        };

        static constexpr Int64 key_range_requires_update = -1;
-        HashMap<KeyType, Range> already_processed_keys_to_range [[maybe_unused]];
+        HashMap<UInt64, Range> already_processed_keys_to_range [[maybe_unused]];

        if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
            already_processed_keys_to_range.reserve(keys_size);

        struct KeyAndDepth
        {
-            KeyType key;
+            UInt64 key;
            Int64 depth;
        };

-        HashSet<KeyType> already_processed_keys_during_loop;
+        HashSet<UInt64> already_processed_keys_during_loop;
        already_processed_keys_during_loop.reserve(keys_size);

        PaddedPODArray<KeyAndDepth> next_keys_to_process_stack;
@ -262,9 +305,9 @@ namespace detail

        for (size_t i = 0; i < keys_size; ++i)
        {
-            const KeyType & requested_key = keys[i];
+            const UInt64 & requested_key = keys[i];

-            if (parent_to_child.find(requested_key) == nullptr)
+            if (parent_to_children_keys_range.find(requested_key) == nullptr)
            {
                descendants_offsets.emplace_back(descendants.size());
                continue;
@ -282,7 +325,7 @@ namespace detail
            {
                KeyAndDepth key_to_process = next_keys_to_process_stack.back();

-                KeyType key = key_to_process.key;
+                UInt64 key = key_to_process.key;
                Int64 depth = key_to_process.depth;
                next_keys_to_process_stack.pop_back();

@ -329,7 +372,7 @@ namespace detail
                    }
                }

-                const auto * it = parent_to_child.find(key);
+                const auto * it = parent_to_children_keys_range.find(key);

                if (!it || depth >= DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH)
                    continue;
@ -352,15 +395,26 @@ namespace detail

                ++depth;

-                const auto & children = it->getMapped();
+                DictionaryHierarchicalParentToChildIndex::KeysRange children_range = it->getMapped();

-                for (auto child_key : children)
+                for (; children_range.start_index < children_range.end_index; ++children_range.start_index)
                {
+                    auto child_key = children_keys[children_range.start_index];
+
                    /// In case of GetAllDescendantsStrategy we add any descendant to result array
                    /// If strategy is GetDescendantsAtSpecificLevelStrategy we require depth == level
-                    if (std::is_same_v<Strategy, GetAllDescendantsStrategy> || depth == level)
+                    if constexpr (std::is_same_v<Strategy, GetAllDescendantsStrategy>)
                        descendants.emplace_back(child_key);

+                    if constexpr (std::is_same_v<Strategy, GetDescendantsAtSpecificLevelStrategy>)
+                    {
+                        if (depth == level)
+                        {
+                            descendants.emplace_back(child_key);
+                            continue;
+                        }
+                    }
+
                    next_keys_to_process_stack.emplace_back(KeyAndDepth{child_key, depth});
                }
            }
@ -370,24 +424,12 @@ namespace detail
            descendants_offsets.emplace_back(descendants.size());
        }

-        ElementsAndOffsets<KeyType> result = {std::move(descendants), std::move(descendants_offsets)};
+        ElementsAndOffsets result = {std::move(descendants), std::move(descendants_offsets)};
        return result;
    }

    /// Converts ElementsAndOffsets structure into ColumnArray
-    template<typename KeyType>
-    ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets<KeyType> && elements_and_offsets)
-    {
-        auto elements_column = ColumnVector<KeyType>::create();
-        elements_column->getData() = std::move(elements_and_offsets.elements);
-
-        auto offsets_column = ColumnVector<IColumn::Offset>::create();
-        offsets_column->getData() = std::move(elements_and_offsets.offsets);
-
-        auto column_array = ColumnArray::create(std::move(elements_column), std::move(offsets_column));
-
-        return column_array;
-    }
+    ColumnPtr convertElementsAndOffsetsIntoArray(ElementsAndOffsets && elements_and_offsets);
 }

 /// Returns hierarchy array column for keys
@ -432,26 +474,11 @@ ColumnUInt8::Ptr getKeysIsInHierarchyColumn(
 /// Returns descendants array column for keys
 ///
 /// @param valid_keys - number of keys that are valid in parent_to_child_index
-template <typename KeyType>
 ColumnPtr getKeysDescendantsArray(
-    const PaddedPODArray<KeyType> & requested_keys,
-    const HashMap<KeyType, PaddedPODArray<KeyType>> & parent_to_child,
+    const PaddedPODArray<UInt64> & requested_keys,
+    const DictionaryHierarchicalParentToChildIndex & parent_to_child_index,
    size_t level,
-    size_t & valid_keys)
-{
-    if (level == 0)
-    {
-        detail::GetAllDescendantsStrategy strategy { .level = level };
-        auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy, valid_keys);
-        return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
-    }
-    else
-    {
-        detail::GetDescendantsAtSpecificLevelStrategy strategy { .level = level };
-        auto elements_and_offsets = detail::getDescendants(requested_keys, parent_to_child, strategy, valid_keys);
-        return detail::convertElementsAndOffsetsIntoArray(std::move(elements_and_offsets));
-    }
-}
+    size_t & valid_keys);

 /** Default getHierarchy implementation for dictionaries that do not have a structure with child to parent representation.
  * Implementation will build such structure with getColumn calls, and then getHierarchy for such structure.
--- a/Show More
+++ b/Show More