mirror of https://github.com/ClickHouse/ClickHouse.git

Merge branch 'master' into fix-order

commit 1e43e26fa1
.github/workflows/master.yml (vendored, 38 changed lines)

@@ -149,7 +149,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH"
   SplitBuildSmokeTest:
     needs: [BuilderDebSplitted]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, style-checker]
     steps:
       - name: Set envs
@@ -316,7 +315,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinRelease:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -362,7 +360,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinGCC:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -636,7 +633,6 @@ jobs:
   ##########################################################################################
   BuilderDebSplitted:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -682,7 +678,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinTidy:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -728,7 +723,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinDarwin:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -774,7 +768,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinAarch64:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -820,7 +813,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinFreeBSD:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -866,7 +858,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinDarwinAarch64:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -912,7 +903,6 @@ jobs:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
   BuilderBinPPC64:
     needs: [DockerHubPush]
-    if: ${{ !contains(github.event.pull_request.labels.*.name, 'pr-documentation') && !contains(github.event.pull_request.labels.*.name, 'pr-doc-fix') }}
     runs-on: [self-hosted, builder]
     steps:
       - name: Set envs
@@ -957,6 +947,34 @@ jobs:
           docker rm -f "$(docker ps -a -q)" ||:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  ############################################################################################
+  ##################################### Docker images #######################################
+  ############################################################################################
+  DockerServerImages:
+    needs:
+      - BuilderDebRelease
+      - BuilderDebAarch64
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no version info
+      - name: Check docker clickhouse/clickhouse-server building
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 docker_server.py --release-type head
+          python3 docker_server.py --release-type head --no-ubuntu \
+            --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
+      - name: Cleanup
+        if: always()
+        run: |
+          docker kill "$(docker ps -q)" ||:
+          docker rm -f "$(docker ps -a -q)" ||:
+          sudo rm -fr "$TEMP_PATH"
   ############################################################################################
   ##################################### BUILD REPORTER #######################################
   ############################################################################################
   BuilderReport:
.github/workflows/pull_request.yml (vendored, 31 changed lines)

@@ -4,7 +4,7 @@ env:
   # Force the stdout and stderr streams to be unbuffered
   PYTHONUNBUFFERED: 1

-on: # yamllint disable-line rule:truthy
+on: # yamllint disable-line rule:truthy
   pull_request:
     types:
       - synchronize
@@ -998,6 +998,34 @@ jobs:
           docker rm -f "$(docker ps -a -q)" ||:
           sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  ############################################################################################
+  ##################################### Docker images #######################################
+  ############################################################################################
+  DockerServerImages:
+    needs:
+      - BuilderDebRelease
+      - BuilderDebAarch64
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no version info
+      - name: Check docker clickhouse/clickhouse-server building
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 docker_server.py --release-type head --no-push
+          python3 docker_server.py --release-type head --no-push --no-ubuntu \
+            --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
+      - name: Cleanup
+        if: always()
+        run: |
+          docker kill "$(docker ps -q)" ||:
+          docker rm -f "$(docker ps -a -q)" ||:
+          sudo rm -fr "$TEMP_PATH"
   ############################################################################################
   ##################################### BUILD REPORTER #######################################
   ############################################################################################
   BuilderReport:
@@ -3138,6 +3166,7 @@
     needs:
       - StyleCheck
       - DockerHubPush
+      - DockerServerImages
       - CheckLabels
       - BuilderReport
       - FastTest
.github/workflows/release.yml (vendored, 25 changed lines)

@@ -36,3 +36,28 @@ jobs:
           overwrite: true
           tag: ${{ github.ref }}
           file_glob: true
+  ############################################################################################
+  ##################################### Docker images #######################################
+  ############################################################################################
+  DockerServerImages:
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no version info
+      - name: Check docker clickhouse/clickhouse-server building
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 docker_server.py --release-type auto
+          python3 docker_server.py --release-type auto --no-ubuntu \
+            --image-repo clickhouse/clickhouse-keeper --image-path docker/keeper
+      - name: Cleanup
+        if: always()
+        run: |
+          docker kill "$(docker ps -q)" ||:
+          docker rm -f "$(docker ps -a -q)" ||:
+          sudo rm -fr "$TEMP_PATH"
CMakeLists.txt

@@ -261,8 +261,8 @@ endif ()
 # Add a section with the hash of the compiled machine code for integrity checks.
 # Only for official builds, because adding a section can be time consuming (rewrite of several GB).
 # And cross compiled binaries are not supported (since you cannot execute clickhouse hash-binary)
-if (OBJCOPY_PATH AND CLICKHOUSE_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE))
-    set (USE_BINARY_HASH 1)
+if (OBJCOPY_PATH AND CLICKHOUSE_OFFICIAL_BUILD AND (NOT CMAKE_TOOLCHAIN_FILE OR CMAKE_TOOLCHAIN_FILE MATCHES "linux/toolchain-x86_64.cmake$"))
+    set (USE_BINARY_HASH 1 CACHE STRING "Calculate binary hash and store it in the separate section")
 endif ()

 # Allows to build stripped binary in a separate directory
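For context, the integrity check that this option enables works roughly as follows; this is a hedged sketch, and the section name and exact invocation below are assumptions for illustration, not taken from this diff:

```bash
# Hedged sketch: embed the binary's own hash as an extra ELF section.
# Section name and argument handling are illustrative assumptions.
hash=$(./clickhouse hash-binary)   # prints a hash of the binary's machine code
echo "$hash" > clickhouse.hash
objcopy --add-section .clickhouse.hash=clickhouse.hash clickhouse
```

This also explains the cross-compilation caveat in the comments: the freshly built `clickhouse` binary must be executable on the build host to compute its own hash.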
base/base/CMakeLists.txt

@@ -2,6 +2,7 @@ set (SRCS
     argsToConfig.cpp
     coverage.cpp
     demangle.cpp
+    getAvailableMemoryAmount.cpp
     getFQDNOrHostName.cpp
     getMemoryAmount.cpp
     getPageSize.cpp
base/base/getAvailableMemoryAmount.cpp (new file, 44 lines)

@@ -0,0 +1,44 @@
#include <stdexcept>
#include <fstream>
#include <base/getAvailableMemoryAmount.h>
#include <base/getPageSize.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/param.h>
#if defined(BSD)
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#endif


uint64_t getAvailableMemoryAmountOrZero()
{
#if defined(_SC_AVPHYS_PAGES) // linux
    return getPageSize() * sysconf(_SC_AVPHYS_PAGES);
#elif defined(__FreeBSD__)
    struct vmtotal vmt;
    size_t vmt_size = sizeof(vmt);
    if (sysctlbyname("vm.vmtotal", &vmt, &vmt_size, NULL, 0) == 0)
        return getPageSize() * vmt.t_avm;
    else
        return 0;
#else // darwin
    unsigned int usermem;
    size_t len = sizeof(usermem);
    static int mib[2] = { CTL_HW, HW_USERMEM };
    if (sysctl(mib, 2, &usermem, &len, nullptr, 0) == 0 && len == sizeof(usermem))
        return usermem;
    else
        return 0;
#endif
}


uint64_t getAvailableMemoryAmount()
{
    auto res = getAvailableMemoryAmountOrZero();
    if (!res)
        throw std::runtime_error("Cannot determine available memory amount");
    return res;
}
base/base/getAvailableMemoryAmount.h (new file, 12 lines)

@@ -0,0 +1,12 @@
#pragma once

#include <cstdint>

/** Returns the size of currently available physical memory (RAM) in bytes.
  * Returns 0 on unsupported platform or if it cannot determine the size of physical memory.
  */
uint64_t getAvailableMemoryAmountOrZero();

/** Throws exception if it cannot determine the size of physical memory.
  */
uint64_t getAvailableMemoryAmount();
docker/keeper/Dockerfile (new file, 72 lines)

@@ -0,0 +1,72 @@
FROM ubuntu:20.04 AS glibc-donor

ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \
    && case $arch in \
        amd64) rarch=x86_64 ;; \
        arm64) rarch=aarch64 ;; \
    esac \
    && ln -s "${rarch}-linux-gnu" /lib/linux-gnu


FROM alpine

ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8 \
    TZ=UTC \
    CLICKHOUSE_CONFIG=/etc/clickhouse-server/config.xml

COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/
COPY --from=glibc-donor /etc/nsswitch.conf /etc/
COPY entrypoint.sh /entrypoint.sh
RUN arch=${TARGETARCH:-amd64} \
    && case $arch in \
        amd64) mkdir -p /lib64 && ln -sf /lib/ld-2.31.so /lib64/ld-linux-x86-64.so.2 ;; \
        arm64) ln -sf /lib/ld-2.31.so /lib/ld-linux-aarch64.so.1 ;; \
    esac

ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release"
ARG VERSION="22.4.1.917"
ARG PACKAGES="clickhouse-keeper"

# user/group precreated explicitly with fixed uid/gid on purpose.
# It is especially important for rootless containers: in that case entrypoint
# can't do chown and owners of mounted volumes should be configured externally.
# We do that in advance at the beginning of the Dockerfile before any packages are
# installed to prevent picking those uid / gid by some unrelated software.
# The same uid / gid (101) is used both for alpine and ubuntu.


ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \
    && for package in ${PACKAGES}; do \
        { \
            { echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
                && wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" -O "/tmp/${package}-${VERSION}-${arch}.tgz" \
                && tar xvzf "/tmp/${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / ; \
            } || \
            { echo "Fallback to ${REPOSITORY}/${package}-${VERSION}.tgz" \
                && wget -c -q "${REPOSITORY}/${package}-${VERSION}.tgz" -O "/tmp/${package}-${VERSION}.tgz" \
                && tar xvzf "/tmp/${package}-${VERSION}.tgz" --strip-components=2 -C / ; \
            } ; \
        } || exit 1 \
    ; done \
    && rm /tmp/*.tgz /install -r \
    && addgroup -S -g 101 clickhouse \
    && adduser -S -h /var/lib/clickhouse -s /bin/bash -G clickhouse -g "ClickHouse keeper" -u 101 clickhouse \
    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-keeper /etc/clickhouse-keeper \
    && chown clickhouse:clickhouse /var/lib/clickhouse \
    && chown root:clickhouse /var/log/clickhouse-keeper \
    && chmod +x /entrypoint.sh \
    && apk add --no-cache su-exec bash tzdata \
    && cp /usr/share/zoneinfo/UTC /etc/localtime \
    && echo "UTC" > /etc/timezone \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-keeper /etc/clickhouse-keeper


EXPOSE 2181 10181 44444

VOLUME /var/lib/clickhouse /var/log/clickhouse-keeper /etc/clickhouse-keeper

ENTRYPOINT ["/entrypoint.sh"]
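Because this Dockerfile keys on the Buildx-provided `TARGETARCH` build argument, it is presumably meant to be built multi-arch; a hypothetical invocation (tag and platform list are illustrative, not from the commit) would be:

```bash
# Build the keeper image for both architectures handled by the case statements above.
docker buildx build \
    --platform linux/amd64,linux/arm64 \
    -t clickhouse/clickhouse-keeper:head \
    docker/keeper
```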
docker/keeper/Dockerfile.alpine (new symbolic link)

@@ -0,0 +1 @@
Dockerfile
docker/keeper/entrypoint.sh (new file, 93 lines)

@@ -0,0 +1,93 @@
#!/bin/bash

set +x
set -eo pipefail
shopt -s nullglob

DO_CHOWN=1
if [ "${CLICKHOUSE_DO_NOT_CHOWN:-0}" = "1" ]; then
    DO_CHOWN=0
fi

CLICKHOUSE_UID="${CLICKHOUSE_UID:-"$(id -u clickhouse)"}"
CLICKHOUSE_GID="${CLICKHOUSE_GID:-"$(id -g clickhouse)"}"

# support --user
if [ "$(id -u)" = "0" ]; then
    USER=$CLICKHOUSE_UID
    GROUP=$CLICKHOUSE_GID
    if command -v gosu &> /dev/null; then
        gosu="gosu $USER:$GROUP"
    elif command -v su-exec &> /dev/null; then
        gosu="su-exec $USER:$GROUP"
    else
        echo "No gosu/su-exec detected!"
        exit 1
    fi
else
    USER="$(id -u)"
    GROUP="$(id -g)"
    gosu=""
    DO_CHOWN=0
fi

KEEPER_CONFIG="${KEEPER_CONFIG:-/etc/clickhouse-keeper/config.yaml}"

if [ -f "$KEEPER_CONFIG" ] && ! $gosu test -f "$KEEPER_CONFIG" -a -r "$KEEPER_CONFIG"; then
    echo "Configuration file '$KEEPER_CONFIG' isn't readable by user with id '$USER'"
    exit 1
fi

DATA_DIR="${CLICKHOUSE_DATA_DIR:-/var/lib/clickhouse}"
LOG_DIR="${LOG_DIR:-/var/log/clickhouse-keeper}"
LOG_PATH="${LOG_DIR}/clickhouse-keeper.log"
ERROR_LOG_PATH="${LOG_DIR}/clickhouse-keeper.err.log"
COORDINATION_LOG_DIR="${DATA_DIR}/coordination/log"
COORDINATION_SNAPSHOT_DIR="${DATA_DIR}/coordination/snapshots"
CLICKHOUSE_WATCHDOG_ENABLE=${CLICKHOUSE_WATCHDOG_ENABLE:-0}

for dir in "$DATA_DIR" \
    "$LOG_DIR" \
    "$TMP_DIR" \
    "$COORDINATION_LOG_DIR" \
    "$COORDINATION_SNAPSHOT_DIR"
do
    # check that the variable is not empty
    [ -z "$dir" ] && continue
    # ensure directories exist
    if ! mkdir -p "$dir"; then
        echo "Couldn't create necessary directory: $dir"
        exit 1
    fi

    if [ "$DO_CHOWN" = "1" ]; then
        # ensure proper directory permissions,
        # but skip it if the directory already has proper permissions, because recursive chown may be slow
        if [ "$(stat -c %u "$dir")" != "$USER" ] || [ "$(stat -c %g "$dir")" != "$GROUP" ]; then
            chown -R "$USER:$GROUP" "$dir"
        fi
    elif ! $gosu test -d "$dir" -a -w "$dir" -a -r "$dir"; then
        echo "Necessary directory '$dir' isn't accessible by user with id '$USER'"
        exit 1
    fi
done

# if no args are passed to `docker run` or the first argument starts with `--`, the user is passing clickhouse-keeper arguments
if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then
    # Watchdog is launched by default, but does not send SIGINT to the main process,
    # so the container can't be finished by ctrl+c
    export CLICKHOUSE_WATCHDOG_ENABLE

    cd /var/lib/clickhouse

    # There is a config file. It is already tested with gosu (if it is readable by the keeper user)
    if [ -f "$KEEPER_CONFIG" ]; then
        exec $gosu /usr/bin/clickhouse-keeper --config-file="$KEEPER_CONFIG" --log-file="$LOG_PATH" --errorlog-file="$ERROR_LOG_PATH" "$@"
    fi

    # There is no config file. Will use the embedded one
    exec $gosu /usr/bin/clickhouse-keeper --log-file="$LOG_PATH" --errorlog-file="$ERROR_LOG_PATH" "$@"
fi

# Otherwise, we assume the user wants to run their own process, for example a `bash` shell to explore this image
exec "$@"
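Two hedged examples of how this entrypoint behaves at `docker run` time (the image tag is illustrative, not from the commit):

```bash
# Started as root: the entrypoint chowns the data/log dirs (unless they already
# have the right owner) and drops to the precreated clickhouse user (uid/gid 101)
# via su-exec.
docker run -d clickhouse/clickhouse-keeper:head

# Started as an arbitrary non-root user: DO_CHOWN is forced to 0 and su-exec is
# skipped; the directories must already be readable and writable by that uid.
docker run -d --user 1000:1000 clickhouse/clickhouse-keeper:head
```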
docker/server/.gitignore (vendored, deleted, 2 lines)

@@ -1,2 +0,0 @@
alpine-root/*
tgz-packages/*
docker/server/Dockerfile (deleted, 122 lines)

@@ -1,122 +0,0 @@
FROM ubuntu:20.04

# ARG for quick switch to a given ubuntu mirror
ARG apt_archive="http://archive.ubuntu.com"
RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list

ARG repository="deb https://packages.clickhouse.com/deb stable main"
ARG version=22.1.1.*

# set non-empty deb_location_url url to create a docker image
# from debs created by CI build, for example:
# docker build . --network host --build-arg version="21.4.1.6282" --build-arg deb_location_url="https://clickhouse-builds.s3.yandex.net/21852/069cfbff388b3d478d1a16dc7060b48073f5d522/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/" -t filimonovq/clickhouse-server:pr21852
ARG deb_location_url=""

# set non-empty single_binary_location_url to create docker image
# from a single binary url (useful for non-standard builds - with sanitizers, for arm64).
# for example (run on aarch64 server):
# docker build . --network host --build-arg single_binary_location_url="https://builds.clickhouse.com/master/aarch64/clickhouse" -t altinity/clickhouse-server:master-testing-arm
# note: clickhouse-odbc-bridge is not supported there.
ARG single_binary_location_url=""

# see https://github.com/moby/moby/issues/4032#issuecomment-192327844
ARG DEBIAN_FRONTEND=noninteractive

# user/group precreated explicitly with fixed uid/gid on purpose.
# It is especially important for rootless containers: in that case entrypoint
# can't do chown and owners of mounted volumes should be configured externally.
# We do that in advance at the beginning of the Dockerfile before any packages are
# installed to prevent picking those uid / gid by some unrelated software.
# The same uid / gid (101) is used both for alpine and ubuntu.

# To drop privileges, we need 'su' command, that simply changes uid and gid.
# In fact, the 'su' command from Linux is not so simple, due to inherent vulnerability in Linux:
# https://ruderich.org/simon/notes/su-sudo-from-root-tty-hijacking
# It has to mitigate this drawback of Linux, and to do this, the 'su' command is creating its own pseudo-terminal
# and forwarding commands. Due to some ridiculous circumstances, it does not work in Docker (or it does)
# and for these reasons people are using alternatives to the 'su' command in Docker,
# that don't mess with the terminal, don't care about closing the opened files, etc...
# but can only be safe to drop privileges inside Docker.
# The question is what implementation of the 'su' command to use.
# It should be a simple script doing just about two syscalls.
# Some people tend to use the 'gosu' tool that is written in Go.
# It is not used for several reasons:
# 1. Dependency on some foreign code in yet another programming language - does not sound alright.
# 2. Anselmo D. Adams suggested not to use it due to false positive alarms in some undisclosed security scanners.

COPY su-exec.c /su-exec.c

RUN groupadd -r clickhouse --gid=101 \
    && useradd -r -g clickhouse --uid=101 --home-dir=/var/lib/clickhouse --shell=/bin/bash clickhouse \
    && apt-get update \
    && apt-get install --yes --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        dirmngr \
        gnupg \
        locales \
        wget \
        tzdata \
    && mkdir -p /etc/apt/sources.list.d \
    && apt-key adv --keyserver keyserver.ubuntu.com --recv 8919F6BD2B48D754 \
    && echo $repository > /etc/apt/sources.list.d/clickhouse.list \
    && if [ -n "$deb_location_url" ]; then \
        echo "installing from custom url with deb packages: $deb_location_url" \
        rm -rf /tmp/clickhouse_debs \
        && mkdir -p /tmp/clickhouse_debs \
        && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-common-static_${version}_amd64.deb" -P /tmp/clickhouse_debs \
        && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-client_${version}_all.deb" -P /tmp/clickhouse_debs \
        && wget --progress=bar:force:noscroll "${deb_location_url}/clickhouse-server_${version}_all.deb" -P /tmp/clickhouse_debs \
        && dpkg -i /tmp/clickhouse_debs/*.deb ; \
    elif [ -n "$single_binary_location_url" ]; then \
        echo "installing from single binary url: $single_binary_location_url" \
        && rm -rf /tmp/clickhouse_binary \
        && mkdir -p /tmp/clickhouse_binary \
        && wget --progress=bar:force:noscroll "$single_binary_location_url" -O /tmp/clickhouse_binary/clickhouse \
        && chmod +x /tmp/clickhouse_binary/clickhouse \
        && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" ; \
    else \
        echo "installing from repository: $repository" \
        && apt-get update \
        && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \
        && apt-get install --allow-unauthenticated --yes --no-install-recommends \
            clickhouse-common-static=$version \
            clickhouse-client=$version \
            clickhouse-server=$version ; \
    fi \
    && apt-get install -y --no-install-recommends tcc libc-dev && \
        tcc /su-exec.c -o /bin/su-exec && \
        chown root:root /bin/su-exec && \
        chmod 0755 /bin/su-exec && \
        rm /su-exec.c && \
        apt-get purge -y --auto-remove tcc libc-dev libc-dev-bin libc6-dev linux-libc-dev \
    && clickhouse-local -q 'SELECT * FROM system.build_options' \
    && rm -rf \
        /var/lib/apt/lists/* \
        /var/cache/debconf \
        /tmp/* \
    && apt-get clean \
    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client

# we need to allow "others" access to clickhouse folder, because docker container
# can be started with arbitrary uid (openshift usecase)

RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV TZ UTC

RUN mkdir /docker-entrypoint-initdb.d

COPY docker_related_config.xml /etc/clickhouse-server/config.d/
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

EXPOSE 9000 8123 9009
VOLUME /var/lib/clickhouse

ENV CLICKHOUSE_CONFIG /etc/clickhouse-server/config.xml

ENTRYPOINT ["/entrypoint.sh"]
docker/server/Dockerfile (new symbolic link)

@@ -0,0 +1 @@
Dockerfile.ubuntu
docker/server/Dockerfile.alpine

@@ -1,3 +1,14 @@
+FROM ubuntu:20.04 AS glibc-donor
+ARG TARGETARCH
+
+RUN arch=${TARGETARCH:-amd64} \
+    && case $arch in \
+        amd64) rarch=x86_64 ;; \
+        arm64) rarch=aarch64 ;; \
+    esac \
+    && ln -s "${rarch}-linux-gnu" /lib/linux-gnu
+
+
 FROM alpine

 ENV LANG=en_US.UTF-8 \
@@ -6,7 +17,24 @@ ENV LANG=en_US.UTF-8 \
     TZ=UTC \
     CLICKHOUSE_CONFIG=/etc/clickhouse-server/config.xml

-COPY alpine-root/ /
+COPY --from=glibc-donor /lib/linux-gnu/libc.so.6 /lib/linux-gnu/libdl.so.2 /lib/linux-gnu/libm.so.6 /lib/linux-gnu/libpthread.so.0 /lib/linux-gnu/librt.so.1 /lib/linux-gnu/libnss_dns.so.2 /lib/linux-gnu/libnss_files.so.2 /lib/linux-gnu/libresolv.so.2 /lib/linux-gnu/ld-2.31.so /lib/
+COPY --from=glibc-donor /etc/nsswitch.conf /etc/
 COPY docker_related_config.xml /etc/clickhouse-server/config.d/
 COPY entrypoint.sh /entrypoint.sh
+
+ARG TARGETARCH
+
+RUN arch=${TARGETARCH:-amd64} \
+    && case $arch in \
+        amd64) mkdir -p /lib64 && ln -sf /lib/ld-2.31.so /lib64/ld-linux-x86-64.so.2 ;; \
+        arm64) ln -sf /lib/ld-2.31.so /lib/ld-linux-aarch64.so.1 ;; \
+    esac
+
+# lts / testing / prestable / etc
+ARG REPO_CHANNEL="stable"
+ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
+ARG VERSION="20.9.3.45"
+ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

 # user/group precreated explicitly with fixed uid/gid on purpose.
 # It is especially important for rootless containers: in that case entrypoint
@@ -15,9 +43,23 @@ COPY alpine-root/ /
 # installed to prevent picking those uid / gid by some unrelated software.
 # The same uid / gid (101) is used both for alpine and ubuntu.

-RUN addgroup -S -g 101 clickhouse \
+RUN arch=${TARGETARCH:-amd64} \
+    && for package in ${PACKAGES}; do \
+        { \
+            { echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
+                && wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" -O "/tmp/${package}-${VERSION}-${arch}.tgz" \
+                && tar xvzf "/tmp/${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / ; \
+            } || \
+            { echo "Fallback to ${REPOSITORY}/${package}-${VERSION}.tgz" \
+                && wget -c -q "${REPOSITORY}/${package}-${VERSION}.tgz" -O "/tmp/${package}-${VERSION}.tgz" \
+                && tar xvzf "/tmp/${package}-${VERSION}.tgz" --strip-components=2 -C / ; \
+            } ; \
+        } || exit 1 \
+    ; done \
+    && rm /tmp/*.tgz /install -r \
+    && addgroup -S -g 101 clickhouse \
     && adduser -S -h /var/lib/clickhouse -s /bin/bash -G clickhouse -g "ClickHouse server" -u 101 clickhouse \
-    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \
+    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server/config.d /etc/clickhouse-server/users.d /etc/clickhouse-client /docker-entrypoint-initdb.d \
     && chown clickhouse:clickhouse /var/lib/clickhouse \
     && chown root:clickhouse /var/log/clickhouse-server \
     && chmod +x /entrypoint.sh \
docker/server/Dockerfile.ubuntu (new file, 128 lines)

@@ -0,0 +1,128 @@
FROM ubuntu:20.04

# see https://github.com/moby/moby/issues/4032#issuecomment-192327844
ARG DEBIAN_FRONTEND=noninteractive

COPY su-exec.c /su-exec.c

# ARG for quick switch to a given ubuntu mirror
ARG apt_archive="http://archive.ubuntu.com"
RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list \
    && groupadd -r clickhouse --gid=101 \
    && useradd -r -g clickhouse --uid=101 --home-dir=/var/lib/clickhouse --shell=/bin/bash clickhouse \
    && apt-get update \
    && apt-get install --yes --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        dirmngr \
        gnupg \
        locales \
        wget \
        tzdata \
    && apt-get install -y --no-install-recommends tcc libc-dev && \
        tcc /su-exec.c -o /bin/su-exec && \
        chown root:root /bin/su-exec && \
        chmod 0755 /bin/su-exec && \
        rm /su-exec.c && \
        apt-get purge -y --auto-remove tcc libc-dev libc-dev-bin libc6-dev linux-libc-dev \
    && apt-get clean

ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION=22.1.1.*
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"

# set non-empty deb_location_url url to create a docker image
# from debs created by CI build, for example:
# docker build . --network host --build-arg version="21.4.1.6282" --build-arg deb_location_url="https://clickhouse-builds.s3.yandex.net/21852/069cfbff388b3d478d1a16dc7060b48073f5d522/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/" -t filimonovq/clickhouse-server:pr21852
ARG deb_location_url=""

# set non-empty single_binary_location_url to create docker image
# from a single binary url (useful for non-standard builds - with sanitizers, for arm64).
# for example (run on aarch64 server):
# docker build . --network host --build-arg single_binary_location_url="https://builds.clickhouse.com/master/aarch64/clickhouse" -t altinity/clickhouse-server:master-testing-arm
# note: clickhouse-odbc-bridge is not supported there.
ARG single_binary_location_url=""

# user/group precreated explicitly with fixed uid/gid on purpose.
# It is especially important for rootless containers: in that case entrypoint
# can't do chown and owners of mounted volumes should be configured externally.
# We do that in advance at the beginning of the Dockerfile before any packages are
# installed to prevent picking those uid / gid by some unrelated software.
# The same uid / gid (101) is used both for alpine and ubuntu.

# To drop privileges, we need 'su' command, that simply changes uid and gid.
# In fact, the 'su' command from Linux is not so simple, due to inherent vulnerability in Linux:
# https://ruderich.org/simon/notes/su-sudo-from-root-tty-hijacking
# It has to mitigate this drawback of Linux, and to do this, the 'su' command is creating its own pseudo-terminal
# and forwarding commands. Due to some ridiculous circumstances, it does not work in Docker (or it does)
# and for these reasons people are using alternatives to the 'su' command in Docker,
# that don't mess with the terminal, don't care about closing the opened files, etc...
# but can only be safe to drop privileges inside Docker.
# The question is what implementation of the 'su' command to use.
# It should be a simple script doing just about two syscalls.
# Some people tend to use the 'gosu' tool that is written in Go.
# It is not used for several reasons:
# 1. Dependency on some foreign code in yet another programming language - does not sound alright.
# 2. Anselmo D. Adams suggested not to use it due to false positive alarms in some undisclosed security scanners.

ARG TARGETARCH

RUN arch=${TARGETARCH:-amd64} \
    && if [ -n "${deb_location_url}" ]; then \
        echo "installing from custom url with deb packages: ${deb_location_url}" \
        rm -rf /tmp/clickhouse_debs \
        && mkdir -p /tmp/clickhouse_debs \
        && for package in ${PACKAGES}; do \
            { wget --progress=bar:force:noscroll "${deb_location_url}/${package}_${VERSION}_${arch}.deb" -P /tmp/clickhouse_debs || \
                wget --progress=bar:force:noscroll "${deb_location_url}/${package}_${VERSION}_all.deb" -P /tmp/clickhouse_debs ; } \
            || exit 1 \
        ; done \
        && dpkg -i /tmp/clickhouse_debs/*.deb ; \
    elif [ -n "${single_binary_location_url}" ]; then \
        echo "installing from single binary url: ${single_binary_location_url}" \
        && rm -rf /tmp/clickhouse_binary \
        && mkdir -p /tmp/clickhouse_binary \
        && wget --progress=bar:force:noscroll "${single_binary_location_url}" -O /tmp/clickhouse_binary/clickhouse \
        && chmod +x /tmp/clickhouse_binary/clickhouse \
        && /tmp/clickhouse_binary/clickhouse install --user "clickhouse" --group "clickhouse" ; \
    else \
        mkdir -p /etc/apt/sources.list.d \
        && apt-key adv --keyserver keyserver.ubuntu.com --recv 8919F6BD2B48D754 \
        && echo ${REPOSITORY} > /etc/apt/sources.list.d/clickhouse.list \
        && echo "installing from repository: ${REPOSITORY}" \
        && apt-get update \
        && apt-get --yes -o "Dpkg::Options::=--force-confdef" -o "Dpkg::Options::=--force-confold" upgrade \
        && for package in ${PACKAGES}; do \
            apt-get install --allow-unauthenticated --yes --no-install-recommends "${package}=${VERSION}" || exit 1 \
        ; done \
    ; fi \
    && clickhouse-local -q 'SELECT * FROM system.build_options' \
    && rm -rf \
        /var/lib/apt/lists/* \
        /var/cache/debconf \
        /tmp/* \
    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client

# we need to allow "others" access to clickhouse folder, because docker container
# can be started with arbitrary uid (openshift usecase)

RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
ENV TZ UTC

RUN mkdir /docker-entrypoint-initdb.d

COPY docker_related_config.xml /etc/clickhouse-server/config.d/
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

EXPOSE 9000 8123 9009
VOLUME /var/lib/clickhouse

ENV CLICKHOUSE_CONFIG /etc/clickhouse-server/config.xml

ENTRYPOINT ["/entrypoint.sh"]
docker/server/alpine-build.sh (deleted, 63 lines)

@@ -1,63 +0,0 @@
#!/bin/bash
set -x

REPO_CHANNEL="${REPO_CHANNEL:-stable}" # lts / testing / prestable / etc
REPO_URL="${REPO_URL:-"https://repo.yandex.ru/clickhouse/tgz/${REPO_CHANNEL}"}"
VERSION="${VERSION:-20.9.3.45}"
DOCKER_IMAGE="${DOCKER_IMAGE:-clickhouse/clickhouse-server}"

# where original files live
DOCKER_BUILD_FOLDER="${BASH_SOURCE%/*}"

# we will create root for our image here
CONTAINER_ROOT_FOLDER="${DOCKER_BUILD_FOLDER}/alpine-root"

# clean up the root from old runs, it's reconstructed each time
rm -rf "$CONTAINER_ROOT_FOLDER"
mkdir -p "$CONTAINER_ROOT_FOLDER"

# where to put downloaded tgz
TGZ_PACKAGES_FOLDER="${DOCKER_BUILD_FOLDER}/tgz-packages"
mkdir -p "$TGZ_PACKAGES_FOLDER"

PACKAGES=( "clickhouse-client" "clickhouse-server" "clickhouse-common-static" )

# download tars from the repo
for package in "${PACKAGES[@]}"
do
    wget -c -q --show-progress "${REPO_URL}/${package}-${VERSION}.tgz" -O "${TGZ_PACKAGES_FOLDER}/${package}-${VERSION}.tgz"
done

# unpack tars
for package in "${PACKAGES[@]}"
do
    tar xvzf "${TGZ_PACKAGES_FOLDER}/${package}-${VERSION}.tgz" --strip-components=2 -C "$CONTAINER_ROOT_FOLDER"
done

# prepare a few more folders
mkdir -p "${CONTAINER_ROOT_FOLDER}/etc/clickhouse-server/users.d" \
    "${CONTAINER_ROOT_FOLDER}/etc/clickhouse-server/config.d" \
    "${CONTAINER_ROOT_FOLDER}/var/log/clickhouse-server" \
    "${CONTAINER_ROOT_FOLDER}/var/lib/clickhouse" \
    "${CONTAINER_ROOT_FOLDER}/docker-entrypoint-initdb.d" \
    "${CONTAINER_ROOT_FOLDER}/lib64"

cp "${DOCKER_BUILD_FOLDER}/docker_related_config.xml" "${CONTAINER_ROOT_FOLDER}/etc/clickhouse-server/config.d/"
cp "${DOCKER_BUILD_FOLDER}/entrypoint.sh" "${CONTAINER_ROOT_FOLDER}/entrypoint.sh"

## get glibc components from ubuntu 20.04 and put them to expected place
docker pull ubuntu:20.04
ubuntu20image=$(docker create --rm ubuntu:20.04)
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libc.so.6 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libdl.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libm.so.6 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libpthread.so.0 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/librt.so.1 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_dns.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libnss_files.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib/x86_64-linux-gnu/libresolv.so.2 "${CONTAINER_ROOT_FOLDER}/lib"
docker cp -L "${ubuntu20image}":/lib64/ld-linux-x86-64.so.2 "${CONTAINER_ROOT_FOLDER}/lib64"
docker cp -L "${ubuntu20image}":/etc/nsswitch.conf "${CONTAINER_ROOT_FOLDER}/etc"

docker build "$DOCKER_BUILD_FOLDER" -f Dockerfile.alpine -t "${DOCKER_IMAGE}:${VERSION}-alpine" --pull
rm -rf "$CONTAINER_ROOT_FOLDER"
docker/server/local.Dockerfile (deleted, 47 lines)

@@ -1,47 +0,0 @@
# Since right now we can't set volumes to the docker during build, we split building the container into stages:
# 1. build base container
# 2. run base container with mounted volumes
# 3. commit container as image
# 4. build final container atop that image
# Middle steps are performed by the bash script.

FROM ubuntu:18.04 as clickhouse-server-base
ARG gosu_ver=1.14

VOLUME /packages/

# update to allow installing dependencies of clickhouse automatically
RUN apt update; \
    DEBIAN_FRONTEND=noninteractive \
    apt install -y locales;

ADD https://github.com/tianon/gosu/releases/download/${gosu_ver}/gosu-amd64 /bin/gosu

RUN locale-gen en_US.UTF-8
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8

# installing via apt to simulate a real-world scenario, where the user installs a deb package and all its dependencies automatically.
CMD DEBIAN_FRONTEND=noninteractive \
    apt install -y \
        /packages/clickhouse-common-static_*.deb \
        /packages/clickhouse-server_*.deb ;

FROM clickhouse-server-base:postinstall as clickhouse-server

RUN mkdir /docker-entrypoint-initdb.d

COPY docker_related_config.xml /etc/clickhouse-server/config.d/
COPY entrypoint.sh /entrypoint.sh

RUN chmod +x \
    /entrypoint.sh \
    /bin/gosu

EXPOSE 9000 8123 9009
VOLUME /var/lib/clickhouse

ENV CLICKHOUSE_CONFIG /etc/clickhouse-server/config.xml

ENTRYPOINT ["/entrypoint.sh"]
docker/test/performance-comparison/compare.sh

@@ -1378,7 +1378,7 @@ $REF_SHA $SHA_TO_TEST $(numactl --hardware | sed -n 's/^available:[[:space:]]\+/
 EOF

 # Also insert some data about the check into the CI checks table.
-"${client[@]}" --query "INSERT INTO "'"'"gh-data"'"'".checks FORMAT TSVWithNamesAndTypes" \
+"${client[@]}" --query "INSERT INTO "'"'"default"'"'".checks FORMAT TSVWithNamesAndTypes" \
     < ci-checks.tsv

 set -x
docker/test/stateful/run.sh

@@ -115,7 +115,7 @@ function run_tests()
     fi

     set +e
-    clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --no-stateless --hung-check --print-time \
+    clickhouse-test -j 2 --testname --shard --zookeeper --check-zookeeper-session --no-stateless --hung-check --print-time \
         --skip 00168_parallel_processing_on_replicas "${ADDITIONAL_OPTIONS[@]}" \
         "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt
docker/test/stateless/run.sh

@@ -131,8 +131,23 @@ clickhouse-client -q "system flush logs" ||:

 grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||:
 pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz &
-clickhouse-client -q "select * from system.query_log format TSVWithNamesAndTypes" | pigz > /test_output/query-log.tsv.gz &
-clickhouse-client -q "select * from system.query_thread_log format TSVWithNamesAndTypes" | pigz > /test_output/query-thread-log.tsv.gz &
+
+# Compress tables.
+#
+# NOTE:
+# - due to tests with s3 storage we cannot use /var/lib/clickhouse/data
+#   directly
+# - even though CI auto-compresses some files (but not *.tsv), it does this only
+#   for files >64MB, so we want these files to be compressed explicitly
+for table in query_log zookeeper_log trace_log
+do
+    clickhouse-client -q "select * from system.$table format TSVWithNamesAndTypes" | pigz > /test_output/$table.tsv.gz &
+    if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
+        clickhouse-client --port 19000 -q "select * from system.$table format TSVWithNamesAndTypes" | pigz > /test_output/$table.1.tsv.gz &
+        clickhouse-client --port 29000 -q "select * from system.$table format TSVWithNamesAndTypes" | pigz > /test_output/$table.2.tsv.gz &
+    fi
+done
+wait ||:

 # Also export trace log in flamegraph-friendly format.
 for trace_type in CPU Memory Real
@@ -161,14 +176,6 @@ fi

 tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:

-# Replace the engine with Ordinary to avoid extra symlinks stuff in artifacts.
-# (so that clickhouse-local --path can read it w/o extra care).
-sed -i -e "s/ATTACH DATABASE _ UUID '[^']*'/ATTACH DATABASE system/" -e "s/Atomic/Ordinary/" /var/lib/clickhouse/metadata/system.sql
-for table in text_log query_log zookeeper_log trace_log; do
-    sed -i "s/ATTACH TABLE _ UUID '[^']*'/ATTACH TABLE $table/" /var/lib/clickhouse/metadata/system/${table}.sql
-    tar -chf /test_output/${table}_dump.tar /var/lib/clickhouse/metadata/system.sql /var/lib/clickhouse/metadata/system/${table}.sql /var/lib/clickhouse/data/system/${table} ||:
-done
-
 if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
     grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server1.log ||:
     grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server2.log ||:
@@ -179,8 +186,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
     rm /var/log/clickhouse-server/clickhouse-server2.log
     mv /var/log/clickhouse-server/stderr1.log /test_output/ ||:
     mv /var/log/clickhouse-server/stderr2.log /test_output/ ||:
-    tar -chf /test_output/zookeeper_log_dump1.tar /var/lib/clickhouse1/data/system/zookeeper_log ||:
-    tar -chf /test_output/zookeeper_log_dump2.tar /var/lib/clickhouse2/data/system/zookeeper_log ||:
     tar -chf /test_output/coordination1.tar /var/lib/clickhouse1/coordination ||:
     tar -chf /test_output/coordination2.tar /var/lib/clickhouse2/coordination ||:
 fi
docker/test/test_runner.sh (deleted, 86 lines)

@@ -1,86 +0,0 @@
#!/bin/sh

set -e -x

# Not sure why shellcheck complains that rc is not assigned before it is referenced.
# shellcheck disable=SC2154
trap 'rc=$?; echo EXITED WITH: $rc; exit $rc' EXIT

# CLI option to prevent rebuilding images, just re-run tests with images leftover from previous time
readonly NO_REBUILD_FLAG="--no-rebuild"

readonly CLICKHOUSE_DOCKER_DIR="$(realpath "${1}")"
readonly CLICKHOUSE_PACKAGES_ARG="${2}"
CLICKHOUSE_SERVER_IMAGE="${3}"

if [ "${CLICKHOUSE_PACKAGES_ARG}" != "${NO_REBUILD_FLAG}" ]; then
    readonly CLICKHOUSE_PACKAGES_DIR="$(realpath "${2}")" # or --no-rebuild
fi


# In order to allow the packages directory to be anywhere, and to reduce the amount of context sent to the docker daemon,
# all images are built in multiple stages:
# 1. build base image, install dependencies
# 2. run image with volume mounted, install what is needed from those volumes
# 3. tag container as image
# 4. [optional] build another image atop of tagged.

# TODO: optionally mount most recent clickhouse-test and queries directory from local machine

if [ "${CLICKHOUSE_PACKAGES_ARG}" != "${NO_REBUILD_FLAG}" ]; then
    docker build --network=host \
        -f "${CLICKHOUSE_DOCKER_DIR}/test/stateless/clickhouse-statelest-test-runner.Dockerfile" \
        --target clickhouse-test-runner-base \
        -t clickhouse-test-runner-base:preinstall \
        "${CLICKHOUSE_DOCKER_DIR}/test/stateless"

    docker rm -f clickhouse-test-runner-installing-packages || true
    docker run --network=host \
        -v "${CLICKHOUSE_PACKAGES_DIR}:/packages" \
        --name clickhouse-test-runner-installing-packages \
        clickhouse-test-runner-base:preinstall
    docker commit clickhouse-test-runner-installing-packages clickhouse-statelest-test-runner:local
    docker rm -f clickhouse-test-runner-installing-packages || true
fi

# # Create a bind-volume to the clickhouse-test script file
# docker volume create --driver local --opt type=none --opt device=/home/enmk/proj/ClickHouse_master/tests/clickhouse-test --opt o=bind clickhouse-test-script-volume
# docker volume create --driver local --opt type=none --opt device=/home/enmk/proj/ClickHouse_master/tests/queries --opt o=bind clickhouse-test-queries-dir-volume

# Build server image (optional) from local packages
if [ -z "${CLICKHOUSE_SERVER_IMAGE}" ]; then
    CLICKHOUSE_SERVER_IMAGE="clickhouse/server:local"

    if [ "${CLICKHOUSE_PACKAGES_ARG}" != "${NO_REBUILD_FLAG}" ]; then
        docker build --network=host \
            -f "${CLICKHOUSE_DOCKER_DIR}/server/local.Dockerfile" \
            --target clickhouse-server-base \
            -t clickhouse-server-base:preinstall \
            "${CLICKHOUSE_DOCKER_DIR}/server"

        docker rm -f clickhouse_server_base_installing_server || true
        docker run --network=host -v "${CLICKHOUSE_PACKAGES_DIR}:/packages" \
            --name clickhouse_server_base_installing_server \
            clickhouse-server-base:preinstall
        docker commit clickhouse_server_base_installing_server clickhouse-server-base:postinstall

        docker build --network=host \
            -f "${CLICKHOUSE_DOCKER_DIR}/server/local.Dockerfile" \
            --target clickhouse-server \
            -t "${CLICKHOUSE_SERVER_IMAGE}" \
            "${CLICKHOUSE_DOCKER_DIR}/server"
    fi
fi

docker rm -f test-runner || true
docker-compose down
CLICKHOUSE_SERVER_IMAGE="${CLICKHOUSE_SERVER_IMAGE}" \
    docker-compose -f "${CLICKHOUSE_DOCKER_DIR}/test/test_runner_docker_compose.yaml" \
        create \
        --build --force-recreate

CLICKHOUSE_SERVER_IMAGE="${CLICKHOUSE_SERVER_IMAGE}" \
    docker-compose -f "${CLICKHOUSE_DOCKER_DIR}/test/test_runner_docker_compose.yaml" \
        run \
        --name test-runner \
        test-runner
docker/test/test_runner_docker_compose.yaml (deleted, 34 lines)

@@ -1,34 +0,0 @@
version: "2"

services:
  clickhouse-server:
    image: ${CLICKHOUSE_SERVER_IMAGE}
    expose:
      - "8123" # HTTP
      - "9000" # TCP
      - "9009" # HTTP-interserver
    restart: "no"

  test-runner:
    image: clickhouse-statelest-test-runner:local

    restart: "no"
    depends_on:
      - clickhouse-server
    environment:
      # these are used by clickhouse-test to point clickhouse-client to the right server
      - CLICKHOUSE_HOST=clickhouse-server
      - CLICKHOUSE_PORT=9009
      - CLICKHOUSE_TEST_HOST_EXPOSED_PORT=51234
    expose:
      # port for any test to serve data to clickhouse-server on rare occasion (like URL-engine tables in 00646),
      # should match value of CLICKHOUSE_TEST_HOST_EXPOSED_PORT above
      - "51234"

    # NOTE: Dev-mode: mount newest versions of the queries and clickhouse-test script into container.
    # volumes:
    #  - /home/enmk/proj/ClickHouse_master/tests/queries:/usr/share/clickhouse-test/queries:ro
    #  - /home/enmk/proj/ClickHouse_master/tests/clickhouse-test:/usr/bin/clickhouse-test:ro

    # String-form instead of list-form to allow multiple arguments in "${CLICKHOUSE_TEST_ARGS}"
    entrypoint: "clickhouse-test ${CLICKHOUSE_TEST_ARGS}"
docs/en/engines/table-engines/mergetree-family/mergetree.md

@@ -688,7 +688,7 @@ Tags:
 - `volume_name_N` — Volume name. Volume names must be unique.
 - `disk` — a disk within a volume.
 - `max_data_part_size_bytes` — the maximum size of a part that can be stored on any of the volume’s disks. If the size of a merged part is estimated to be bigger than `max_data_part_size_bytes`, then this part will be written to the next volume. Basically, this feature allows keeping new/small parts on a hot (SSD) volume and moving them to a cold (HDD) volume when they reach a large size. Do not use this setting if your policy has only one volume.
-- `move_factor` — when the amount of available space gets lower than this factor, data automatically start to move on the next volume if any (by default, 0.1).
+- `move_factor` — when the amount of available space gets lower than this factor, data automatically starts to move on the next volume if any (by default, 0.1). ClickHouse sorts existing parts by size from largest to smallest (in descending order) and selects parts with the total size that is sufficient to meet the `move_factor` condition. If the total size of all parts is insufficient, all parts will be moved.
 - `prefer_not_to_merge` — Disables merging of data parts on this volume. When this setting is enabled, merging data on this volume is not allowed. This allows controlling how ClickHouse works with slow disks.

 Configuration examples:
@@ -36,6 +36,7 @@ Example of configuration:
         <access_key_id>AKIAIOSFODNN7EXAMPLE</access_key_id>
         <secret_access_key> wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY</secret_access_key>
         <format>CSV</format>
+        <url>https://s3.us-east-1.amazonaws.com/yourbucket/mydata/</url>
     </s3_mydata>
 </named_collections>
 </clickhouse>
@@ -44,12 +45,12 @@ Example of configuration:

 ### Example of using named connections with the s3 function

 ```sql
-INSERT INTO FUNCTION s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz',
+INSERT INTO FUNCTION s3(s3_mydata, filename = 'test_file.tsv.gz',
     format = 'TSV', structure = 'number UInt64', compression_method = 'gzip')
 SELECT * FROM numbers(10000);

 SELECT count()
-FROM s3(s3_mydata, url = 'https://s3.us-east-1.amazonaws.com/yourbucket/mydata/test_file.tsv.gz')
+FROM s3(s3_mydata, filename = 'test_file.tsv.gz')

 ┌─count()─┐
 │   10000 │
docs/en/sql-reference/aggregate-functions/reference/grouparraysorted.md (new file, 48 lines)

@@ -0,0 +1,48 @@
---
toc_priority: 108
---

# groupArraySorted {#groupArraySorted}

Returns an array with the first N items in ascending order.

``` sql
groupArraySorted(N)(column)
```

**Parameters**

- `N` – The number of elements to return.

If the parameter is omitted, the default value 10 is used.

**Arguments**

- `column` – The value.
- `expr` — Optional. The field or expression to sort by. If not set, values are sorted by themselves.

**Example**

Gets the first 10 numbers:

``` sql
SELECT groupArraySorted(10)(number) FROM numbers(100)
```

``` text
┌─groupArraySorted(10)(number)─┐
│ [0,1,2,3,4,5,6,7,8,9]        │
└──────────────────────────────┘
```

Or the last 10:

``` sql
SELECT groupArraySorted(10)(number, -number) FROM numbers(100)
```

``` text
┌─groupArraySorted(10)(number, negate(number))─┐
│ [99,98,97,96,95,94,93,92,91,90]              │
└──────────────────────────────────────────────┘
```
@@ -35,6 +35,7 @@ ClickHouse-specific aggregate functions:
 - [groupArrayInsertAt](../../../sql-reference/aggregate-functions/reference/grouparrayinsertat.md)
 - [groupArrayMovingAvg](../../../sql-reference/aggregate-functions/reference/grouparraymovingavg.md)
 - [groupArrayMovingSum](../../../sql-reference/aggregate-functions/reference/grouparraymovingsum.md)
+- [groupArraySorted](../../../sql-reference/aggregate-functions/reference/grouparraysorted.md)
 - [groupBitAnd](../../../sql-reference/aggregate-functions/reference/groupbitand.md)
 - [groupBitOr](../../../sql-reference/aggregate-functions/reference/groupbitor.md)
 - [groupBitXor](../../../sql-reference/aggregate-functions/reference/groupbitxor.md)
docs/en/sql-reference/statements/create/table.md

@@ -114,9 +114,9 @@ In addition, this column is not substituted when using an asterisk in a SELECT query.

 ### EPHEMERAL {#ephemeral}

-`EPHEMERAL expr`
+`EPHEMERAL [expr]`

-Ephemeral column. Such a column isn't stored in the table and cannot be SELECTed, but can be referenced in the defaults of the CREATE statement.
+Ephemeral column. Such a column isn't stored in the table and cannot be SELECTed, but can be referenced in the defaults of the CREATE statement. If `expr` is omitted, a type for the column is required.
 INSERT without a list of columns will skip such a column, so the SELECT/INSERT invariant is preserved - the dump obtained using `SELECT *` can be inserted back into the table using INSERT without specifying the list of columns.
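A minimal sketch of the feature (table and column names are illustrative, not from this diff): the ephemeral column is consumed at INSERT time, feeds the default of a stored column, and never appears in `SELECT *`.

```sql
CREATE TABLE events
(
    raw String EPHEMERAL,                 -- type is required because expr is omitted
    normalized String DEFAULT lower(raw)  -- defaults may reference the ephemeral column
)
ENGINE = MergeTree ORDER BY normalized;

INSERT INTO events (raw) VALUES ('HELLO');
SELECT * FROM events;                     -- returns only `normalized`: 'hello'
```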

 ### ALIAS {#alias}
docs/ja/development/developer-instruction.md

@@ -273,7 +273,7 @@ Navigate to your fork repository in the GitHub UI.

 A pull request can be created even if the work is not yet complete. In that case, put the word "WIP" (work in progress) at the beginning of the title; it can be changed later. This is useful for collaborative review and discussion of the changes, and for running all the available tests. It is important to provide a brief description of the changes.

-Tests will start as soon as Yandex employees label your PR with the tag "can be tested". The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.
+Tests will start as soon as ClickHouse employees label your PR with the tag "can be tested". The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.

 The system prepares ClickHouse binary builds for your pull request individually. To get these builds, follow the "Details" link next to the "ClickHouse build check" entry in the list of checks. There you will find direct links to the builds. If you are not afraid, you can even deploy the ClickHouse deb packages to your production servers.
|
@ -72,11 +72,11 @@ ClickHouse does not run and does not build on 32-bit…

This option is not suitable for pushing changes to the server. You can use it temporarily, then add SSH keys and replace the repository address with the `git remote` command.

You can also add the address of the original Yandex repository to your local repository, to pull updates from it:
You can also add the address of the original repository to your local repository, to pull updates from it:

    git remote add upstream git@github.com:ClickHouse/ClickHouse.git

After that, you will be able to pull updates into your repository from the Yandex repository with the `git pull upstream master` command.
After that, you will be able to pull updates into your repository from the ClickHouse repository with the `git pull upstream master` command.

### Working with Git submodules {#rabota-s-sabmoduliami-git}
@ -288,7 +288,7 @@ sudo ./llvm.sh 12

A pull request can be created even if the work on the task is not yet finished. In that case, add the word «WIP» (work in progress) to its title; it can be renamed later. This is useful for collaborative review and discussion of the changes, and for running all the available tests. Provide a brief description of the changes; it will later be used for the release changelog.

Tests will run as soon as Yandex employees set the «Can be tested» tag on the pull request. The results of the first checks (code style) will appear within a few minutes. Build results will arrive in about half an hour. The results of the main test set will be available within an hour.
Tests will run as soon as ClickHouse employees set the «Can be tested» tag on the pull request. The results of the first checks (code style) will appear within a few minutes. Build results will arrive in about half an hour. The results of the main test set will be available within an hour.

The system will prepare ClickHouse builds specifically for your pull request. To get them, click the «Details» link next to the «ClickHouse build check» entry. There you will find direct links to the built ClickHouse .deb packages, which you can even install on your production servers, if you dare.
@ -678,7 +678,7 @@ TTL d + INTERVAL 1 MONTH GROUP BY k1, k2 SET x = max(x), y = min(y);

- `volume_name_N` — the volume name. Volume names must be unique.
- `disk` — a disk located inside the volume.
- `max_data_part_size_bytes` — the maximum size of a data part that may be stored on any of this volume's disks. If a merge is expected to produce a part larger than max_data_part_size_bytes, the part is written to the next volume. Essentially, this feature lets you keep new/small parts on a hot (SSD) volume and move them to a cold (HDD) volume once they reach a large size. Do not use this setting if the policy has only one volume.
- `move_factor` — the share of available free space on the volume; when free space drops below this share, data starts moving to the next volume, if there is one (default 0.1).
- `move_factor` — the share of available free space on the volume; when free space drops below this share, data starts moving to the next volume, if there is one (default 0.1). For moving, parts are sorted by size from largest to smallest (descending), and parts are selected whose total size is sufficient to satisfy the `move_factor` condition; if the total size of all parts is insufficient, all parts are moved.
- `prefer_not_to_merge` — disables merging of data parts stored on this volume. If this setting is enabled, merging data stored on this volume is not allowed. This lets you control how ClickHouse interacts with slow disks.

Configuration examples:
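A minimal sketch of such a policy (the disk names and paths are assumptions for illustration, not from this commit):

``` xml
<storage_configuration>
    <disks>
        <fast_ssd><path>/mnt/fast_ssd/clickhouse/</path></fast_ssd>
        <slow_hdd><path>/mnt/slow_hdd/clickhouse/</path></slow_hdd>
    </disks>
    <policies>
        <moving_from_ssd_to_hdd>
            <volumes>
                <hot>
                    <disk>fast_ssd</disk>
                    <!-- parts expected to exceed 1 GiB go straight to the cold volume -->
                    <max_data_part_size_bytes>1073741824</max_data_part_size_bytes>
                </hot>
                <cold>
                    <disk>slow_hdd</disk>
                </cold>
            </volumes>
            <!-- start moving parts when less than 20% of the hot volume is free -->
            <move_factor>0.2</move_factor>
        </moving_from_ssd_to_hdd>
    </policies>
</storage_configuration>
```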
@ -110,9 +110,9 @@ SELECT x, toTypeName(x) FROM t1;

### EPHEMERAL {#ephemeral}

`EPHEMERAL expr`
`EPHEMERAL [expr]`

Ephemeral expression. Such a column is not stored in the table and cannot be retrieved with a SELECT query, but it can be referenced in the default expressions of a CREATE query.
Ephemeral expression. Such a column is not stored in the table and cannot be retrieved with a SELECT query, but it can be referenced in the default expressions of a CREATE query. If the default value `expr` is not specified, the column type must be given explicitly.
An INSERT without a column list ignores this column, so the invariant is preserved: a dump obtained via `SELECT *` can be inserted back into the table with an INSERT without specifying the column list.

### ALIAS {#alias}
@ -259,7 +259,7 @@ The ClickHouse architecture description can be found here: https://clickhouse.com/docs/en/

A pull request can be created even if the work is not yet complete. In that case, prefix the title with «WIP» (work in progress) so it can be changed later. This is useful for collaborative review and discussion of the changes, and for running all available test cases. It is important to provide a short description of the changes; it will later be used to generate the release changelog.

Testing starts as soon as Yandex members put the «can be tested» label on your pull request. The results of some initial checks (e.g. code style) come back within a few minutes. Build check results complete within half an hour. The main set of test results is reported within an hour.
Testing starts as soon as ClickHouse members put the «can be tested» label on your pull request. The results of some initial checks (e.g. code style) come back within a few minutes. Build check results complete within half an hour. The main set of test results is reported within an hour.

The system prepares ClickHouse binary builds for your pull request separately. To retrieve them, click the «Details» link next to the «ClickHouse build check» entry in the checks list. There you will find direct links to the ClickHouse .deb packages, which you can even deploy to your production servers (if you are not worried).
@ -8,7 +8,7 @@ toc_title: "版本折叠MergeTree"

This engine:

- Allows quick writing of continuously changing object states.
- Deletes old object states in the background. This markedly (显着) reduces the storage volume.
- Deletes old object states in the background. This significantly (显著) reduces the storage volume.

See the [Collapsing](#table_engines_versionedcollapsingmergetree) section for details.
@ -184,6 +184,11 @@ void LocalServer::tryInitPath()
    if (path.back() != '/')
        path += '/';

    fs::create_directories(fs::path(path) / "user_defined/");
    fs::create_directories(fs::path(path) / "data/");
    fs::create_directories(fs::path(path) / "metadata/");
    fs::create_directories(fs::path(path) / "metadata_dropped/");

    global_context->setPath(path);

    global_context->setTemporaryStorage(path + "tmp");
@ -565,7 +570,6 @@ void LocalServer::processConfig()
        /// Lock path directory before read
        status.emplace(fs::path(path) / "status", StatusFile::write_full_info);

        fs::create_directories(fs::path(path) / "user_defined/");
        LOG_DEBUG(log, "Loading user defined objects from {}", path);
        Poco::File(path + "user_defined/").createDirectories();
        UserDefinedSQLObjectsLoader::instance().loadObjects(global_context);
@ -573,9 +577,6 @@ void LocalServer::processConfig()
        LOG_DEBUG(log, "Loaded user defined objects.");

        LOG_DEBUG(log, "Loading metadata from {}", path);
        fs::create_directories(fs::path(path) / "data/");
        fs::create_directories(fs::path(path) / "metadata/");

        loadMetadataSystem(global_context);
        attachSystemTablesLocal(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE));
        attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA));
@ -20,6 +20,7 @@
#include <base/phdr_cache.h>
#include <base/ErrorHandlers.h>
#include <base/getMemoryAmount.h>
#include <base/getAvailableMemoryAmount.h>
#include <base/errnoToString.h>
#include <base/coverage.h>
#include <base/getFQDNOrHostName.h>
@ -45,6 +46,7 @@
#include <Core/ServerUUID.h>
#include <IO/HTTPCommon.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/IOThreadPool.h>
#include <IO/UseSSL.h>
#include <Interpreters/AsynchronousMetrics.h>
@ -80,6 +82,7 @@
#include <Common/SensitiveDataMasker.h>
#include <Common/ThreadFuzzer.h>
#include <Common/getHashOfLoadedBinary.h>
#include <Common/filesystemHelpers.h>
#include <Common/Elf.h>
#include <Server/MySQLHandlerFactory.h>
#include <Server/PostgreSQLHandlerFactory.h>
@ -505,6 +508,101 @@ void checkForUsersNotInMainConfig(
    }
}

/// Unused in other builds
#if defined(OS_LINUX)
static String readString(const String & path)
{
    ReadBufferFromFile in(path);
    String contents;
    readStringUntilEOF(contents, in);
    return contents;
}

static int readNumber(const String & path)
{
    ReadBufferFromFile in(path);
    int result;
    readText(result, in);
    return result;
}
#endif

static void sanityChecks(Server * server)
{
    std::string data_path = getCanonicalPath(server->config().getString("path", DBMS_DEFAULT_PATH));
    std::string logs_path = server->config().getString("logger.log", "");

#if defined(OS_LINUX)
    try
    {
        if (readString("/sys/devices/system/clocksource/clocksource0/current_clocksource").find("tsc") == std::string::npos)
            server->context()->addWarningMessage("Linux is not using fast TSC clock source. Performance can be degraded.");
    }
    catch (...)
    {
    }

    try
    {
        if (readNumber("/proc/sys/vm/overcommit_memory") == 2)
            server->context()->addWarningMessage("Linux memory overcommit is disabled.");
    }
    catch (...)
    {
    }

    try
    {
        if (readString("/sys/kernel/mm/transparent_hugepage/enabled").find("[always]") != std::string::npos)
            server->context()->addWarningMessage("Linux transparent hugepage are set to \"always\".");
    }
    catch (...)
    {
    }

    try
    {
        if (readNumber("/proc/sys/kernel/pid_max") < 30000)
            server->context()->addWarningMessage("Linux max PID is too low.");
    }
    catch (...)
    {
    }

    try
    {
        if (readNumber("/proc/sys/kernel/threads-max") < 30000)
            server->context()->addWarningMessage("Linux threads max count is too low.");
    }
    catch (...)
    {
    }

    std::string dev_id = getBlockDeviceId(data_path);
    if (getBlockDeviceType(dev_id) == BlockDeviceType::ROT && getBlockDeviceReadAheadBytes(dev_id) == 0)
        server->context()->addWarningMessage("Rotational disk with disabled readahead is in use. Performance can be degraded.");
#endif

    try
    {
        if (getAvailableMemoryAmount() < (2l << 30))
            server->context()->addWarningMessage("Available memory at server startup is too low (2GiB).");

        if (!enoughSpaceInDirectory(data_path, 1ull << 30))
            server->context()->addWarningMessage("Available disk space at server startup is too low (1GiB).");

        if (!logs_path.empty())
        {
            if (!enoughSpaceInDirectory(fs::path(logs_path).parent_path(), 1ull << 30))
                server->context()->addWarningMessage("Available disk space at server startup is too low (1GiB).");
        }
    }
    catch (...)
    {
    }
}

int Server::main(const std::vector<std::string> & /*args*/)
{
    Poco::Logger * log = &logger();
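The checks above read plain procfs/sysfs files; the same values can be inspected by hand on a Linux host (the paths are exactly the ones the code reads):

``` bash
cat /sys/devices/system/clocksource/clocksource0/current_clocksource  # "tsc" expected for fast timing
cat /proc/sys/vm/overcommit_memory                                    # 2 means overcommit is disabled
cat /sys/kernel/mm/transparent_hugepage/enabled                       # "[always]" triggers a warning
cat /proc/sys/kernel/pid_max                                          # warned if below 30000
cat /proc/sys/kernel/threads-max                                      # warned if below 30000
```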
@ -538,13 +636,14 @@ int Server::main(const std::vector<std::string> & /*args*/)
    global_context->addWarningMessage("Server was built in debug mode. It will work slowly.");
#endif

if (ThreadFuzzer::instance().isEffective())
    global_context->addWarningMessage("ThreadFuzzer is enabled. Application will run slowly and unstable.");
    if (ThreadFuzzer::instance().isEffective())
        global_context->addWarningMessage("ThreadFuzzer is enabled. Application will run slowly and unstable.");

#if defined(SANITIZER)
    global_context->addWarningMessage("Server was built with sanitizer. It will work slowly.");
#endif

    sanityChecks(this);

    // Initialize global thread pool. Do it before we fetch configs from zookeeper
    // nodes (`from_zk`), because ZooKeeper interface uses the pool. We will
@ -766,6 +865,38 @@ if (ThreadFuzzer::instance().isEffective())
        }
    }

    /// Try to increase limit on number of threads.
    {
        rlimit rlim;
        if (getrlimit(RLIMIT_NPROC, &rlim))
            throw Poco::Exception("Cannot getrlimit");

        if (rlim.rlim_cur == rlim.rlim_max)
        {
            LOG_DEBUG(log, "rlimit on number of threads is {}", rlim.rlim_cur);
        }
        else
        {
            rlim_t old = rlim.rlim_cur;
            rlim.rlim_cur = rlim.rlim_max;
            int rc = setrlimit(RLIMIT_NPROC, &rlim);
            if (rc != 0)
            {
                LOG_WARNING(log, "Cannot set max number of threads to {}. error: {}", rlim.rlim_cur, strerror(errno));
                rlim.rlim_cur = old;
            }
            else
            {
                LOG_DEBUG(log, "Set max number of threads to {} (was {}).", rlim.rlim_cur, old);
            }
        }

        if (rlim.rlim_cur < 30000)
        {
            global_context->addWarningMessage("Maximum number of threads is lower than 30000. There could be problems with handling a lot of simultaneous queries.");
        }
    }

    static ServerErrorHandler error_handler;
    Poco::ErrorHandler::set(&error_handler);
@ -266,12 +266,25 @@
    color: var(--null-color);
}

@keyframes hourglass-animation {
    0% {
        transform: rotate(-180deg);
    }
    50% {
        transform: rotate(-180deg);
    }
    100% {
        transform: none;
    }
}

#hourglass
{
    display: none;
    padding-left: 1rem;
    margin-left: 1rem;
    font-size: 110%;
    color: #888;
    animation: hourglass-animation 1s linear infinite;
}

#check-mark
@ -457,7 +470,7 @@
    }

    document.getElementById('check-mark').style.display = 'none';
    document.getElementById('hourglass').style.display = 'inline';
    document.getElementById('hourglass').style.display = 'inline-block';

    xhr.send(query);
}
@ -182,6 +182,7 @@ enum class AccessType
    M(JDBC, "", GLOBAL, SOURCES) \
    M(HDFS, "", GLOBAL, SOURCES) \
    M(S3, "", GLOBAL, SOURCES) \
    M(HIVE, "", GLOBAL, SOURCES) \
    M(SOURCES, "", GROUP, ALL) \
    \
    M(ALL, "ALL PRIVILEGES", GROUP, NONE) /* full access */ \
src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp (new file, 147 lines)
@ -0,0 +1,147 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionGroupArraySorted.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeString.h>
#include <Common/FieldVisitorConvertToNumber.h>


static inline constexpr UInt64 GROUP_SORTED_ARRAY_MAX_SIZE = 0xFFFFFF;
static inline constexpr UInt64 GROUP_SORTED_ARRAY_DEFAULT_THRESHOLD = 10;


namespace DB
{
struct Settings;

namespace ErrorCodes
{
    extern const int ARGUMENT_OUT_OF_BOUND;
    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}


namespace
{
template <typename T, bool expr_sorted, typename TColumnB, bool is_plain_b>
class AggregateFunctionGroupArraySortedNumeric : public AggregateFunctionGroupArraySorted<T, false, expr_sorted, TColumnB, is_plain_b>
{
    using AggregateFunctionGroupArraySorted<T, false, expr_sorted, TColumnB, is_plain_b>::AggregateFunctionGroupArraySorted;
};

template <typename T, bool expr_sorted, typename TColumnB, bool is_plain_b>
class AggregateFunctionGroupArraySortedFieldType
    : public AggregateFunctionGroupArraySorted<typename T::FieldType, false, expr_sorted, TColumnB, is_plain_b>
{
    using AggregateFunctionGroupArraySorted<typename T::FieldType, false, expr_sorted, TColumnB, is_plain_b>::
        AggregateFunctionGroupArraySorted;
    DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(std::make_shared<T>()); }
};

template <template <typename, bool, typename, bool> class AggregateFunctionTemplate, typename TColumnA, bool expr_sorted, typename TColumnB, bool is_plain_b, typename... TArgs>
AggregateFunctionPtr
createAggregateFunctionGroupArraySortedTypedFinal(TArgs && ... args)
{
    return AggregateFunctionPtr(new AggregateFunctionTemplate<TColumnA, expr_sorted, TColumnB, is_plain_b>(std::forward<TArgs>(args)...));
}

template <bool expr_sorted = false, typename TColumnB = UInt64, bool is_plain_b = false>
AggregateFunctionPtr
createAggregateFunctionGroupArraySortedTyped(const DataTypes & argument_types, const Array & params, UInt64 threshold)
{
#define DISPATCH(A, C, B) \
    if (which.idx == TypeIndex::A) \
        return createAggregateFunctionGroupArraySortedTypedFinal<C, B, expr_sorted, TColumnB, is_plain_b>(threshold, argument_types, params);
#define DISPATCH_NUMERIC(A) DISPATCH(A, AggregateFunctionGroupArraySortedNumeric, A)
    WhichDataType which(argument_types[0]);
    FOR_NUMERIC_TYPES(DISPATCH_NUMERIC)
    DISPATCH(Enum8, AggregateFunctionGroupArraySortedNumeric, Int8)
    DISPATCH(Enum16, AggregateFunctionGroupArraySortedNumeric, Int16)
    DISPATCH(Date, AggregateFunctionGroupArraySortedFieldType, DataTypeDate)
    DISPATCH(DateTime, AggregateFunctionGroupArraySortedFieldType, DataTypeDateTime)
#undef DISPATCH
#undef DISPATCH_NUMERIC

    if (argument_types[0]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
    {
        return AggregateFunctionPtr(new AggregateFunctionGroupArraySorted<StringRef, true, expr_sorted, TColumnB, is_plain_b>(
            threshold, argument_types, params));
    }
    else
    {
        return AggregateFunctionPtr(new AggregateFunctionGroupArraySorted<StringRef, false, expr_sorted, TColumnB, is_plain_b>(
            threshold, argument_types, params));
    }
}


AggregateFunctionPtr createAggregateFunctionGroupArraySorted(
    const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
{
    UInt64 threshold = GROUP_SORTED_ARRAY_DEFAULT_THRESHOLD;

    if (params.size() == 1)
    {
        UInt64 k = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);

        if (k > GROUP_SORTED_ARRAY_MAX_SIZE)
            throw Exception(
                "Too large parameter(s) for aggregate function " + name + ". Maximum: " + toString(GROUP_SORTED_ARRAY_MAX_SIZE),
                ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        if (k == 0)
            throw Exception("Parameter 0 is illegal for aggregate function " + name, ErrorCodes::ARGUMENT_OUT_OF_BOUND);

        threshold = k;
    }
    else if (!params.empty())
    {
        throw Exception("Aggregate function " + name + " only supports 1 parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
    }

    if (argument_types.size() == 2)
    {
        if (isNumber(argument_types[1]))
        {
#define DISPATCH2(A, B) \
    if (which.idx == TypeIndex::A) \
        return createAggregateFunctionGroupArraySortedTyped<true, B>(argument_types, params, threshold);
#define DISPATCH(A) DISPATCH2(A, A)
            WhichDataType which(argument_types[1]);
            FOR_NUMERIC_TYPES(DISPATCH)
            DISPATCH2(Enum8, Int8)
            DISPATCH2(Enum16, Int16)
#undef DISPATCH
#undef DISPATCH2
            throw Exception("Invalid parameter type.", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
        }
        else if (argument_types[1]->isValueUnambiguouslyRepresentedInContiguousMemoryRegion())
        {
            return createAggregateFunctionGroupArraySortedTyped<true, StringRef, true>(argument_types, params, threshold);
        }
        else
        {
            return createAggregateFunctionGroupArraySortedTyped<true, StringRef, false>(argument_types, params, threshold);
        }
    }
    else if (argument_types.size() == 1)
    {
        return createAggregateFunctionGroupArraySortedTyped<>(argument_types, params, threshold);
    }
    else
    {
        throw Exception(
            "Aggregate function " + name + " requires one or two parameters.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
    }
}
}

void registerAggregateFunctionGroupArraySorted(AggregateFunctionFactory & factory)
{
    AggregateFunctionProperties properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
    factory.registerFunction("groupArraySorted", {createAggregateFunctionGroupArraySorted, properties});
}
}
src/AggregateFunctions/AggregateFunctionGroupArraySorted.h (new file, 310 lines)
@ -0,0 +1,310 @@
#pragma once

#include <Columns/ColumnArray.h>
#include <DataTypes/DataTypeArray.h>

#include <AggregateFunctions/AggregateFunctionGroupArraySortedData.h>
#include <AggregateFunctions/IAggregateFunction.h>

namespace DB
{
template <typename TColumn, bool is_plain>
inline TColumn readItem(const IColumn * column, Arena * arena, size_t row)
{
    if constexpr (std::is_same_v<TColumn, StringRef>)
    {
        if constexpr (is_plain)
        {
            StringRef str = column->getDataAt(row);
            auto ptr = arena->alloc(str.size);
            std::copy(str.data, str.data + str.size, ptr);
            return StringRef(ptr, str.size);
        }
        else
        {
            const char * begin = nullptr;
            return column->serializeValueIntoArena(row, *arena, begin);
        }
    }
    else
    {
        if constexpr (std::is_same_v<TColumn, UInt64>)
            return column->getUInt(row);
        else
            return column->getInt(row);
    }
}

template <typename TColumn, typename TFilter = void>
size_t
getFirstNElements_low_threshold(const TColumn * data, int num_elements, int threshold, size_t * results, const TFilter * filter = nullptr)
{
    for (int i = 0; i < threshold; i++)
    {
        results[i] = 0;
    }

    threshold = std::min(num_elements, threshold);
    int current_max = 0;
    int cur;
    int z;
    for (int i = 0; i < num_elements; i++)
    {
        if constexpr (!std::is_same_v<TFilter, void>)
        {
            if (filter[i] == 0)
                continue;
        }

        /// Starting from the highest kept value, look for the first one lower than the given element
        for (cur = current_max; cur > 0; cur--)
        {
            if (data[i] > data[results[cur - 1]])
                break;
        }

        if (cur < threshold)
        {
            /// Move all the higher values one position to the right
            for (z = std::min(threshold - 1, current_max); z > cur; z--)
                results[z] = results[z - 1];

            if (current_max < threshold)
                ++current_max;

            /// Insert the element into the given position
            results[cur] = i;
        }
    }

    return current_max;
}

template <typename T>
struct SortableItem
{
    T a;
    size_t b;
    bool operator<(const SortableItem & other) const { return (this->a < other.a); }
};

template <typename TColumn, typename TFilter = void>
size_t getFirstNElements_high_threshold(
    const TColumn * data, size_t num_elements, size_t threshold, size_t * results, const TFilter * filter = nullptr)
{
    std::vector<SortableItem<TColumn>> dataIndexed(num_elements);
    size_t num_elements_filtered = 0;

    for (size_t i = 0; i < num_elements; i++)
    {
        if constexpr (!std::is_same_v<TFilter, void>)
        {
            if (filter[i] == 0)
                continue;
        }

        dataIndexed.data()[num_elements_filtered].a = data[i];
        dataIndexed.data()[num_elements_filtered].b = i;
        num_elements_filtered++;
    }

    threshold = std::min(num_elements_filtered, threshold);

    std::nth_element(dataIndexed.data(), dataIndexed.data() + threshold, dataIndexed.data() + num_elements_filtered);
    std::sort(dataIndexed.data(), dataIndexed.data() + threshold);

    for (size_t i = 0; i < threshold; i++)
    {
        results[i] = dataIndexed[i].b;
    }

    return threshold;
}

static const size_t THRESHOLD_MAX_CUSTOM_FUNCTION = 1000;

template <typename TColumn>
size_t getFirstNElements(const TColumn * data, size_t num_elements, size_t threshold, size_t * results, const UInt8 * filter = nullptr)
{
    if (threshold < THRESHOLD_MAX_CUSTOM_FUNCTION)
    {
        if (filter != nullptr)
            return getFirstNElements_low_threshold(data, num_elements, threshold, results, filter);
        else
            return getFirstNElements_low_threshold(data, num_elements, threshold, results);
    }
    else
    {
        if (filter != nullptr)
            return getFirstNElements_high_threshold(data, num_elements, threshold, results, filter);
        else
            return getFirstNElements_high_threshold(data, num_elements, threshold, results);
    }
}
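/// Strategy selection note: for a small threshold the insertion-style scan above
/// is cheaper, since the candidate array stays tiny and cache-resident (O(n * k)
/// with a small constant); for a large threshold, std::nth_element performs an
/// O(n) partial selection and only the k winners are sorted (O(k log k)).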

template <typename TColumnA, bool is_plain_a, bool use_column_b, typename TColumnB, bool is_plain_b>
class AggregateFunctionGroupArraySorted : public IAggregateFunctionDataHelper<
    AggregateFunctionGroupArraySortedData<TColumnA, use_column_b, TColumnB>,
    AggregateFunctionGroupArraySorted<TColumnA, is_plain_a, use_column_b, TColumnB, is_plain_b>>
{
protected:
    using State = AggregateFunctionGroupArraySortedData<TColumnA, use_column_b, TColumnB>;
    using Base = IAggregateFunctionDataHelper<
        AggregateFunctionGroupArraySortedData<TColumnA, use_column_b, TColumnB>,
        AggregateFunctionGroupArraySorted>;

    UInt64 threshold;
    DataTypePtr & input_data_type;
    mutable std::mutex mutex;

    static void deserializeAndInsert(StringRef str, IColumn & data_to);

public:
    AggregateFunctionGroupArraySorted(UInt64 threshold_, const DataTypes & argument_types_, const Array & params)
        : IAggregateFunctionDataHelper<
            AggregateFunctionGroupArraySortedData<TColumnA, use_column_b, TColumnB>,
            AggregateFunctionGroupArraySorted>(argument_types_, params)
        , threshold(threshold_)
        , input_data_type(this->argument_types[0])
    {
    }

    void create(AggregateDataPtr place) const override
    {
        Base::create(place);
        this->data(place).threshold = threshold;
    }

    String getName() const override { return "groupArraySorted"; }

    DataTypePtr getReturnType() const override { return std::make_shared<DataTypeArray>(input_data_type); }

    bool allocatesMemoryInArena() const override
    {
        if constexpr (std::is_same_v<TColumnA, StringRef>)
            return true;
        else
            return false;
    }

    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
    {
        State & data = this->data(place);
        if constexpr (use_column_b)
        {
            data.add(
                readItem<TColumnA, is_plain_a>(columns[0], arena, row_num), readItem<TColumnB, is_plain_b>(columns[1], arena, row_num));
        }
        else
        {
            data.add(readItem<TColumnA, is_plain_a>(columns[0], arena, row_num));
        }
    }

    template <typename TColumn, bool is_plain, typename TFunc>
    void
    forFirstRows(size_t batch_size, const IColumn ** columns, size_t data_column, Arena * arena, ssize_t if_argument_pos, TFunc func) const
    {
        const TColumn * values = nullptr;
        std::unique_ptr<std::vector<TColumn>> values_vector;
        std::vector<size_t> best_rows(threshold);

        if constexpr (std::is_same_v<TColumn, StringRef>)
        {
            values_vector.reset(new std::vector<TColumn>(batch_size));
            for (size_t i = 0; i < batch_size; i++)
                (*values_vector)[i] = readItem<TColumn, is_plain>(columns[data_column], arena, i);
            values = (*values_vector).data();
        }
        else
        {
            const auto & column = assert_cast<const ColumnVector<TColumn> &>(*columns[data_column]);
            values = column.getData().data();
        }

        const UInt8 * filter = nullptr;
        StringRef refFilter;

        if (if_argument_pos >= 0)
        {
            refFilter = columns[if_argument_pos]->getRawData();
            filter = reinterpret_cast<const UInt8 *>(refFilter.data);
        }

        size_t num_elements = getFirstNElements(values, batch_size, threshold, best_rows.data(), filter);
        for (size_t i = 0; i < num_elements; i++)
        {
            func(best_rows[i], values);
        }
    }

    void addBatchSinglePlace(
        size_t batch_size, AggregateDataPtr place, const IColumn ** columns, Arena * arena, ssize_t if_argument_pos) const override
    {
        State & data = this->data(place);

        if constexpr (use_column_b)
        {
            forFirstRows<TColumnB, is_plain_b>(
                batch_size, columns, 1, arena, if_argument_pos, [columns, &arena, &data](size_t row, const TColumnB * values)
                {
                    data.add(readItem<TColumnA, is_plain_a>(columns[0], arena, row), values[row]);
                });
        }
        else
        {
            forFirstRows<TColumnA, is_plain_a>(
                batch_size, columns, 0, arena, if_argument_pos, [&data](size_t row, const TColumnA * values)
                {
                    data.add(values[row]);
                });
        }
    }

    void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
    {
        this->data(place).merge(this->data(rhs));
    }

    void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
    {
        this->data(place).serialize(buf);
    }

    void
    deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
    {
        this->data(place).deserialize(buf, arena);
    }

    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * /*arena*/) const override
    {
        ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
        ColumnArray::Offsets & offsets_to = arr_to.getOffsets();

        auto & values = this->data(place).values;
        offsets_to.push_back(offsets_to.back() + values.size());

        IColumn & data_to = arr_to.getData();
        for (auto value : values)
        {
            if constexpr (std::is_same_v<TColumnA, StringRef>)
            {
                auto str = State::itemValue(value);
                if constexpr (is_plain_a)
                {
                    data_to.insertData(str.data, str.size);
                }
                else
                {
                    data_to.deserializeAndInsertFromArena(str.data);
                }
            }
            else
            {
                data_to.insert(State::itemValue(value));
            }
        }
    }
};
}
src/AggregateFunctions/AggregateFunctionGroupArraySortedData.h (new file, 162 lines)
@ -0,0 +1,162 @@
#pragma once

#include <IO/ReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>
#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>


static inline constexpr UInt64 GROUP_SORTED_DEFAULT_THRESHOLD = 0xFFFFFF;

namespace DB
{
template <typename T>
static void writeOneItem(WriteBuffer & buf, T item)
{
    if constexpr (std::numeric_limits<T>::is_signed)
    {
        writeVarInt(item, buf);
    }
    else
    {
        writeVarUInt(item, buf);
    }
}

static void writeOneItem(WriteBuffer & buf, const StringRef & item)
{
    writeBinary(item, buf);
}

template <typename T>
static void readOneItem(ReadBuffer & buf, Arena * /*arena*/, T & item)
{
    if constexpr (std::numeric_limits<T>::is_signed)
    {
        DB::Int64 val;
        readVarT(val, buf);
        item = val;
    }
    else
    {
        DB::UInt64 val;
        readVarT(val, buf);
        item = val;
    }
}

static void readOneItem(ReadBuffer & buf, Arena * arena, StringRef & item)
{
    item = readStringBinaryInto(*arena, buf);
}

template <typename Storage>
struct AggregateFunctionGroupArraySortedDataBase
{
    typedef typename Storage::value_type ValueType;
    AggregateFunctionGroupArraySortedDataBase(UInt64 threshold_ = GROUP_SORTED_DEFAULT_THRESHOLD) : threshold(threshold_) { }

    virtual ~AggregateFunctionGroupArraySortedDataBase() { }
    inline void narrowDown()
    {
        while (values.size() > threshold)
            values.erase(--values.end());
    }

    void merge(const AggregateFunctionGroupArraySortedDataBase & other)
    {
        values.merge(Storage(other.values));
        narrowDown();
    }

    void serialize(WriteBuffer & buf) const
    {
        writeOneItem(buf, UInt64(values.size()));
        for (auto value : values)
        {
            serializeItem(buf, value);
        }
    }

    virtual void serializeItem(WriteBuffer & buf, ValueType & val) const = 0;
    virtual ValueType deserializeItem(ReadBuffer & buf, Arena * arena) const = 0;

    void deserialize(ReadBuffer & buf, Arena * arena)
    {
        values.clear();
        UInt64 length;
        readOneItem(buf, nullptr, length);

        while (length--)
        {
            values.insert(deserializeItem(buf, arena));
        }

        narrowDown();
    }

    UInt64 threshold;
    Storage values;
};

template <typename T, bool expr_sorted, typename TIndex>
struct AggregateFunctionGroupArraySortedData
{
};

template <typename T, typename TIndex>
struct AggregateFunctionGroupArraySortedData<T, true, TIndex> : public AggregateFunctionGroupArraySortedDataBase<std::multimap<TIndex, T>>
{
    using Base = AggregateFunctionGroupArraySortedDataBase<std::multimap<TIndex, T>>;
    using Base::Base;

    void add(T item, TIndex weight)
    {
        Base::values.insert({weight, item});
        Base::narrowDown();
    }

    void serializeItem(WriteBuffer & buf, typename Base::ValueType & value) const override
    {
        writeOneItem(buf, value.first);
        writeOneItem(buf, value.second);
    }

    virtual typename Base::ValueType deserializeItem(ReadBuffer & buf, Arena * arena) const override
    {
        TIndex first;
        T second;
        readOneItem(buf, arena, first);
        readOneItem(buf, arena, second);

        return {first, second};
    }

    static T itemValue(typename Base::ValueType & value) { return value.second; }
};

template <typename T, typename TIndex>
struct AggregateFunctionGroupArraySortedData<T, false, TIndex> : public AggregateFunctionGroupArraySortedDataBase<std::multiset<T>>
{
    using Base = AggregateFunctionGroupArraySortedDataBase<std::multiset<T>>;
    using Base::Base;

    void add(T item)
    {
        Base::values.insert(item);
        Base::narrowDown();
    }

    void serializeItem(WriteBuffer & buf, typename Base::ValueType & value) const override { writeOneItem(buf, value); }

    typename Base::ValueType deserializeItem(ReadBuffer & buf, Arena * arena) const override
    {
        T value;
        readOneItem(buf, arena, value);
        return value;
    }

    static T itemValue(typename Base::ValueType & value) { return value; }
};
}
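The storage strategy underlying both specializations is simply an ordered container trimmed after every insert. A self-contained sketch of the same idea (illustrative only, not ClickHouse code):

``` cpp
#include <iostream>
#include <set>

/// Keep only the `threshold` smallest items seen so far,
/// mirroring AggregateFunctionGroupArraySortedDataBase::narrowDown().
template <typename T>
struct TopNSmallest
{
    size_t threshold;
    std::multiset<T> values;

    void add(T item)
    {
        values.insert(item);
        while (values.size() > threshold)
            values.erase(--values.end());   // drop the current largest
    }
};

int main()
{
    TopNSmallest<int> top{3, {}};
    for (int x : {5, 1, 9, 3, 7, 2})
        top.add(x);
    for (int x : top.values)
        std::cout << x << ' ';              // prints: 1 2 3
}
```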
@ -59,6 +59,7 @@ void registerAggregateFunctionNothing(AggregateFunctionFactory &);
void registerAggregateFunctionExponentialMovingAverage(AggregateFunctionFactory &);
void registerAggregateFunctionSparkbar(AggregateFunctionFactory &);
void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &);
void registerAggregateFunctionGroupArraySorted(AggregateFunctionFactory & factory);

class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@ -130,6 +131,7 @@ void registerAggregateFunctions()
    registerAggregateFunctionIntervalLengthSum(factory);
    registerAggregateFunctionExponentialMovingAverage(factory);
    registerAggregateFunctionSparkbar(factory);
    registerAggregateFunctionGroupArraySorted(factory);

    registerWindowFunctions(factory);
}
@ -220,7 +220,7 @@ static void incrementProfileEventsBlock(Block & dst, const Block & src)
}

std::atomic_flag exit_on_signal = ATOMIC_FLAG_INIT;
std::atomic_flag exit_on_signal;

class QueryInterruptHandler : private boost::noncopyable
{
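Dropping the initializer relies on C++20 semantics, under which `std::atomic_flag` is value-initialized to the clear state and the `ATOMIC_FLAG_INIT` macro is deprecated; a minimal illustration:

``` cpp
#include <atomic>

std::atomic_flag flag;                        // clear on construction since C++20
// std::atomic_flag flag = ATOMIC_FLAG_INIT;  // pre-C++20 spelling, now deprecated
```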
@ -35,10 +35,10 @@ public:
    {}

    // Format message with fmt::format, like the logging functions.
    template <typename ...Args>
    Exception(int code, const std::string & fmt, Args&&... args)
        : Exception(fmt::format(fmt::runtime(fmt), std::forward<Args>(args)...), code)
    {}
    template <typename... Args>
    Exception(int code, fmt::format_string<Args...> fmt, Args &&... args) : Exception(fmt::format(fmt, std::forward<Args>(args)...), code)
    {
    }

    struct CreateFromPocoTag {};
    struct CreateFromSTDTag {};
@ -52,10 +52,10 @@ public:
    const char * what() const throw() override { return message().data(); }

    /// Add something to the existing message.
    template <typename ...Args>
    void addMessage(const std::string& format, Args&&... args)
    template <typename... Args>
    void addMessage(fmt::format_string<Args...> format, Args &&... args)
    {
        extendedMessage(fmt::format(fmt::runtime(format), std::forward<Args>(args)...));
        extendedMessage(fmt::format(format, std::forward<Args>(args)...));
    }

    void addMessage(const std::string& message)
@ -117,10 +117,10 @@ public:
    ParsingException(int code, const std::string & message);

    // Format message with fmt::format, like the logging functions.
    template <typename ...Args>
    ParsingException(int code, const std::string & fmt, Args&&... args)
        : Exception(fmt::format(fmt::runtime(fmt), std::forward<Args>(args)...), code)
    {}
    template <typename... Args>
    ParsingException(int code, fmt::format_string<Args...> fmt, Args &&... args) : Exception(code, fmt, std::forward<Args>(args)...)
    {
    }

    std::string displayText() const
|
||||
|
||||
std::lock_guard cache_lock(mutex);
|
||||
|
||||
#ifndef NDEBUG
|
||||
assertCacheCorrectness(key, cache_lock);
|
||||
#endif
|
||||
|
||||
/// Get all segments which intersect with the given range.
|
||||
auto file_segments = getImpl(key, range, cache_lock);
|
||||
|
||||
@ -315,7 +319,7 @@ FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t
|
||||
|
||||
LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
|
||||
const Key & key, size_t offset, size_t size, FileSegment::State state,
|
||||
std::lock_guard<std::mutex> & /* cache_lock */)
|
||||
std::lock_guard<std::mutex> & cache_lock)
|
||||
{
|
||||
/// Create a file segment cell and put it in `files` map by [key][offset].
|
||||
|
||||
@ -323,8 +327,10 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
|
||||
return nullptr; /// Empty files are not cached.
|
||||
|
||||
if (files[key].contains(offset))
|
||||
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
|
||||
"Cache already exists for key: `{}`, offset: {}, size: {}", keyToStr(key), offset, size);
|
||||
throw Exception(
|
||||
ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
|
||||
"Cache already exists for key: `{}`, offset: {}, size: {}.\nCurrent cache structure: {}",
|
||||
keyToStr(key), offset, size, dumpStructureImpl(key, cache_lock));
|
||||
|
||||
auto file_segment = std::make_shared<FileSegment>(offset, size, key, this, state);
|
||||
FileSegmentCell cell(std::move(file_segment), queue);
|
||||
@ -340,8 +346,10 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
|
||||
|
||||
auto [it, inserted] = offsets.insert({offset, std::move(cell)});
|
||||
if (!inserted)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR,
|
||||
"Failed to insert into cache key: `{}`, offset: {}, size: {}", keyToStr(key), offset, size);
|
||||
throw Exception(
|
||||
ErrorCodes::LOGICAL_ERROR,
|
||||
"Failed to insert into cache key: `{}`, offset: {}, size: {}",
|
||||
keyToStr(key), offset, size);
|
||||
|
||||
return &(it->second);
|
||||
}
|
||||
@ -523,8 +531,8 @@ void LRUFileCache::loadCacheInfoIntoMemory()
|
||||
std::lock_guard cache_lock(mutex);
|
||||
|
||||
Key key;
|
||||
UInt64 offset;
|
||||
size_t size;
|
||||
UInt64 offset = 0;
|
||||
size_t size = 0;
|
||||
std::vector<FileSegmentCell *> cells;
|
||||
|
||||
/// cache_base_path / key_prefix / key / offset
|
||||
@ -687,22 +695,32 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(FileSegmentPtr file_segment_, LRU
|
||||
}
|
||||
}
|
||||
|
||||
String LRUFileCache::dumpStructure(const Key & key_)
|
||||
String LRUFileCache::dumpStructure(const Key & key)
|
||||
{
|
||||
std::lock_guard cache_lock(mutex);
|
||||
return dumpStructureImpl(key, cache_lock);
|
||||
}
|
||||
|
||||
String LRUFileCache::dumpStructureImpl(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */)
|
||||
{
|
||||
WriteBufferFromOwnString result;
|
||||
for (auto it = queue.begin(); it != queue.end(); ++it)
|
||||
{
|
||||
auto [key, offset] = *it;
|
||||
if (key == key_)
|
||||
{
|
||||
auto * cell = getCell(key, offset, cache_lock);
|
||||
result << (it != queue.begin() ? ", " : "") << cell->file_segment->range().toString();
|
||||
result << "(state: " << cell->file_segment->download_state << ")";
|
||||
}
|
||||
}
|
||||
const auto & cells_by_offset = files[key];
|
||||
|
||||
for (const auto & [offset, cell] : cells_by_offset)
|
||||
result << cell.file_segment->getInfoForLog() << "\n";
|
||||
|
||||
return result.str();
|
||||
}
|
||||
|
||||
void LRUFileCache::assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & /* cache_lock */)
|
||||
{
|
||||
const auto & cells_by_offset = files[key];
|
||||
|
||||
for (const auto & [_, cell] : cells_by_offset)
|
||||
{
|
||||
const auto & file_segment = cell.file_segment;
|
||||
file_segment->assertCorrectness();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -25,6 +25,7 @@ namespace DB
class IFileCache : private boost::noncopyable
{
    friend class FileSegment;
    friend struct FileSegmentsHolder;

public:
    using Key = UInt128;
@ -196,6 +197,8 @@ private:
    FileSegments splitRangeIntoEmptyCells(
        const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);

    String dumpStructureImpl(const Key & key_, std::lock_guard<std::mutex> & cache_lock);

public:
    struct Stat
    {
@ -208,6 +211,7 @@ public:
    Stat getStat();

    String dumpStructure(const Key & key_) override;
    void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
};

}
@ -159,7 +159,18 @@ void FileSegment::setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_)
    remote_file_reader = remote_file_reader_;
}

void FileSegment::write(const char * from, size_t size)
void FileSegment::resetRemoteFileReader()
{
    if (!isDownloader())
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Only downloader can use remote filesystem file reader");

    if (!remote_file_reader)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Remote file reader does not exist");

    remote_file_reader.reset();
}

void FileSegment::write(const char * from, size_t size, size_t offset_)
{
    if (!size)
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Writing zero size is not allowed");
@ -174,8 +185,24 @@ void FileSegment::write(const char * from, size_t size)
            "Only downloader can do the downloading. (CallerId: {}, DownloaderId: {})",
            getCallerId(), downloader_id);

    if (downloaded_size == range().size())
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
            "Attempt to write {} bytes to offset: {}, but current file segment is already fully downloaded",
            size, offset_);

    auto download_offset = range().left + downloaded_size;
    if (offset_ != download_offset)
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
            "Attempt to write {} bytes to offset: {}, but current download offset is {}",
            size, offset_, download_offset);

    if (!cache_writer)
    {
        if (downloaded_size > 0)
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                "Cache writer was finalized (downloaded size: {}, state: {})",
                downloaded_size, stateToString(download_state));

        auto download_path = cache->getPathInLocalCache(key(), offset());
        cache_writer = std::make_unique<WriteBufferFromFile>(download_path);
    }
@ -190,19 +217,26 @@ void FileSegment::write(const char * from, size_t size)

        downloaded_size += size;
    }
    catch (...)
    catch (Exception & e)
    {
        std::lock_guard segment_lock(mutex);

        LOG_ERROR(log, "Failed to write to cache. File segment info: {}", getInfoForLogImpl(segment_lock));
        auto info = getInfoForLogImpl(segment_lock);
        e.addMessage("while writing into cache, info: " + info);

        LOG_ERROR(log, "Failed to write to cache. File segment info: {}", info);

        download_state = State::PARTIALLY_DOWNLOADED_NO_CONTINUATION;

        cache_writer->finalize();
        cache_writer.reset();

        cv.notify_all();

        throw;
    }

    assert(getDownloadOffset() == offset_ + size);
}

FileSegment::State FileSegment::wait()
@ -270,7 +304,6 @@ void FileSegment::setDownloaded(std::lock_guard<std::mutex> & /* segment_lock */
    download_state = State::DOWNLOADED;
    is_downloaded = true;

    assert(cache_writer);
    if (cache_writer)
    {
        cache_writer->finalize();
@ -299,107 +332,125 @@ void FileSegment::completeBatchAndResetDownloader()

void FileSegment::complete(State state)
{
    {
        std::lock_guard segment_lock(mutex);

        bool is_downloader = downloader_id == getCallerId();
        if (!is_downloader)
        {
            cv.notify_all();
            throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
                "File segment can be completed only by downloader or downloader's FileSegmentsHodler");
        }

        if (state != State::DOWNLOADED
            && state != State::PARTIALLY_DOWNLOADED
            && state != State::PARTIALLY_DOWNLOADED_NO_CONTINUATION)
        {
            cv.notify_all();
            throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
                "Cannot complete file segment with state: {}", stateToString(state));
        }

        download_state = state;
    }

    completeImpl();
    cv.notify_all();
}

void FileSegment::complete()
{
    {
        std::lock_guard segment_lock(mutex);

        if (download_state == State::SKIP_CACHE || detached)
            return;

        if (download_state != State::DOWNLOADED && getDownloadedSize(segment_lock) == range().size())
            setDownloaded(segment_lock);

        if (download_state == State::DOWNLOADING || download_state == State::EMPTY)
            download_state = State::PARTIALLY_DOWNLOADED;
    }

    completeImpl(true);
    cv.notify_all();
}

void FileSegment::completeImpl(bool allow_non_strict_checking)
{
    /// cache lock is always taken before segment lock.
    std::lock_guard cache_lock(cache->mutex);
    std::lock_guard segment_lock(mutex);

    bool download_can_continue = false;

    if (download_state == State::PARTIALLY_DOWNLOADED
        || download_state == State::PARTIALLY_DOWNLOADED_NO_CONTINUATION)
    bool is_downloader = downloader_id == getCallerId();
    if (!is_downloader)
    {
        bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);
        download_can_continue = !is_last_holder && download_state == State::PARTIALLY_DOWNLOADED;
        cv.notify_all();
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
            "File segment can be completed only by downloader or downloader's FileSegmentsHodler");
    }

    if (!download_can_continue)
    if (state != State::DOWNLOADED
        && state != State::PARTIALLY_DOWNLOADED
        && state != State::PARTIALLY_DOWNLOADED_NO_CONTINUATION)
    {
        cv.notify_all();
        throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
            "Cannot complete file segment with state: {}", stateToString(state));
    }

    download_state = state;

    try
    {
        completeImpl(cache_lock, segment_lock);
    }
    catch (...)
    {
        if (!downloader_id.empty() && downloader_id == getCallerIdImpl(true))
            downloader_id.clear();

        cv.notify_all();
        throw;
    }

    cv.notify_all();
}

void FileSegment::complete(std::lock_guard<std::mutex> & cache_lock)
{
    std::lock_guard segment_lock(mutex);

    if (download_state == State::SKIP_CACHE || detached)
        return;

    if (download_state != State::DOWNLOADED && getDownloadedSize(segment_lock) == range().size())
        setDownloaded(segment_lock);

    if (download_state == State::DOWNLOADING || download_state == State::EMPTY)
    {
        /// Segment state can be changed from DOWNLOADING or EMPTY only if the caller is the
        /// downloader or the only owner of the segment.

        bool can_update_segment_state = downloader_id == getCallerIdImpl(true)
            || cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);

        if (can_update_segment_state)
            download_state = State::PARTIALLY_DOWNLOADED;
    }

    try
    {
        completeImpl(cache_lock, segment_lock, /* allow_non_strict_checking */true);
    }
    catch (...)
    {
        if (!downloader_id.empty() && downloader_id == getCallerIdImpl(true))
            downloader_id.clear();

        cv.notify_all();
        throw;
    }

    cv.notify_all();
}

void FileSegment::completeImpl(std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & segment_lock, bool allow_non_strict_checking)
{
    bool is_last_holder = cache->isLastFileSegmentHolder(key(), offset(), cache_lock, segment_lock);

    if (is_last_holder
        && (download_state == State::PARTIALLY_DOWNLOADED || download_state == State::PARTIALLY_DOWNLOADED_NO_CONTINUATION))
    {
        size_t current_downloaded_size = getDownloadedSize(segment_lock);
        if (current_downloaded_size == 0)
        {
        size_t current_downloaded_size = getDownloadedSize(segment_lock);
        if (current_downloaded_size == 0)
        {
            download_state = State::SKIP_CACHE;
            LOG_TEST(log, "Remove cell {} (nothing downloaded)", range().toString());
            cache->remove(key(), offset(), cache_lock, segment_lock);
            download_state = State::SKIP_CACHE;
            LOG_TEST(log, "Remove cell {} (nothing downloaded)", range().toString());
            cache->remove(key(), offset(), cache_lock, segment_lock);
        }
        else
        {
            /**
             * Only last holder of current file segment can resize the cell,
             * because there is an invariant that file segments returned to users
             * in FileSegmentsHolder represent a contiguous range, so we can resize
             * it only when nobody needs it.
             */
            LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size);
            cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock);
        }

        detached = true;
    }
    else if (is_last_holder)
    {
        /**
         * Only last holder of current file segment can resize the cell,
         * because there is an invariant that file segments returned to users
         * in FileSegmentsHolder represent a contiguous range, so we can resize
         * it only when nobody needs it.
         */
        LOG_TEST(log, "Resize cell {} to downloaded: {}", range().toString(), current_downloaded_size);
        cache->reduceSizeToDownloaded(key(), offset(), cache_lock, segment_lock);
        detached = true;

        detached = true;
    }
        if (cache_writer)
        {
            cache_writer->finalize();
            cache_writer.reset();
            remote_file_reader.reset();
        }
    }

    if (!downloader_id.empty() && downloader_id == getCallerIdImpl(allow_non_strict_checking))
    if (!downloader_id.empty() && (downloader_id == getCallerIdImpl(allow_non_strict_checking) || is_last_holder))
    {
        LOG_TEST(log, "Clearing downloader id: {}, current state: {}", downloader_id, stateToString(download_state));
        downloader_id.clear();
    }

    if (!download_can_continue && cache_writer)
    {
        cache_writer->finalize();
        cache_writer.reset();
        remote_file_reader.reset();
    }

    assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(cache->getPathInLocalCache(key(), offset())) > 0);
    assertCorrectnessImpl(segment_lock);
}

String FileSegment::getInfoForLog() const
@ -440,6 +491,53 @@ String FileSegment::stateToString(FileSegment::State state)
    __builtin_unreachable();
}

void FileSegment::assertCorrectness() const
{
    std::lock_guard segment_lock(mutex);
    assertCorrectnessImpl(segment_lock);
}

void FileSegment::assertCorrectnessImpl(std::lock_guard<std::mutex> & /* segment_lock */) const
{
    assert(downloader_id.empty() == (download_state != FileSegment::State::DOWNLOADING));
    assert(!downloader_id.empty() == (download_state == FileSegment::State::DOWNLOADING));
    assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(cache->getPathInLocalCache(key(), offset())) > 0);
}

FileSegmentsHolder::~FileSegmentsHolder()
{
    /// In CacheableReadBufferFromRemoteFS file segment's downloader removes file segments from
    /// FileSegmentsHolder right after calling file_segment->complete(), so on destruction here
    /// remain only uncompleted file segments.

    IFileCache * cache = nullptr;

    for (auto file_segment_it = file_segments.begin(); file_segment_it != file_segments.end();)
    {
        auto current_file_segment_it = file_segment_it;
        auto & file_segment = *current_file_segment_it;

        if (!cache)
            cache = file_segment->cache;

        try
        {
            /// File segment pointer must be reset right after calling complete() and
            /// under the same mutex, because complete() checks for segment pointers.
            std::lock_guard cache_lock(cache->mutex);

            file_segment->complete(cache_lock);

            file_segment_it = file_segments.erase(current_file_segment_it);
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
            assert(false);
        }
    }
}

String FileSegmentsHolder::toString()
{
    String ranges;
@ -95,12 +95,14 @@ public:

bool reserve(size_t size);

void write(const char * from, size_t size);
void write(const char * from, size_t size, size_t offset_);

RemoteFileReaderPtr getRemoteFileReader();

void setRemoteFileReader(RemoteFileReaderPtr remote_file_reader_);

void resetRemoteFileReader();

String getOrSetDownloader();

String getDownloader() const;
@ -121,16 +123,32 @@ public:

String getInfoForLog() const;

void assertCorrectness() const;

private:
size_t availableSize() const { return reserved_size - downloaded_size; }
bool lastFileSegmentHolder() const;
void complete();
void completeImpl(bool allow_non_strict_checking = false);
void setDownloaded(std::lock_guard<std::mutex> & segment_lock);
static String getCallerIdImpl(bool allow_non_strict_checking = false);
void resetDownloaderImpl(std::lock_guard<std::mutex> & segment_lock);

size_t getDownloadedSize(std::lock_guard<std::mutex> & segment_lock) const;
String getInfoForLogImpl(std::lock_guard<std::mutex> & segment_lock) const;
void assertCorrectnessImpl(std::lock_guard<std::mutex> & segment_lock) const;

void setDownloaded(std::lock_guard<std::mutex> & segment_lock);

bool lastFileSegmentHolder() const;

/// complete() without any completion state is called from destructor of
/// FileSegmentsHolder. complete() might check if the caller of the method
/// is the last alive holder of the segment. Therefore, complete() and destruction
/// of the file segment pointer must be done under the same cache mutex.
void complete(std::lock_guard<std::mutex> & cache_lock);

void completeImpl(
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock, bool allow_non_strict_checking = false);

static String getCallerIdImpl(bool allow_non_strict_checking = false);

void resetDownloaderImpl(std::lock_guard<std::mutex> & segment_lock);

const Range segment_range;

@ -169,28 +187,7 @@ struct FileSegmentsHolder : private boost::noncopyable
explicit FileSegmentsHolder(FileSegments && file_segments_) : file_segments(std::move(file_segments_)) {}
FileSegmentsHolder(FileSegmentsHolder && other) : file_segments(std::move(other.file_segments)) {}

~FileSegmentsHolder()
{
/// In CacheableReadBufferFromRemoteFS file segment's downloader removes file segments from
/// FileSegmentsHolder right after calling file_segment->complete(), so on destruction here
/// remain only uncompleted file segments.

for (auto & segment : file_segments)
{
try
{
segment->complete();
}
catch (...)
{
#ifndef NDEBUG
throw;
#else
tryLogCurrentException(__PRETTY_FUNCTION__);
#endif
}
}
}
~FileSegmentsHolder();

FileSegments file_segments{};
@ -67,6 +67,9 @@ struct FixedHashTableCalculatedSize
{
size_t getSize(const Cell * buf, const typename Cell::State & state, size_t num_cells) const
{
if (!buf)
return 0;

size_t res = 0;
for (const Cell * end = buf + num_cells; buf != end; ++buf)
if (!buf->isZero(state))
@ -76,6 +79,9 @@ struct FixedHashTableCalculatedSize

bool isEmpty(const Cell * buf, const typename Cell::State & state, size_t num_cells) const
{
if (!buf)
return true;

for (const Cell * end = buf + num_cells; buf != end; ++buf)
if (!buf->isZero(state))
return false;

@ -94,6 +94,12 @@ public:

TwoLevelHashTable() = default;

explicit TwoLevelHashTable(size_t size_hint)
{
for (auto & impl : impls)
impl.reserve(size_hint / NUM_BUCKETS);
}
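For scale: assuming NUM_BUCKETS keeps its usual value of 256 in this table, a size_hint of 1'000'000 makes each sub-table reserve roughly 3'906 cells, so the whole preallocation matches the hint without a single oversized bucket.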

/// Copy the data from another (normal) hash table. It should have the same hash function.
template <typename Source>
explicit TwoLevelHashTable(const Source & src)

@ -285,6 +285,9 @@
\
M(MainConfigLoads, "Number of times the main configuration was reloaded.") \
\
M(AggregationPreallocatedElementsInHashTables, "How many elements were preallocated in hash tables for aggregation.") \
M(AggregationHashTablesInitializedAsTwoLevel, "How many hash tables were inited as two-level for aggregation.") \
\
M(MergeTreeMetadataCacheGet, "Number of rocksdb reads(used for merge tree metadata cache)") \
M(MergeTreeMetadataCachePut, "Number of rocksdb puts(used for merge tree metadata cache)") \
M(MergeTreeMetadataCacheDelete, "Number of rocksdb deletes(used for merge tree metadata cache)") \
46
src/Common/RangeGenerator.h
Normal file
@ -0,0 +1,46 @@
#pragma once

#include <optional>
#include <cmath>

namespace DB
{

class RangeGenerator
{
public:
explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0)
: from(range_start), range_step(range_step_), total_size(total_size_)
{
}

size_t totalRanges() const { return static_cast<size_t>(round(static_cast<float>(total_size - from) / range_step)); }

using Range = std::pair<size_t, size_t>;

// return upper exclusive range of values, i.e. [from_range, to_range)
std::optional<Range> nextRange()
{
if (from >= total_size)
{
return std::nullopt;
}

auto to = from + range_step;
if (to >= total_size)
{
to = total_size;
}

Range range{from, to};
from = to;
return range;
}

private:
size_t from;
size_t range_step;
size_t total_size;
};

}
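A minimal usage sketch of the new RangeGenerator (processRange is a hypothetical callback, not part of this commit). Splitting a 10 MiB object into 4 MiB steps yields [0, 4194304), [4194304, 8388608) and the short tail [8388608, 10485760):

DB::RangeGenerator generator(10 * 1024 * 1024, 4 * 1024 * 1024);
while (auto range = generator.nextRange())
    processRange(range->first, range->second); /// upper bound of each range is exclusive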

@ -4,6 +4,8 @@
#if defined(__linux__)
# include <cstdio>
# include <mntent.h>
# include <sys/stat.h>
# include <sys/sysmacros.h>
#endif
#include <cerrno>
#include <Poco/Version.h>
@ -13,6 +15,9 @@
#include <unistd.h>
#include <sys/types.h>
#include <utime.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/Operators.h>
#include <IO/WriteBufferFromString.h>

namespace fs = std::filesystem;

@ -24,6 +29,7 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
extern const int SYSTEM_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int CANNOT_STAT;
extern const int CANNOT_STATVFS;
extern const int PATH_ACCESS_DENIED;
extern const int CANNOT_CREATE_FILE;
@ -57,6 +63,68 @@ std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path)
return std::make_unique<TemporaryFile>(path);
}

#if !defined(__linux__)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path)
{
#if defined(__linux__)
struct stat sb;
if (lstat(path.c_str(), &sb))
throwFromErrnoWithPath("Cannot lstat " + path, path, ErrorCodes::CANNOT_STAT);
WriteBufferFromOwnString ss;
ss << major(sb.st_dev) << ":" << minor(sb.st_dev);
return ss.str();
#else
throw DB::Exception("The function getDeviceId is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED);
#endif
}

#if !defined(__linux__)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/rotational");
int rotational;
readText(rotational, in);
return rotational ? BlockDeviceType::ROT : BlockDeviceType::NONROT;
}
catch (...)
{
return BlockDeviceType::UNKNOWN;
}
#else
throw DB::Exception("The function getDeviceType is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED);
#endif
}

#if !defined(__linux__)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/read_ahead_kb");
int read_ahead_kb;
readText(read_ahead_kb, in);
return read_ahead_kb * 1024;
}
catch (...)
{
return static_cast<UInt64>(-1);
}
#else
throw DB::Exception("The function getDeviceType is supported on Linux only", ErrorCodes::NOT_IMPLEMENTED);
#endif
}

/// Returns name of filesystem mounted to mount_point
std::filesystem::path getMountPoint(std::filesystem::path absolute_path)
{
if (absolute_path.is_relative())
@ -18,6 +18,31 @@ using TemporaryFile = Poco::TemporaryFile;
bool enoughSpaceInDirectory(const std::string & path, size_t data_size);
std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path);

// Determine what block device is responsible for specified path
#if !defined(__linux__)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path);

enum class BlockDeviceType
{
UNKNOWN = 0, // we were unable to determine device type
NONROT = 1, // not a rotational device (SSD, NVME, etc)
ROT = 2 // rotational device (HDD)
};

// Try to determine block device type
#if !defined(__linux__)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id);

// Get size of read-ahead in bytes for specified block device
#if !defined(__linux__)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id);

/// Returns mount point of filesystem where absolute_path (must exist) is located
std::filesystem::path getMountPoint(std::filesystem::path absolute_path);
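A sketch of how these helpers compose on a Linux host (the path is illustrative; on other platforms the functions throw NOT_IMPLEMENTED):

String device_id = DB::getBlockDeviceId("/var/lib/clickhouse/data"); /// e.g. "259:0" (major:minor)
DB::BlockDeviceType type = DB::getBlockDeviceType(device_id); /// UNKNOWN if sysfs is unreadable
UInt64 read_ahead = DB::getBlockDeviceReadAheadBytes(device_id); /// static_cast<UInt64>(-1) on failure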

178
src/Common/format.h
Normal file
@ -0,0 +1,178 @@
#pragma once

#include <base/types.h>
#include <Common/Exception.h>
#include <Common/PODArray.h>
#include <Common/StringUtils/StringUtils.h>

namespace DB
{

namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}

namespace Format
{
using IndexPositions = PODArrayWithStackMemory<UInt64, 64>;

static inline void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res, UInt64 argument_number)
{
res = 0;
for (UInt64 pos = l; pos < r; ++pos)
{
if (!isNumericASCII(description[pos]))
throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS);
res = res * 10 + description[pos] - '0';
if (res >= argument_number)
throw Exception(
"Too big number for arguments, must be at most " + std::to_string(argument_number - 1), ErrorCodes::BAD_ARGUMENTS);
}
}

static inline void init(
const String & pattern,
size_t argument_number,
const std::vector<std::optional<String>> & constant_strings,
IndexPositions & index_positions,
std::vector<String> & substrings)
{
/// Is current position after open curly brace.
bool is_open_curly = false;
/// The position of last open token.
size_t last_open = -1;

/// Is formatting in a plain {} token.
std::optional<bool> is_plain_numbering;
UInt64 index_if_plain = 0;

/// Left position of adding substrings, just to the closed brace position or the start of the string.
/// Invariant --- the start of substring is in this position.
size_t start_pos = 0;

/// A flag to decide whether we should glue the constant strings.
bool glue_to_next = false;

/// Handling double braces (escaping).
auto double_brace_removal = [](String & str)
{
size_t i = 0;
bool should_delete = true;
str.erase(
std::remove_if(
str.begin(),
str.end(),
[&i, &should_delete, &str](char)
{
bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}');
++i;
if (is_double_brace && should_delete)
{
should_delete = false;
return true;
}
should_delete = true;
return false;
}),
str.end());
};

index_positions.emplace_back();

for (size_t i = 0; i < pattern.size(); ++i)
{
if (pattern[i] == '{')
{
/// Escaping handling
/// It is safe to access because of null termination
if (pattern[i + 1] == '{')
{
++i;
continue;
}

if (is_open_curly)
throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);

String to_add = String(pattern.data() + start_pos, i - start_pos);
double_brace_removal(to_add);
if (!glue_to_next)
substrings.emplace_back(to_add);
else
substrings.back() += to_add;

glue_to_next = false;

is_open_curly = true;
last_open = i + 1;
}
else if (pattern[i] == '}')
{
if (pattern[i + 1] == '}')
{
++i;
continue;
}

if (!is_open_curly)
throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);

is_open_curly = false;

if (last_open == i)
{
if (is_plain_numbering && !*is_plain_numbering)
throw Exception(
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
is_plain_numbering = true;
if (index_if_plain >= argument_number)
throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS);
index_positions.back() = index_if_plain++;
}
else
{
if (is_plain_numbering && *is_plain_numbering)
throw Exception(
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
is_plain_numbering = false;

UInt64 arg;
parseNumber(pattern, last_open, i, arg, argument_number);

if (arg >= argument_number)
throw Exception(
"Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS);

index_positions.back() = arg;
}

if (!constant_strings.empty() && constant_strings[index_positions.back()])
{
/// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
glue_to_next = true;
substrings.back() += *constant_strings[index_positions.back()];
}
else
index_positions.emplace_back(); /// Otherwise we commit arg number and proceed.

start_pos = i + 1;
}
}

if (is_open_curly)
throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS);

String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos);
double_brace_removal(to_add);

if (!glue_to_next)
substrings.emplace_back(to_add);
else
substrings.back() += to_add;

index_positions.pop_back();
}
}

}
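A small sketch of the contract of Format::init, tracing the parser above by hand (an illustration, not a test from this commit):

DB::Format::IndexPositions index_positions;
std::vector<String> substrings;
/// Two non-constant arguments, automatic numbering.
DB::Format::init("{} - {}", 2, {std::nullopt, std::nullopt}, index_positions, substrings);
/// Afterwards: substrings == {"", " - ", ""} and index_positions == {0, 1},
/// i.e. one more substring than index, with arguments spliced between substrings.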

@ -38,21 +38,7 @@ unsigned getCGroupLimitedCPUCores(unsigned default_cpu_count)
quota_count = ceil(static_cast<float>(cgroup_quota) / static_cast<float>(cgroup_period));
}

// Share number (typically a number relative to 1024) (2048 typically expresses 2 CPUs worth of processing)
// -1 for no share setup
int cgroup_share = read_from("/sys/fs/cgroup/cpu/cpu.shares", -1);
// Convert 1024 to no shares setup
if (cgroup_share == 1024)
cgroup_share = -1;

# define PER_CPU_SHARES 1024
unsigned share_count = default_cpu_count;
if (cgroup_share > -1)
{
share_count = ceil(static_cast<float>(cgroup_share) / static_cast<float>(PER_CPU_SHARES));
}

return std::min(default_cpu_count, std::min(share_count, quota_count));
return std::min(default_cpu_count, quota_count);
}
#endif // OS_LINUX
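Worked example for the surviving quota path: with cgroup_quota = 150000 and cgroup_period = 100000, quota_count = ceil(1.5) = 2, so a 16-core host reports min(16, 2) = 2 cores after this change; the removed cpu.shares heuristic no longer caps the count.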

@ -91,6 +77,7 @@ unsigned getNumberOfPhysicalCPUCores()
cpu_count = std::thread::hardware_concurrency();

#if defined(OS_LINUX)
/// TODO: add a setting for disabling that, similar to UseContainerSupport in java
cpu_count = getCGroupLimitedCPUCores(cpu_count);
#endif // OS_LINUX
return cpu_count;

@ -67,7 +67,7 @@ void download(DB::FileSegmentPtr file_segment)
fs::create_directories(subdir);

std::string data(size, '0');
file_segment->write(data.data(), size);
file_segment->write(data.data(), size, file_segment->getDownloadOffset());
}

void prepareAndDownload(DB::FileSegmentPtr file_segment)

@ -500,6 +500,10 @@ class IColumn;
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \
\
M(Bool, collect_hash_table_stats_during_aggregation, true, "Enable collecting hash table statistics to optimize memory allocation", 0) \
M(UInt64, max_entries_for_hash_table_stats, 10'000, "How many entries hash table statistics collected during aggregation is allowed to have", 0) \
M(UInt64, max_size_to_preallocate_for_aggregation, 10'000'000, "For how many elements it is allowed to preallocate space in all hash tables in total before aggregation", 0) \
\
/** Experimental feature for moving data between shards. */ \
\
M(Bool, allow_experimental_query_deduplication, false, "Experimental data deduplication for SELECT queries based on part UUIDs", 0) \

@ -6,6 +6,7 @@
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <Interpreters/InterpreterCreateQuery.h>
#include <Interpreters/ApplyWithSubqueryVisitor.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ParserCreateQuery.h>
@ -55,6 +56,9 @@ std::pair<String, StoragePtr> createTableFromAST(
ast_create_query.attach = true;
ast_create_query.setDatabase(database_name);

if (ast_create_query.select && ast_create_query.isView())
ApplyWithSubqueryVisitor().visit(*ast_create_query.select);

if (ast_create_query.as_table_function)
{
const auto & factory = TableFunctionFactory::instance();

@ -179,8 +179,12 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr

if (!task->was_executed)
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}",
task->execution_status.code, task->execution_status.message);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Entry {} was executed, but was not committed: code {}: {}",
task->entry_name,
task->execution_status.code,
task->execution_status.message);
}

try_node->setAlreadyRemoved();

@ -50,7 +50,7 @@ namespace
{
if (!qualified_name.database.empty())
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Dictionary source of type {} specifies a schema but schema is not supported by {}-driver",
"Dictionary source specifies a schema but schema is not supported by {}-driver",
bridge_.getName());
}

@ -63,7 +63,11 @@ void CachedReadBufferFromRemoteFS::initialize(size_t offset, size_t size)

SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getCacheReadBuffer(size_t offset) const
{
return std::make_shared<ReadBufferFromFile>(cache->getPathInLocalCache(cache_key, offset), settings.local_fs_buffer_size);
auto path = cache->getPathInLocalCache(cache_key, offset);
auto buf = std::make_shared<ReadBufferFromFile>(path, settings.local_fs_buffer_size);
if (buf->size() == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read from an empty cache file: {}", path);
return buf;
}

SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getRemoteFSReadBuffer(FileSegmentPtr & file_segment, ReadType read_type_)
@ -96,7 +100,6 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getRemoteFSReadBuffer(FileSe
remote_fs_segment_reader = remote_file_reader_creator();
file_segment->setRemoteFileReader(remote_fs_segment_reader);

///TODO: add check for pending data
return remote_fs_segment_reader;
}
case ReadType::REMOTE_FS_READ_BYPASS_CACHE:
@ -119,7 +122,6 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getReadBufferForFileSegment(
{
auto range = file_segment->range();

/// Each wait() call has a timeout of 1 second.
size_t wait_download_max_tries = settings.remote_fs_cache_max_wait_sec;
size_t wait_download_tries = 0;

@ -296,17 +298,21 @@ SeekableReadBufferPtr CachedReadBufferFromRemoteFS::getImplementationBuffer(File
{
case ReadType::CACHED:
{
#ifndef NDEBUG
auto * file_reader = assert_cast<ReadBufferFromFile *>(read_buffer_for_file_segment.get());
size_t file_size = file_reader->size();

if (file_size == 0 || range.left + file_size <= file_offset_of_buffer_end)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Unexpected state of cache file. Cache file size: {}, cache file offset: {}, "
"expected file size to be non-zero and file downloaded size to exceed current file read offset (expected: {} > {})",
file_size, range.left, range.left + file_size, file_offset_of_buffer_end);
#endif

size_t seek_offset = file_offset_of_buffer_end - range.left;
read_buffer_for_file_segment->seek(seek_offset, SEEK_SET);

auto * file_reader = assert_cast<ReadBufferFromFile *>(read_buffer_for_file_segment.get());
size_t file_size = file_reader->size();
auto state = file_segment->state();

LOG_TEST(log, "Cache file: {}. Cached seek to: {}, file size: {}, file segment state: {}, download offset: {}",
file_reader->getFileName(), seek_offset, file_size, state, file_segment->getDownloadOffset());

assert(file_size > 0);
break;
}
case ReadType::REMOTE_FS_READ_BYPASS_CACHE:
@ -384,6 +390,7 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment)
LOG_TEST(log, "Bytes to predownload: {}, caller_id: {}", bytes_to_predownload, FileSegment::getCallerId());

assert(implementation_buffer->getFileOffsetOfBufferEnd() == file_segment->getDownloadOffset());
size_t current_offset = file_segment->getDownloadOffset();

while (true)
{
@ -392,8 +399,13 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment)
if (bytes_to_predownload)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Failed to predownload remaining {} bytes. Current file segment: {}, current download offset: {}, expected: {}, eof: {}",
file_segment->range().toString(), file_segment->getDownloadOffset(), file_offset_of_buffer_end, implementation_buffer->eof());
"Failed to predownload remaining {} bytes. Current file segment: {}, current download offset: {}, expected: {}, "
"eof: {}",
bytes_to_predownload,
file_segment->range().toString(),
file_segment->getDownloadOffset(),
file_offset_of_buffer_end,
implementation_buffer->eof());

auto result = implementation_buffer->hasPendingData();

@ -418,7 +430,11 @@ void CachedReadBufferFromRemoteFS::predownload(FileSegmentPtr & file_segment)
{
LOG_TEST(log, "Left to predownload: {}, buffer size: {}", bytes_to_predownload, implementation_buffer->buffer().size());

file_segment->write(implementation_buffer->buffer().begin(), current_predownload_size);
assert(file_segment->getDownloadOffset() == static_cast<size_t>(implementation_buffer->getPosition()));

file_segment->write(implementation_buffer->buffer().begin(), current_predownload_size, current_offset);

current_offset += current_predownload_size;

bytes_to_predownload -= current_predownload_size;
implementation_buffer->position() += current_predownload_size;
@ -532,13 +548,15 @@ bool CachedReadBufferFromRemoteFS::nextImpl()
}
catch (Exception & e)
{
e.addMessage("Cache info: {}", getInfoForLog());
e.addMessage("Cache info: {}", nextimpl_step_log_info);
throw;
}
}

bool CachedReadBufferFromRemoteFS::nextImplStep()
{
last_caller_id = FileSegment::getCallerId();

if (IFileCache::shouldBypassCache())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Using cache when not allowed");

@ -549,6 +567,9 @@ bool CachedReadBufferFromRemoteFS::nextImplStep()
return false;

SCOPE_EXIT({
/// Save state of current file segment before it is completed.
nextimpl_step_log_info = getInfoForLog();

if (current_file_segment_it == file_segments_holder->file_segments.end())
return;

@ -618,6 +639,18 @@ bool CachedReadBufferFromRemoteFS::nextImplStep()

if (!result)
{
#ifndef NDEBUG
if (auto * cache_file_reader = typeid_cast<ReadBufferFromFile *>(implementation_buffer.get()))
{
auto cache_file_size = cache_file_reader->size();
if (cache_file_size == 0)
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Attempt to read from an empty cache file: {} (just before actual read)",
cache_file_size);
}
#endif

result = implementation_buffer->next();
size = implementation_buffer->buffer().size();
}
@ -630,7 +663,12 @@ bool CachedReadBufferFromRemoteFS::nextImplStep()

if (file_segment->reserve(size))
{
file_segment->write(needed_to_predownload ? implementation_buffer->position() : implementation_buffer->buffer().begin(), size);
assert(file_segment->getDownloadOffset() == static_cast<size_t>(implementation_buffer->getPosition()));

file_segment->write(needed_to_predownload ? implementation_buffer->position() : implementation_buffer->buffer().begin(), size, file_offset_of_buffer_end);

assert(file_segment->getDownloadOffset() <= file_segment->range().right + 1);
assert(std::next(current_file_segment_it) == file_segments_holder->file_segments.end() || file_segment->getDownloadOffset() == implementation_buffer->getFileOffsetOfBufferEnd());
}
else
{
@ -660,10 +698,15 @@ bool CachedReadBufferFromRemoteFS::nextImplStep()
}
}

/// - If last file segment was read from remote fs, then we read up to segment->range().right, but
/// the requested right boundary could be segment->range().left < requested_right_boundary < segment->range().right.
/// Therefore need to resize to a smaller size. And resize must be done after write into cache.
/// - If last file segment was read from local fs, then we could read more than file_segment->range().right, so resize is also needed.
if (std::next(current_file_segment_it) == file_segments_holder->file_segments.end())
{
size_t remaining_size_to_read = std::min(current_read_range.right, read_until_position - 1) - file_offset_of_buffer_end + 1;
size = std::min(size, remaining_size_to_read);
assert(implementation_buffer->buffer().size() >= nextimpl_working_buffer_offset + size);
implementation_buffer->buffer().resize(nextimpl_working_buffer_offset + size);
}
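Worked example for the resize above: with current_read_range.right = 127, read_until_position = 100 and file_offset_of_buffer_end = 64, remaining_size_to_read = min(127, 99) - 64 + 1 = 36, so the working buffer is trimmed to 36 bytes past nextimpl_working_buffer_offset.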

@ -687,9 +730,16 @@ bool CachedReadBufferFromRemoteFS::nextImplStep()
read_until_position, first_offset, file_segments_holder->toString());

if (size == 0 && file_offset_of_buffer_end < read_until_position)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Having zero bytes, but range is not finished: file offset: {}, reading until: {}",
file_offset_of_buffer_end, read_until_position);
{
std::optional<size_t> cache_file_size;
if (auto * cache_file_reader = assert_cast<ReadBufferFromFile *>(implementation_buffer.get()))
cache_file_size = cache_file_reader->size();

throw Exception(ErrorCodes::LOGICAL_ERROR,
"Having zero bytes, but range is not finished: file offset: {}, reading until: {}, read type: {}, cache file size: {}",
file_offset_of_buffer_end, read_until_position, toString(read_type), cache_file_size ? std::to_string(*cache_file_size) : "None");
}

return result;
}

@ -752,12 +802,24 @@ std::optional<size_t> CachedReadBufferFromRemoteFS::getLastNonDownloadedOffset()

String CachedReadBufferFromRemoteFS::getInfoForLog()
{
return fmt::format("Buffer path: {}, hash key: {}, file_offset_of_buffer_end: {}, internal buffer remaining read range: {}, file segment info: {}",
remote_fs_object_path, getHexUIntLowercase(cache_key), file_offset_of_buffer_end,
(implementation_buffer ?
std::to_string(implementation_buffer->getRemainingReadRange().left) + '-' + (implementation_buffer->getRemainingReadRange().right ? std::to_string(*implementation_buffer->getRemainingReadRange().right) : "None")
: "None"),
(current_file_segment_it == file_segments_holder->file_segments.end() ? "None" : (*current_file_segment_it)->getInfoForLog()));
auto implementation_buffer_read_range_str =
implementation_buffer ?
std::to_string(implementation_buffer->getRemainingReadRange().left)
+ '-'
+ (implementation_buffer->getRemainingReadRange().right ? std::to_string(*implementation_buffer->getRemainingReadRange().right) : "None")
: "None";

auto current_file_segment_info = current_file_segment_it == file_segments_holder->file_segments.end() ? "None" : (*current_file_segment_it)->getInfoForLog();

return fmt::format("Buffer path: {}, hash key: {}, file_offset_of_buffer_end: {}, internal buffer remaining read range: {}, "
"read_type: {}, last caller: {}, file segment info: {}",
remote_fs_object_path,
getHexUIntLowercase(cache_key),
file_offset_of_buffer_end,
implementation_buffer_read_range_str,
toString(read_type),
last_caller_id,
current_file_segment_info);
}

}

@ -98,7 +98,10 @@ private:
}
__builtin_unreachable();
}

size_t first_offset = 0;
String nextimpl_step_log_info;
String last_caller_id;
};

}

@ -44,7 +44,7 @@ SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const S
{
return std::make_unique<ReadBufferFromS3>(
client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries,
settings, /* use_external_buffer */true, read_until_position, /* restricted_seek */true);
settings, /* use_external_buffer */true, /* offset */ 0, read_until_position, /* restricted_seek */true);
};

if (with_cache)

@ -68,16 +68,28 @@ std::future<IAsynchronousReader::Result> ThreadPoolRemoteFSReader::submit(Reques
auto * remote_fs_fd = assert_cast<RemoteFSFileDescriptor *>(request.descriptor.get());

Stopwatch watch(CLOCK_MONOTONIC);
auto [bytes_read, offset] = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore);

ReadBufferFromRemoteFSGather::ReadResult result;
try
{
result = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore);
}
catch (...)
{
if (running_group)
CurrentThread::detachQuery();
throw;
}

watch.stop();

ProfileEvents::increment(ProfileEvents::RemoteFSReadMicroseconds, watch.elapsedMicroseconds());
ProfileEvents::increment(ProfileEvents::RemoteFSReadBytes, bytes_read);

if (running_group)
thread_status.detachQuery();
CurrentThread::detachQuery();

return Result{ .size = bytes_read, .offset = offset };
ProfileEvents::increment(ProfileEvents::RemoteFSReadMicroseconds, watch.elapsedMicroseconds());
ProfileEvents::increment(ProfileEvents::RemoteFSReadBytes, result.offset ? result.size - result.offset : result.size);

return Result{ .size = result.size, .offset = result.offset };
});
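Note on the new accounting: result.offset here is the number of leading bytes the caller asked to skip, so if, say, 1 MiB was read with a 16 KiB skip, RemoteFSReadBytes grows by 1 MiB minus 16 KiB rather than the raw read size.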

auto future = task->get_future();

@ -85,9 +85,12 @@ FormatSchemaInfo::FormatSchemaInfo(const String & format_schema, const String &
else if (path.has_parent_path() && !fs::weakly_canonical(default_schema_directory_path / path).string().starts_with(fs::weakly_canonical(default_schema_directory_path).string()))
{
if (is_server)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})",
path.string());
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Path in the 'format_schema' setting shouldn't go outside the 'format_schema_path' directory: {} ({} not in {})",
default_schema_directory(),
path.string(),
default_schema_directory());
path = default_schema_directory_path / path;
schema_path = path.filename();
schema_directory = path.parent_path() / "";

@ -887,7 +887,7 @@ struct ConvertImplGenericToString
const IColumn & col_from = *col_with_type_and_name.column;

size_t size = col_from.size();
auto col_to = result_type->createColumn();
auto col_to = removeNullable(result_type)->createColumn();

{
ColumnStringHelpers::WriteHelper write_helper(

@ -259,7 +259,7 @@ public:
throw Exception(
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function '{}' needs at least 2 arguments, at most 3 arguments; passed {}.",
arguments.size());
name, arguments.size());

if (!isString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + ". Must be String.",

@ -181,9 +181,12 @@ ColumnPtr IExecutableFunction::defaultImplementationForNulls(
// Default implementation for nulls returns null result for null arguments,
// so the result type must be nullable.
if (!result_type->isNullable())
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Function {} with Null argument and default implementation for Nulls "
"is expected to return Nullable result, got {}", result_type->getName());
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Function {} with Null argument and default implementation for Nulls "
"is expected to return Nullable result, got {}",
getName(),
result_type->getName());

return result_type->createColumnConstWithDefaultValue(input_rows_count);
}

@ -231,7 +231,7 @@ private:
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} decimal scale should have native UInt type. Actual {}",
scale_argument.type->getName());
getName(), scale_argument.type->getName());
}

scale = arguments[additional_argument_index].column->getUInt(0);

@ -52,23 +52,21 @@ public:
{
if (arguments.size() < 2)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
+ ", should be at least 2.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

if (arguments.size() > FormatImpl::argument_threshold)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
+ ", should be at most " + std::to_string(FormatImpl::argument_threshold),
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at least 2",
getName(),
arguments.size());

for (const auto arg_idx : collections::range(0, arguments.size()))
{
const auto * arg = arguments[arg_idx].get();
if (!isStringOrFixedString(arg))
throw Exception{"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function "
+ getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of function {}",
arg->getName(),
arg_idx + 1,
getName());
}

return std::make_shared<DataTypeString>();
@ -125,7 +123,7 @@ private:
std::vector<const ColumnString::Chars *> data(num_arguments);
std::vector<const ColumnString::Offsets *> offsets(num_arguments);
std::vector<size_t> fixed_string_sizes(num_arguments);
std::vector<String> constant_strings(num_arguments);
std::vector<std::optional<String>> constant_strings(num_arguments);
bool has_column_string = false;
bool has_column_fixed_string = false;
for (size_t i = 0; i < num_arguments; ++i)

@ -112,7 +112,7 @@ public:
|| (res = executeType<DataTypeDateTime64>(arguments, result_type))))
throw Exception(
ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of function {], must be Date or DateTime.",
"Illegal column {} of function {}, must be Date or DateTime.",
arguments[1].column->getName(),
getName());

68
src/Functions/flattenTuple.cpp
Normal file
@ -0,0 +1,68 @@
#include <Functions/IFunction.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/ObjectUtils.h>
#include <Columns/ColumnTuple.h>

namespace DB
{

namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}

namespace
{

class FunctionFlattenTuple : public IFunction
{
public:
static constexpr auto name = "flattenTuple";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionFlattenTuple>(); }

String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo &) const override { return true; }
bool useDefaultImplementationForConstants() const override { return true; }

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const auto & type = arguments[0];
const auto * type_tuple = checkAndGetDataType<DataTypeTuple>(type.get());
if (!type_tuple || !type_tuple->haveExplicitNames())
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function '{}' must be Named Tuple. Got '{}'",
getName(), type->getName());

auto [paths, types] = flattenTuple(type);
Names names;
names.reserve(paths.size());
for (const auto & path : paths)
names.push_back(path.getPath());

return std::make_shared<DataTypeTuple>(types, names);
}

ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
{
auto column = arguments.at(0).column;
if (!checkAndGetColumn<ColumnTuple>(column.get()))
throw Exception(ErrorCodes::ILLEGAL_COLUMN,
"Illegal column {} of first argument of function {}. Expected ColumnTuple",
column->getName(), getName());

return flattenTuple(column);
}
};

}

void registerFunctionFlattenTuple(FunctionFactory & factory)
{
factory.registerFunction<FunctionFlattenTuple>();
}

}
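For intuition (illustrative types, not taken from this commit): applied to a value of type Tuple(a Tuple(b UInt32, c UInt32), d UInt32), flattenTuple would return a Tuple whose elements are named `a.b`, `a.c` and `d`, since getReturnTypeImpl copies the flattened paths into the result type's element names.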

@ -45,25 +45,23 @@ public:

DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.empty())
if (arguments.size() < 2)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
+ ", should be at least 1",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

if (arguments.size() > FormatImpl::argument_threshold)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + toString(arguments.size())
+ ", should be at most " + std::to_string(FormatImpl::argument_threshold),
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Number of arguments for function {} doesn't match: passed {}, should be at least 2",
getName(),
arguments.size());

for (const auto arg_idx : collections::range(0, arguments.size()))
{
const auto * arg = arguments[arg_idx].get();
if (!isStringOrFixedString(arg))
throw Exception(
"Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of function {}",
arg->getName(),
arg_idx + 1,
getName());
}

return std::make_shared<DataTypeString>();
@ -84,7 +82,7 @@ public:
std::vector<const ColumnString::Chars *> data(arguments.size() - 1);
std::vector<const ColumnString::Offsets *> offsets(arguments.size() - 1);
std::vector<size_t> fixed_string_sizes(arguments.size() - 1);
std::vector<String> constant_strings(arguments.size() - 1);
std::vector<std::optional<String>> constant_strings(arguments.size() - 1);

bool has_column_string = false;
bool has_column_fixed_string = false;

@ -4,8 +4,10 @@
#include <base/types.h>
#include <Common/Exception.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/format.h>
#include <Common/memcpySmall.h>

#include <algorithm>
#include <optional>
#include <string>
@ -15,15 +17,9 @@

namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}

struct FormatImpl
{
static constexpr size_t small_argument_threshold = 1024;
static constexpr size_t argument_threshold = std::numeric_limits<UInt32>::max();
static constexpr size_t right_padding = 15;

template <typename... Args>
@ -39,165 +35,10 @@ struct FormatImpl
format<false, false>(std::forward<Args>(args)...);
}

static void parseNumber(const String & description, UInt64 l, UInt64 r, UInt64 & res)
{
res = 0;
for (UInt64 pos = l; pos < r; ++pos)
{
if (!isNumericASCII(description[pos]))
throw Exception("Not a number in curly braces at position " + std::to_string(pos), ErrorCodes::BAD_ARGUMENTS);
res = res * 10 + description[pos] - '0';
if (res >= argument_threshold)
throw Exception(
"Too big number for arguments, must be at most " + std::to_string(argument_threshold), ErrorCodes::BAD_ARGUMENTS);
}
}

static inline void init(
const String & pattern,
const std::vector<const ColumnString::Chars *> & data,
size_t argument_number,
const std::vector<String> & constant_strings,
UInt64 * index_positions_ptr,
std::vector<String> & substrings)
{
/// Is current position after open curly brace.
bool is_open_curly = false;
/// The position of last open token.
size_t last_open = -1;

/// Is formatting in a plain {} token.
std::optional<bool> is_plain_numbering;
UInt64 index_if_plain = 0;

/// Left position of adding substrings, just to the closed brace position or the start of the string.
/// Invariant --- the start of substring is in this position.
size_t start_pos = 0;

/// A flag to decide whether we should glue the constant strings.
bool glue_to_next = false;

/// Handling double braces (escaping).
auto double_brace_removal = [](String & str)
{
size_t i = 0;
bool should_delete = true;
str.erase(
std::remove_if(
str.begin(),
str.end(),
[&i, &should_delete, &str](char)
{
bool is_double_brace = (str[i] == '{' && str[i + 1] == '{') || (str[i] == '}' && str[i + 1] == '}');
++i;
if (is_double_brace && should_delete)
{
should_delete = false;
return true;
}
should_delete = true;
return false;
}),
str.end());
};

for (size_t i = 0; i < pattern.size(); ++i)
{
if (pattern[i] == '{')
{
/// Escaping handling
/// It is safe to access because of null termination
if (pattern[i + 1] == '{')
{
++i;
continue;
}

if (is_open_curly)
throw Exception("Two open curly braces without close one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);

String to_add = String(pattern.data() + start_pos, i - start_pos);
double_brace_removal(to_add);
if (!glue_to_next)
substrings.emplace_back(to_add);
else
substrings.back() += to_add;

glue_to_next = false;

is_open_curly = true;
last_open = i + 1;
}
else if (pattern[i] == '}')
{
if (pattern[i + 1] == '}')
{
++i;
continue;
}

if (!is_open_curly)
throw Exception("Closed curly brace without open one at position " + std::to_string(i), ErrorCodes::BAD_ARGUMENTS);

is_open_curly = false;

if (last_open == i)
{
if (is_plain_numbering && !*is_plain_numbering)
throw Exception(
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
is_plain_numbering = true;
if (index_if_plain >= argument_number)
throw Exception("Argument is too big for formatting", ErrorCodes::BAD_ARGUMENTS);
*index_positions_ptr = index_if_plain++;
}
else
{
if (is_plain_numbering && *is_plain_numbering)
throw Exception(
"Cannot switch from automatic field numbering to manual field specification", ErrorCodes::BAD_ARGUMENTS);
is_plain_numbering = false;

UInt64 arg;
parseNumber(pattern, last_open, i, arg);

if (arg >= argument_number)
throw Exception(
"Argument is too big for formatting. Note that indexing starts from zero", ErrorCodes::BAD_ARGUMENTS);

*index_positions_ptr = arg;
}

/// Constant string.
if (!data[*index_positions_ptr])
{
/// The next string should be glued to last `A {} C`.format('B') -> `A B C`.
glue_to_next = true;
substrings.back() += constant_strings[*index_positions_ptr];
}
else
++index_positions_ptr; /// Otherwise we commit arg number and proceed.

start_pos = i + 1;
}
}

if (is_open_curly)
throw Exception("Last open curly brace is not closed", ErrorCodes::BAD_ARGUMENTS);

String to_add = String(pattern.data() + start_pos, pattern.size() - start_pos);
double_brace_removal(to_add);

if (!glue_to_next)
substrings.emplace_back(to_add);
else
substrings.back() += to_add;
}

/// data for ColumnString and ColumnFixed. Nullptr means no data, it is const string.
/// offsets for ColumnString, nullptr is an indicator that there is a fixed string rather than ColumnString.
/// fixed_string_N for savings N to fixed strings.
/// constant_strings for constant strings. If data[i] is nullptr, than it is constant string.
/// constant_strings for constant strings. If data[i] is nullptr, it is constant string.
/// res_data is result_data, res_offsets is offset result.
/// input_rows_count is the number of rows processed.
/// Precondition: data.size() == offsets.size() == fixed_string_N.size() == constant_strings.size().
@ -207,29 +48,22 @@ struct FormatImpl
const std::vector<const ColumnString::Chars *> & data,
const std::vector<const ColumnString::Offsets *> & offsets,
[[maybe_unused]] /* Because sometimes !has_column_fixed_string */ const std::vector<size_t> & fixed_string_N,
const std::vector<String> & constant_strings,
const std::vector<std::optional<String>> & constant_strings,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets,
size_t input_rows_count)
{
const size_t argument_number = offsets.size();

UInt64 small_index_positions_buffer[small_argument_threshold];
/// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this array will be filled with [1, 3, 1, 0, ... (garbage)] but without constant string indices.
UInt64 * index_positions = small_index_positions_buffer;

std::unique_ptr<UInt64[]> big_index_positions_buffer;
if (argument_number > small_argument_threshold)
{
big_index_positions_buffer.reset(new UInt64[argument_number]);
index_positions = big_index_positions_buffer.get();
}
/// The subsequent indexes of strings we should use. e.g `Hello world {1} {3} {1} {0}` this
/// array will be filled with [1, 3, 1, 0] but without constant string indices.
Format::IndexPositions index_positions;

/// Vector of substrings of pattern that will be copied to the answer, not string view because of escaping and iterators invalidation.
/// These are exactly what is between {} tokens, for `Hello {} world {}` we will have [`Hello `, ` world `, ``].
std::vector<String> substrings;

init(pattern, data, argument_number, constant_strings, index_positions, substrings);
Format::init(pattern, argument_number, constant_strings, index_positions, substrings);

UInt64 final_size = 0;

@ -271,7 +105,7 @@ struct FormatImpl
for (size_t j = 1; j < substrings.size(); ++j)
{
UInt64 arg = index_positions[j - 1];
auto offset_ptr = offsets[arg];
const auto * offset_ptr = offsets[arg];
UInt64 arg_offset = 0;
UInt64 size = 0;

@ -80,6 +80,7 @@ void registerFunctionInitialQueryID(FunctionFactory & factory);
void registerFunctionServerUUID(FunctionFactory &);
void registerFunctionZooKeeperSessionUptime(FunctionFactory &);
void registerFunctionGetOSKernelVersion(FunctionFactory &);
void registerFunctionFlattenTuple(FunctionFactory &);

#if USE_ICU
void registerFunctionConvertCharset(FunctionFactory &);
@ -166,6 +167,7 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory)
registerFunctionServerUUID(factory);
registerFunctionZooKeeperSessionUptime(factory);
registerFunctionGetOSKernelVersion(factory);
registerFunctionFlattenTuple(factory);

#if USE_ICU
registerFunctionConvertCharset(factory);

@ -13,18 +13,11 @@ namespace ErrorCodes

}

ParallelReadBuffer::ParallelReadBuffer(
std::unique_ptr<ReadBufferFactory> reader_factory_,
ThreadPool * pool_,
size_t max_working_readers_,
WorkerSetup worker_setup_,
WorkerCleanup worker_cleanup_)
ParallelReadBuffer::ParallelReadBuffer(std::unique_ptr<ReadBufferFactory> reader_factory_, CallbackRunner schedule_, size_t max_working_readers_)
: SeekableReadBufferWithSize(nullptr, 0)
, pool(pool_)
, max_working_readers(max_working_readers_)
, schedule(std::move(schedule_))
, reader_factory(std::move(reader_factory_))
, worker_setup(std::move(worker_setup_))
, worker_cleanup(std::move(worker_cleanup_))
{
std::unique_lock<std::mutex> lock{mutex};
addReaders(lock);
@ -40,30 +33,8 @@ bool ParallelReadBuffer::addReaderToPool(std::unique_lock<std::mutex> & /*buffer

auto worker = read_workers.emplace_back(std::make_shared<ReadWorker>(std::move(reader)));

pool->scheduleOrThrow(
[&, this, worker = std::move(worker)]() mutable
{
ThreadStatus thread_status;
schedule([this, worker = std::move(worker)]() mutable { readerThreadFunction(std::move(worker)); });

{
std::lock_guard lock{mutex};
++active_working_reader;
}

SCOPE_EXIT({
worker_cleanup(thread_status);

std::lock_guard lock{mutex};
--active_working_reader;
if (active_working_reader == 0)
{
readers_done.notify_all();
}
});
worker_setup(thread_status);

readerThreadFunction(std::move(worker));
});
return true;
}
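For context, a plausible shape of the CallbackRunner used above (the real alias lives in Interpreters/threadPoolCallbackRunner.h; this sketch is an assumption about its signature, not the exact API, and makePoolRunner is a hypothetical helper):

using CallbackRunner = std::function<void(std::function<void()>)>;

CallbackRunner makePoolRunner(ThreadPool & pool)
{
    /// Schedules the callback on the pool; thread-state setup/teardown now lives behind the runner.
    return [&pool](std::function<void()> callback) { pool.scheduleOrThrow(std::move(callback)); };
}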
|
||||
|
||||
@ -232,12 +203,27 @@ bool ParallelReadBuffer::nextImpl()

void ParallelReadBuffer::readerThreadFunction(ReadWorkerPtr read_worker)
{
    {
        std::lock_guard lock{mutex};
        ++active_working_reader;
    }

    SCOPE_EXIT({
        std::lock_guard lock{mutex};
        --active_working_reader;
        if (active_working_reader == 0)
        {
            readers_done.notify_all();
        }
    });

    try
    {
        while (!emergency_stop && !read_worker->cancel)
        {
            if (!read_worker->reader->next())
                throw Exception("Failed to read all the data from the reader", ErrorCodes::LOGICAL_ERROR);
                throw Exception(
                    ErrorCodes::LOGICAL_ERROR, "Failed to read all the data from the reader, missing {} bytes", read_worker->bytes_left);

            if (emergency_stop || read_worker->cancel)
                break;
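// Sketch (not part of this diff): the worker-accounting pattern above, reduced to
// standard C++. Each reader bumps a counter on entry, a scope guard decrements it on
// exit, and the last worker out signals a condition variable so a finishAndWait()-style
// call can block until all readers are done. WorkerAccounting/runWorker are illustrative
// names, not ClickHouse API.
#include <condition_variable>
#include <cstddef>
#include <mutex>

struct WorkerAccounting
{
    std::mutex mutex;
    std::condition_variable readers_done;
    std::size_t active_working_reader = 0;

    void runWorker()
    {
        {
            std::lock_guard lock{mutex};
            ++active_working_reader;
        }
        struct Guard // stand-in for SCOPE_EXIT
        {
            WorkerAccounting & self;
            ~Guard()
            {
                std::lock_guard lock{self.mutex};
                if (--self.active_working_reader == 0)
                    self.readers_done.notify_all();
            }
        } guard{*this};
        // ... read segments here; the guard fires even if this throws ...
    }

    void waitForAll()
    {
        std::unique_lock lock{mutex};
        readers_done.wait(lock, [this] { return active_working_reader == 0; });
    }
};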
@ -3,6 +3,7 @@
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadBuffer.h>
#include <IO/SeekableReadBuffer.h>
#include <Interpreters/threadPoolCallbackRunner.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/ThreadPool.h>

@ -76,14 +77,7 @@ public:
    virtual std::optional<size_t> getTotalSize() = 0;
};

using WorkerSetup = std::function<void(ThreadStatus &)>;
using WorkerCleanup = std::function<void(ThreadStatus &)>;
explicit ParallelReadBuffer(
    std::unique_ptr<ReadBufferFactory> reader_factory_,
    ThreadPool * pool,
    size_t max_working_readers,
    WorkerSetup worker_setup = {},
    WorkerCleanup worker_cleanup = {});
explicit ParallelReadBuffer(std::unique_ptr<ReadBufferFactory> reader_factory_, CallbackRunner schedule_, size_t max_working_readers);

~ParallelReadBuffer() override { finishAndWait(); }

@ -140,16 +134,14 @@ private:

    Segment current_segment;

    ThreadPool * pool;
    size_t max_working_readers;
    size_t active_working_reader{0};
    // Triggered when all reader workers are done
    std::condition_variable readers_done;

    std::unique_ptr<ReadBufferFactory> reader_factory;
    CallbackRunner schedule;

    WorkerSetup worker_setup;
    WorkerCleanup worker_cleanup;
    std::unique_ptr<ReadBufferFactory> reader_factory;

    /**
      * FIFO queue of readers.
@ -1,4 +1,5 @@
#include <Common/config.h>
#include "IO/S3Common.h"

#if USE_AWS_S3

@ -42,6 +43,7 @@ ReadBufferFromS3::ReadBufferFromS3(
    UInt64 max_single_read_retries_,
    const ReadSettings & settings_,
    bool use_external_buffer_,
    size_t offset_,
    size_t read_until_position_,
    bool restricted_seek_)
    : SeekableReadBufferWithSize(nullptr, 0)
@ -49,9 +51,10 @@ ReadBufferFromS3::ReadBufferFromS3(
    , bucket(bucket_)
    , key(key_)
    , max_single_read_retries(max_single_read_retries_)
    , offset(offset_)
    , read_until_position(read_until_position_)
    , read_settings(settings_)
    , use_external_buffer(use_external_buffer_)
    , read_until_position(read_until_position_)
    , restricted_seek(restricted_seek_)
{
}
@ -210,13 +213,14 @@ std::optional<size_t> ReadBufferFromS3::getTotalSize()
    if (file_size)
        return file_size;

    Aws::S3::Model::HeadObjectRequest request;
    request.SetBucket(bucket);
    request.SetKey(key);
    auto object_size = S3::getObjectSize(client_ptr, bucket, key, false);

    auto outcome = client_ptr->HeadObject(request);
    auto head_result = outcome.GetResultWithOwnership();
    file_size = head_result.GetContentLength();
    if (!object_size)
    {
        return std::nullopt;
    }

    file_size = object_size;
    return file_size;
}

@ -234,6 +238,11 @@ void ReadBufferFromS3::setReadUntilPosition(size_t position)
    }
}

SeekableReadBuffer::Range ReadBufferFromS3::getRemainingReadRange() const
{
    return Range{.left = static_cast<size_t>(offset), .right = read_until_position ? std::optional{read_until_position - 1} : std::nullopt};
}

std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
{
    Aws::S3::Model::GetObjectRequest req;
@ -272,6 +281,36 @@ std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
}

SeekableReadBufferPtr ReadBufferS3Factory::getReader()
{
    const auto next_range = range_generator.nextRange();
    if (!next_range)
    {
        return nullptr;
    }

    auto reader = std::make_shared<ReadBufferFromS3>(
        client_ptr,
        bucket,
        key,
        s3_max_single_read_retries,
        read_settings,
        false /*use_external_buffer*/,
        next_range->first,
        next_range->second);
    return reader;
}

off_t ReadBufferS3Factory::seek(off_t off, [[maybe_unused]] int whence)
{
    range_generator = RangeGenerator{object_size, range_step, static_cast<size_t>(off)};
    return off;
}

std::optional<size_t> ReadBufferS3Factory::getTotalSize()
{
    return object_size;
}
}

#endif
@ -1,5 +1,6 @@
#pragma once

#include <Common/RangeGenerator.h>
#include <Common/config.h>

#if USE_AWS_S3
@ -7,6 +8,7 @@
#include <memory>

#include <IO/HTTPCommon.h>
#include <IO/ParallelReadBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadSettings.h>
#include <IO/SeekableReadBuffer.h>
@ -30,7 +32,9 @@ private:
    String bucket;
    String key;
    UInt64 max_single_read_retries;

    off_t offset = 0;
    off_t read_until_position = 0;

    Aws::S3::Model::GetObjectResult read_result;
    std::unique_ptr<ReadBuffer> impl;
@ -45,6 +49,7 @@ public:
        UInt64 max_single_read_retries_,
        const ReadSettings & settings_,
        bool use_external_buffer = false,
        size_t offset_ = 0,
        size_t read_until_position_ = 0,
        bool restricted_seek_ = false);

@ -58,7 +63,7 @@ public:

    void setReadUntilPosition(size_t position) override;

    Range getRemainingReadRange() const override { return Range{ .left = static_cast<size_t>(offset), .right = read_until_position }; }
    Range getRemainingReadRange() const override;

    size_t getFileOffsetOfBufferEnd() const override { return offset; }

@ -69,13 +74,55 @@ private:

    bool use_external_buffer;

    off_t read_until_position = 0;

    /// There is different seek policy for disk seek and for non-disk seek
    /// (non-disk seek is applied for seekable input formats: orc, arrow, parquet).
    bool restricted_seek;
};

/// Creates separate ReadBufferFromS3 for sequence of ranges of particular object
class ReadBufferS3Factory : public ParallelReadBuffer::ReadBufferFactory
{
public:
    explicit ReadBufferS3Factory(
        std::shared_ptr<Aws::S3::S3Client> client_ptr_,
        const String & bucket_,
        const String & key_,
        size_t range_step_,
        size_t object_size_,
        UInt64 s3_max_single_read_retries_,
        const ReadSettings & read_settings_)
        : client_ptr(client_ptr_)
        , bucket(bucket_)
        , key(key_)
        , read_settings(read_settings_)
        , range_generator(object_size_, range_step_)
        , range_step(range_step_)
        , object_size(object_size_)
        , s3_max_single_read_retries(s3_max_single_read_retries_)
    {
        assert(range_step > 0);
        assert(range_step < object_size);
    }

    SeekableReadBufferPtr getReader() override;

    off_t seek(off_t off, [[maybe_unused]] int whence) override;

    std::optional<size_t> getTotalSize() override;

private:
    std::shared_ptr<Aws::S3::S3Client> client_ptr;
    const String bucket;
    const String key;
    ReadSettings read_settings;

    RangeGenerator range_generator;
    size_t range_step;
    size_t object_size;

    UInt64 s3_max_single_read_retries;
};
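// Sketch (not part of this diff): how a factory like the one above is meant to plug into
// ParallelReadBuffer after this change. The variables (client, pool, object_size,
// read_settings) are illustrative placeholders; only the constructor shapes mirror this
// commit.
//
//     auto factory = std::make_unique<ReadBufferS3Factory>(
//         client, "my-bucket", "my-key",
//         /* range_step = */ 10 * 1024 * 1024, object_size,
//         /* s3_max_single_read_retries = */ 4, read_settings);
//     ParallelReadBuffer buf(std::move(factory), threadPoolCallbackRunner(pool), /* max_working_readers = */ 4);
//     while (!buf.eof())
//         buf.next(); // consume buf.buffer() between calls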

}

#endif
@ -1,6 +1,7 @@
#pragma once

#include <functional>
#include <Common/RangeGenerator.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/HTTPCommon.h>
#include <IO/ParallelReadBuffer.h>
@ -635,43 +636,6 @@ public:
    void buildNewSession(const Poco::URI & uri) override { session = makeHTTPSession(uri, timeouts); }
};

class RangeGenerator
{
public:
    explicit RangeGenerator(size_t total_size_, size_t range_step_, size_t range_start = 0)
        : from(range_start), range_step(range_step_), total_size(total_size_)
    {
    }

    size_t totalRanges() const { return static_cast<size_t>(round(static_cast<float>(total_size - from) / range_step)); }

    using Range = std::pair<size_t, size_t>;

    // return upper exclusive range of values, i.e. [from_range, to_range>
    std::optional<Range> nextRange()
    {
        if (from >= total_size)
        {
            return std::nullopt;
        }

        auto to = from + range_step;
        if (to >= total_size)
        {
            to = total_size;
        }

        Range range{from, to};
        from = to;
        return range;
    }

private:
    size_t from;
    size_t range_step;
    size_t total_size;
};
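// Sketch (not part of this diff): what the RangeGenerator above yields. For
// total_size = 10 and range_step = 4 it produces [0,4), [4,8), [8,10); the last range is
// clipped to the object size. Self-contained restatement of the same loop.
#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t from = 0;
    const std::size_t range_step = 4, total_size = 10;
    while (from < total_size)
    {
        std::size_t to = from + range_step;
        if (to >= total_size)
            to = total_size; // clip the final range
        std::printf("[%zu, %zu)\n", from, to);
        from = to;
    }
}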

class ReadWriteBufferFromHTTP : public detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession>>
{
    using Parent = detail::ReadWriteBufferFromHTTPBase<std::shared_ptr<UpdatableSession>>;
@ -24,6 +24,7 @@
#    include <aws/core/utils/UUID.h>
#    include <aws/core/http/HttpClientFactory.h>
#    include <aws/s3/S3Client.h>
#    include <aws/s3/model/HeadObjectRequest.h> // Y_IGNORE

#    include <IO/S3/PocoHTTPClientFactory.h>
#    include <IO/S3/PocoHTTPClient.h>
@ -682,6 +683,7 @@ namespace DB
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int S3_ERROR;
}

namespace S3
@ -839,6 +841,26 @@ namespace S3
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}",
                        quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : "");
    }

    size_t getObjectSize(std::shared_ptr<Aws::S3::S3Client> client_ptr, const String & bucket, const String & key, bool throw_on_error)
    {
        Aws::S3::Model::HeadObjectRequest req;
        req.SetBucket(bucket);
        req.SetKey(key);

        Aws::S3::Model::HeadObjectOutcome outcome = client_ptr->HeadObject(req);

        if (outcome.IsSuccess())
        {
            auto read_result = outcome.GetResultWithOwnership();
            return static_cast<size_t>(read_result.GetContentLength());
        }
        else if (throw_on_error)
        {
            throw DB::Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
        }
        return 0;
    }
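// Sketch (not part of this diff): typical call pattern for the helper above. Bucket and
// key are hypothetical; with throw_on_error = false a failed HEAD request yields 0
// ("size unknown") instead of throwing.
//
//     const size_t object_size = S3::getObjectSize(client, "my-bucket", "path/to/object", /* throw_on_error = */ false);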
}

}

@ -75,6 +75,8 @@ struct URI
    static void validateBucket(const String & bucket, const Poco::URI & uri);
};

size_t getObjectSize(std::shared_ptr<Aws::S3::S3Client> client_ptr, const String & bucket, const String & key, bool throw_on_error = true);

}

#endif
@ -1,4 +1,6 @@
#include <algorithm>
#include <future>
#include <numeric>
#include <Poco/Util/Application.h>

#include <base/sort.h>
@ -15,6 +17,7 @@
#include <IO/WriteBufferFromFile.h>
#include <Compression/CompressedWriteBuffer.h>
#include <Interpreters/Aggregator.h>
#include <Common/LRUCache.h>
#include <Common/MemoryTracker.h>
#include <Common/CurrentThread.h>
#include <Common/typeid_cast.h>
@ -27,12 +30,236 @@
#include <Interpreters/JIT/CompiledExpressionCache.h>
#include <Core/ProtocolDefines.h>

#include <Parsers/ASTSelectQuery.h>

namespace ProfileEvents
{
    extern const Event ExternalAggregationWritePart;
    extern const Event ExternalAggregationCompressedBytes;
    extern const Event ExternalAggregationUncompressedBytes;
    extern const Event ExternalAggregationWritePart;
    extern const Event ExternalAggregationCompressedBytes;
    extern const Event ExternalAggregationUncompressedBytes;
    extern const Event AggregationPreallocatedElementsInHashTables;
    extern const Event AggregationHashTablesInitializedAsTwoLevel;
}

namespace
{
/** Collects observed HashMap-s sizes to avoid redundant intermediate resizes.
  */
class HashTablesStatistics
{
public:
    struct Entry
    {
        size_t sum_of_sizes; // used to determine if it's better to convert aggregation to two-level from the beginning
        size_t median_size; // roughly the size we're going to preallocate on each thread
    };

    using Cache = DB::LRUCache<UInt64, Entry>;
    using CachePtr = std::shared_ptr<Cache>;
    using Params = DB::Aggregator::Params::StatsCollectingParams;

    /// Collection and use of the statistics should be enabled.
    std::optional<Entry> getSizeHint(const Params & params)
    {
        if (!params.isCollectionAndUseEnabled())
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Collection and use of the statistics should be enabled.");

        std::lock_guard lock(mutex);
        const auto cache = getHashTableStatsCache(params, lock);
        if (const auto hint = cache->get(params.key))
        {
            LOG_DEBUG(
                &Poco::Logger::get("Aggregator"),
                "An entry for key={} found in cache: sum_of_sizes={}, median_size={}",
                params.key,
                hint->sum_of_sizes,
                hint->median_size);
            return *hint;
        }
        return std::nullopt;
    }

    /// Collection and use of the statistics should be enabled.
    void update(size_t sum_of_sizes, size_t median_size, const Params & params)
    {
        if (!params.isCollectionAndUseEnabled())
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Collection and use of the statistics should be enabled.");

        std::lock_guard lock(mutex);
        const auto cache = getHashTableStatsCache(params, lock);
        const auto hint = cache->get(params.key);
        // We'll maintain the maximum among all the observed values until the next prediction turns out to be too wrong.
        if (!hint || sum_of_sizes < hint->sum_of_sizes / 2 || hint->sum_of_sizes < sum_of_sizes || median_size < hint->median_size / 2
            || hint->median_size < median_size)
        {
            LOG_DEBUG(
                &Poco::Logger::get("Aggregator"),
                "Statistics updated for key={}: new sum_of_sizes={}, median_size={}",
                params.key,
                sum_of_sizes,
                median_size);
            cache->set(params.key, std::make_shared<Entry>(Entry{.sum_of_sizes = sum_of_sizes, .median_size = median_size}));
        }
    }

    std::optional<DB::HashTablesCacheStatistics> getCacheStats() const
    {
        std::lock_guard lock(mutex);
        if (hash_table_stats)
        {
            size_t hits = 0, misses = 0;
            hash_table_stats->getStats(hits, misses);
            return DB::HashTablesCacheStatistics{.entries = hash_table_stats->count(), .hits = hits, .misses = misses};
        }
        return std::nullopt;
    }

    static size_t calculateCacheKey(const DB::ASTPtr & select_query)
    {
        if (!select_query)
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Query ptr cannot be null");

        const auto & select = select_query->as<DB::ASTSelectQuery &>();

        // It may happen in some corner cases like `select 1 as num group by num`.
        if (!select.tables())
            return 0;

        SipHash hash;
        hash.update(select.tables()->getTreeHash());
        if (const auto where = select.where())
            hash.update(where->getTreeHash());
        if (const auto group_by = select.groupBy())
            hash.update(group_by->getTreeHash());
        return hash.get64();
    }

private:
    CachePtr getHashTableStatsCache(const Params & params, const std::lock_guard<std::mutex> &)
    {
        if (!hash_table_stats || hash_table_stats->maxSize() != params.max_entries_for_hash_table_stats)
            hash_table_stats = std::make_shared<Cache>(params.max_entries_for_hash_table_stats);
        return hash_table_stats;
    }

    mutable std::mutex mutex;
    CachePtr hash_table_stats;
};
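// Sketch (not part of this diff): the cache policy of update() above in isolation. An
// entry grows to the maximum observed values, but is rewritten when a new observation
// falls below half of the prediction, i.e. the prediction went stale.
// std::unordered_map stands in for DB::LRUCache; all names here are illustrative.
#include <cstddef>
#include <cstdint>
#include <optional>
#include <unordered_map>

struct SizeEntry
{
    std::size_t sum_of_sizes = 0;
    std::size_t median_size = 0;
};

class StatsSketch
{
public:
    std::optional<SizeEntry> getSizeHint(std::uint64_t key) const
    {
        if (auto it = cache.find(key); it != cache.end())
            return it->second;
        return std::nullopt;
    }

    void update(std::uint64_t key, std::size_t sum_of_sizes, std::size_t median_size)
    {
        auto it = cache.find(key);
        // Overwrite when there is no entry, when the new values are larger (grow toward
        // the maximum), or when they dropped below half of the stored prediction.
        if (it == cache.end() || sum_of_sizes > it->second.sum_of_sizes
            || sum_of_sizes < it->second.sum_of_sizes / 2 || median_size > it->second.median_size
            || median_size < it->second.median_size / 2)
            cache[key] = SizeEntry{sum_of_sizes, median_size};
    }

private:
    std::unordered_map<std::uint64_t, SizeEntry> cache;
};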

HashTablesStatistics & getHashTablesStatistics()
{
    static HashTablesStatistics hash_tables_stats;
    return hash_tables_stats;
}

bool worthConvertToTwoLevel(
    size_t group_by_two_level_threshold, size_t result_size, size_t group_by_two_level_threshold_bytes, auto result_size_bytes)
{
    // params.group_by_two_level_threshold will be equal to 0 if we have only one thread to execute aggregation (refer to AggregatingStep::transformPipeline).
    return (group_by_two_level_threshold && result_size >= group_by_two_level_threshold)
        || (group_by_two_level_threshold_bytes && result_size_bytes >= static_cast<Int64>(group_by_two_level_threshold_bytes));
}

DB::AggregatedDataVariants::Type convertToTwoLevelTypeIfPossible(DB::AggregatedDataVariants::Type type)
{
    using Type = DB::AggregatedDataVariants::Type;
    switch (type)
    {
#define M(NAME) \
    case Type::NAME: \
        return Type::NAME##_two_level;
        APPLY_FOR_VARIANTS_CONVERTIBLE_TO_TWO_LEVEL(M)
#undef M
        default:
            return type;
    }
    __builtin_unreachable();
}

void initDataVariantsWithSizeHint(
    DB::AggregatedDataVariants & result, DB::AggregatedDataVariants::Type method_chosen, const DB::Aggregator::Params & params)
{
    const auto & stats_collecting_params = params.stats_collecting_params;
    if (stats_collecting_params.isCollectionAndUseEnabled())
    {
        if (auto hint = getHashTablesStatistics().getSizeHint(stats_collecting_params))
        {
            const auto max_threads = params.group_by_two_level_threshold != 0 ? std::max(params.max_threads, 1ul) : 1;
            const auto lower_limit = hint->sum_of_sizes / max_threads;
            const auto upper_limit = stats_collecting_params.max_size_to_preallocate_for_aggregation / max_threads;
            const auto adjusted = std::min(std::max(lower_limit, hint->median_size), upper_limit);
            if (worthConvertToTwoLevel(
                    params.group_by_two_level_threshold,
                    hint->sum_of_sizes,
                    /*group_by_two_level_threshold_bytes*/ 0,
                    /*result_size_bytes*/ 0))
                method_chosen = convertToTwoLevelTypeIfPossible(method_chosen);
            result.init(method_chosen, adjusted);
            ProfileEvents::increment(ProfileEvents::AggregationHashTablesInitializedAsTwoLevel, result.isTwoLevel());
            return;
        }
    }
    result.init(method_chosen);
}

/// Collection and use of the statistics should be enabled.
void updateStatistics(const DB::ManyAggregatedDataVariants & data_variants, const DB::Aggregator::Params::StatsCollectingParams & params)
{
    if (!params.isCollectionAndUseEnabled())
        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Collection and use of the statistics should be enabled.");

    std::vector<size_t> sizes(data_variants.size());
    for (size_t i = 0; i < data_variants.size(); ++i)
        sizes[i] = data_variants[i]->size();
    const auto median_size = sizes.begin() + sizes.size() / 2; // not precisely though...
    std::nth_element(sizes.begin(), median_size, sizes.end());
    const auto sum_of_sizes = std::accumulate(sizes.begin(), sizes.end(), 0ull);
    getHashTablesStatistics().update(sum_of_sizes, *median_size, params);
}
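// Sketch (not part of this diff): the median trick used by updateStatistics().
// std::nth_element only partially sorts, which is enough to place the middle element; for
// an even count it picks the upper of the two middle values, hence the
// "not precisely though..." remark above.
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
    std::vector<std::size_t> sizes{40, 10, 30, 20};
    const auto median_it = sizes.begin() + sizes.size() / 2;
    std::nth_element(sizes.begin(), median_it, sizes.end());
    const auto sum = std::accumulate(sizes.begin(), sizes.end(), 0ull);
    std::printf("median=%zu sum=%llu\n", *median_it, sum); // prints: median=30 sum=100
}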

// The std::is_constructible trait isn't suitable here because some classes have template constructors with semantics different from providing size hints.
// Also string hash table variants are not supported due to the fact that both local perf tests and tests in CI showed slowdowns for them.
template <typename...>
struct HasConstructorOfNumberOfElements : std::false_type
{
};

template <typename... Ts>
struct HasConstructorOfNumberOfElements<HashMapTable<Ts...>> : std::true_type
{
};

template <typename Key, typename Cell, typename Hash, typename Grower, typename Allocator, template <typename...> typename ImplTable>
struct HasConstructorOfNumberOfElements<TwoLevelHashMapTable<Key, Cell, Hash, Grower, Allocator, ImplTable>> : std::true_type
{
};

template <typename... Ts>
struct HasConstructorOfNumberOfElements<HashTable<Ts...>> : std::true_type
{
};

template <typename... Ts>
struct HasConstructorOfNumberOfElements<TwoLevelHashTable<Ts...>> : std::true_type
{
};

template <template <typename> typename Method, typename Base>
struct HasConstructorOfNumberOfElements<Method<Base>> : HasConstructorOfNumberOfElements<Base>
{
};

template <typename Method>
auto constructWithReserveIfPossible(size_t size_hint)
{
    if constexpr (HasConstructorOfNumberOfElements<typename Method::Data>::value)
    {
        ProfileEvents::increment(ProfileEvents::AggregationPreallocatedElementsInHashTables, size_hint);
        return std::make_unique<Method>(size_hint);
    }
    else
        return std::make_unique<Method>();
}
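// Sketch (not part of this diff): the explicit opt-in trait idiom above, reduced to toy
// types. A trait specialization, rather than std::is_constructible, decides whether the
// size hint is forwarded to the constructor. ReservingMap/PlainMap are illustrative.
#include <cstddef>
#include <memory>
#include <type_traits>

template <typename>
struct HasSizeHintCtor : std::false_type { };

struct PlainMap { };
struct ReservingMap { explicit ReservingMap(std::size_t /*reserved*/) { } };
template <>
struct HasSizeHintCtor<ReservingMap> : std::true_type { };

template <typename T>
std::unique_ptr<T> makeWithReserveIfPossible(std::size_t size_hint)
{
    if constexpr (HasSizeHintCtor<T>::value)
        return std::make_unique<T>(size_hint); // preallocates
    else
        return std::make_unique<T>(); // silently ignores the hint
}

int main()
{
    auto a = makeWithReserveIfPossible<ReservingMap>(1024);
    auto b = makeWithReserveIfPossible<PlainMap>(1024);
    (void)a; (void)b;
}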
}

namespace DB
@ -64,6 +291,10 @@ AggregatedDataVariants::~AggregatedDataVariants()
    }
}

std::optional<HashTablesCacheStatistics> getHashTablesCacheStatistics()
{
    return getHashTablesStatistics().getCacheStats();
}

void AggregatedDataVariants::convertToTwoLevel()
{
@ -88,6 +319,47 @@ void AggregatedDataVariants::convertToTwoLevel()
    }
}

void AggregatedDataVariants::init(Type type_, std::optional<size_t> size_hint)
{
    switch (type_)
    {
        case Type::EMPTY:
            break;
        case Type::without_key:
            break;

#define M(NAME, IS_TWO_LEVEL) \
    case Type::NAME: \
        if (size_hint) \
            (NAME) = constructWithReserveIfPossible<decltype(NAME)::element_type>(*size_hint); \
        else \
            (NAME) = std::make_unique<decltype(NAME)::element_type>(); \
        break;
        APPLY_FOR_AGGREGATED_VARIANTS(M)
#undef M
    }

    type = type_;
}

Aggregator::Params::StatsCollectingParams::StatsCollectingParams() = default;

Aggregator::Params::StatsCollectingParams::StatsCollectingParams(
    const ASTPtr & select_query_,
    bool collect_hash_table_stats_during_aggregation_,
    size_t max_entries_for_hash_table_stats_,
    size_t max_size_to_preallocate_for_aggregation_)
    : key(collect_hash_table_stats_during_aggregation_ ? HashTablesStatistics::calculateCacheKey(select_query_) : 0)
    , max_entries_for_hash_table_stats(max_entries_for_hash_table_stats_)
    , max_size_to_preallocate_for_aggregation(max_size_to_preallocate_for_aggregation_)
{
}

bool Aggregator::Params::StatsCollectingParams::isCollectionAndUseEnabled() const
{
    return key != 0;
}

Block Aggregator::getHeader(bool final) const
{
    return params.getHeader(final);
@ -237,8 +509,7 @@ public:

#endif

Aggregator::Aggregator(const Params & params_)
    : params(params_)
Aggregator::Aggregator(const Params & params_) : params(params_)
{
    /// Use query-level memory tracker
    if (auto * memory_tracker_child = CurrentThread::getMemoryTracker())
@ -292,7 +563,6 @@ Aggregator::Aggregator(const Params & params_)
#if USE_EMBEDDED_COMPILER
    compileAggregateFunctionsIfNeeded();
#endif

}

#if USE_EMBEDDED_COMPILER
@ -958,7 +1228,7 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData
    /// How to perform the aggregation?
    if (result.empty())
    {
        result.init(method_chosen);
        initDataVariantsWithSizeHint(result, method_chosen, params);
        result.keys_size = params.keys_size;
        result.key_sizes = key_sizes;
        LOG_TRACE(log, "Aggregation method: {}", result.getMethodName());
@ -1038,9 +1308,8 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData
    /// Here all the results in the sum are taken into account, from different threads.
    auto result_size_bytes = current_memory_usage - memory_usage_before_aggregation;

    bool worth_convert_to_two_level
        = (params.group_by_two_level_threshold && result_size >= params.group_by_two_level_threshold)
        || (params.group_by_two_level_threshold_bytes && result_size_bytes >= static_cast<Int64>(params.group_by_two_level_threshold_bytes));
    bool worth_convert_to_two_level = worthConvertToTwoLevel(
        params.group_by_two_level_threshold, result_size, params.group_by_two_level_threshold_bytes, result_size_bytes);

    /** Converting to a two-level data structure.
      * It allows you to make, in the subsequent, an effective merge - either economical from memory or parallel.
@ -1327,10 +1596,7 @@ void Aggregator::convertToBlockImpl(

template <typename Mapped>
inline void Aggregator::insertAggregatesIntoColumns(
    Mapped & mapped,
    MutableColumns & final_aggregate_columns,
    Arena * arena) const
inline void Aggregator::insertAggregatesIntoColumns(Mapped & mapped, MutableColumns & final_aggregate_columns, Arena * arena) const
{
    /** Final values of aggregate functions are inserted to columns.
      * Then states of aggregate functions, that are not longer needed, are destroyed.
@ -2179,6 +2445,9 @@ ManyAggregatedDataVariants Aggregator::prepareVariantsToMerge(ManyAggregatedData

    LOG_TRACE(log, "Merging aggregated data");

    if (params.stats_collecting_params.isCollectionAndUseEnabled())
        updateStatistics(data_variants, params.stats_collecting_params);

    ManyAggregatedDataVariants non_empty_data;
    non_empty_data.reserve(data_variants.size());
    for (auto & data : data_variants)
@ -2388,9 +2657,8 @@ bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool
    /// Here all the results in the sum are taken into account, from different threads.
    auto result_size_bytes = current_memory_usage - memory_usage_before_aggregation;

    bool worth_convert_to_two_level
        = (params.group_by_two_level_threshold && result_size >= params.group_by_two_level_threshold)
        || (params.group_by_two_level_threshold_bytes && result_size_bytes >= static_cast<Int64>(params.group_by_two_level_threshold_bytes));
    bool worth_convert_to_two_level = worthConvertToTwoLevel(
        params.group_by_two_level_threshold, result_size, params.group_by_two_level_threshold_bytes, result_size_bytes);

    /** Converting to a two-level data structure.
      * It allows you to make, in the subsequent, an effective merge - either economical from memory or parallel.
@ -34,6 +34,7 @@
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnLowCardinality.h>

#include <Parsers/IAST_fwd.h>

namespace DB
{
@ -129,6 +130,7 @@ private:
template <typename Base>
struct AggregationDataWithNullKeyTwoLevel : public Base
{
    using Base::Base;
    using Base::impls;

    AggregationDataWithNullKeyTwoLevel() = default;
@ -183,6 +185,8 @@ struct AggregationMethodOneNumber

    AggregationMethodOneNumber() = default;

    explicit AggregationMethodOneNumber(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodOneNumber(const Other & other) : data(other.data)
    {
@ -225,6 +229,8 @@ struct AggregationMethodString
    {
    }

    explicit AggregationMethodString(size_t size_hint) : data(size_hint) { }

    using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped>;

    static const bool low_cardinality_optimization = false;
@ -250,6 +256,8 @@ struct AggregationMethodStringNoCache

    AggregationMethodStringNoCache() = default;

    explicit AggregationMethodStringNoCache(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodStringNoCache(const Other & other) : data(other.data)
    {
@ -280,6 +288,8 @@ struct AggregationMethodFixedString

    AggregationMethodFixedString() = default;

    explicit AggregationMethodFixedString(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodFixedString(const Other & other) : data(other.data)
    {
@ -309,6 +319,8 @@ struct AggregationMethodFixedStringNoCache

    AggregationMethodFixedStringNoCache() = default;

    explicit AggregationMethodFixedStringNoCache(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodFixedStringNoCache(const Other & other) : data(other.data)
    {
@ -382,6 +394,8 @@ struct AggregationMethodKeysFixed

    AggregationMethodKeysFixed() = default;

    explicit AggregationMethodKeysFixed(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodKeysFixed(const Other & other) : data(other.data)
    {
@ -473,6 +487,8 @@ struct AggregationMethodSerialized

    AggregationMethodSerialized() = default;

    explicit AggregationMethodSerialized(size_t size_hint) : data(size_hint) { }

    template <typename Other>
    explicit AggregationMethodSerialized(const Other & other) : data(other.data)
    {
@ -652,21 +668,7 @@ struct AggregatedDataVariants : private boost::noncopyable

    ~AggregatedDataVariants();

    void init(Type type_)
    {
        switch (type_)
        {
            case Type::EMPTY: break;
            case Type::without_key: break;

#define M(NAME, IS_TWO_LEVEL) \
            case Type::NAME: (NAME) = std::make_unique<decltype(NAME)::element_type>(); break;
        APPLY_FOR_AGGREGATED_VARIANTS(M)
#undef M
        }

        type = type_;
    }
    void init(Type type_, std::optional<size_t> size_hint = std::nullopt);

    /// Number of rows (different keys).
    size_t size() const
@ -929,29 +931,61 @@ public:
        bool compile_aggregate_expressions;
        size_t min_count_to_compile_aggregate_expression;

        struct StatsCollectingParams
        {
            StatsCollectingParams();

            StatsCollectingParams(
                const ASTPtr & select_query_,
                bool collect_hash_table_stats_during_aggregation_,
                size_t max_entries_for_hash_table_stats_,
                size_t max_size_to_preallocate_for_aggregation_);

            bool isCollectionAndUseEnabled() const;

            const UInt64 key = 0;
            const size_t max_entries_for_hash_table_stats = 0;
            const size_t max_size_to_preallocate_for_aggregation = 0;
        };
        StatsCollectingParams stats_collecting_params;

        Params(
            const Block & src_header_,
            const ColumnNumbers & keys_, const AggregateDescriptions & aggregates_,
            bool overflow_row_, size_t max_rows_to_group_by_, OverflowMode group_by_overflow_mode_,
            size_t group_by_two_level_threshold_, size_t group_by_two_level_threshold_bytes_,
            const ColumnNumbers & keys_,
            const AggregateDescriptions & aggregates_,
            bool overflow_row_,
            size_t max_rows_to_group_by_,
            OverflowMode group_by_overflow_mode_,
            size_t group_by_two_level_threshold_,
            size_t group_by_two_level_threshold_bytes_,
            size_t max_bytes_before_external_group_by_,
            bool empty_result_for_aggregation_by_empty_set_,
            VolumePtr tmp_volume_, size_t max_threads_,
            VolumePtr tmp_volume_,
            size_t max_threads_,
            size_t min_free_disk_space_,
            bool compile_aggregate_expressions_,
            size_t min_count_to_compile_aggregate_expression_,
            const Block & intermediate_header_ = {})
            : src_header(src_header_),
            intermediate_header(intermediate_header_),
            keys(keys_), aggregates(aggregates_), keys_size(keys.size()), aggregates_size(aggregates.size()),
            overflow_row(overflow_row_), max_rows_to_group_by(max_rows_to_group_by_), group_by_overflow_mode(group_by_overflow_mode_),
            group_by_two_level_threshold(group_by_two_level_threshold_), group_by_two_level_threshold_bytes(group_by_two_level_threshold_bytes_),
            max_bytes_before_external_group_by(max_bytes_before_external_group_by_),
            empty_result_for_aggregation_by_empty_set(empty_result_for_aggregation_by_empty_set_),
            tmp_volume(tmp_volume_), max_threads(max_threads_),
            min_free_disk_space(min_free_disk_space_),
            compile_aggregate_expressions(compile_aggregate_expressions_),
            min_count_to_compile_aggregate_expression(min_count_to_compile_aggregate_expression_)
            const Block & intermediate_header_ = {},
            const StatsCollectingParams & stats_collecting_params_ = {})
            : src_header(src_header_)
            , intermediate_header(intermediate_header_)
            , keys(keys_)
            , aggregates(aggregates_)
            , keys_size(keys.size())
            , aggregates_size(aggregates.size())
            , overflow_row(overflow_row_)
            , max_rows_to_group_by(max_rows_to_group_by_)
            , group_by_overflow_mode(group_by_overflow_mode_)
            , group_by_two_level_threshold(group_by_two_level_threshold_)
            , group_by_two_level_threshold_bytes(group_by_two_level_threshold_bytes_)
            , max_bytes_before_external_group_by(max_bytes_before_external_group_by_)
            , empty_result_for_aggregation_by_empty_set(empty_result_for_aggregation_by_empty_set_)
            , tmp_volume(tmp_volume_)
            , max_threads(max_threads_)
            , min_free_disk_space(min_free_disk_space_)
            , compile_aggregate_expressions(compile_aggregate_expressions_)
            , min_count_to_compile_aggregate_expression(min_count_to_compile_aggregate_expression_)
            , stats_collecting_params(stats_collecting_params_)
        {
        }

@ -1350,4 +1384,13 @@ APPLY_FOR_AGGREGATED_VARIANTS(M)

#undef M


struct HashTablesCacheStatistics
{
    size_t entries = 0;
    size_t hits = 0;
    size_t misses = 0;
};

std::optional<HashTablesCacheStatistics> getHashTablesCacheStatistics();
}
@ -1,3 +1,4 @@
#include <Interpreters/Aggregator.h>
#include <Interpreters/AsynchronousMetrics.h>
#include <Interpreters/AsynchronousMetricLog.h>
#include <Interpreters/JIT/CompiledExpressionCache.h>
@ -630,6 +631,15 @@ void AsynchronousMetrics::update(std::chrono::system_clock::time_point update_ti

    new_values["Uptime"] = getContext()->getUptimeSeconds();

    {
        if (const auto stats = getHashTablesCacheStatistics())
        {
            new_values["HashTableStatsCacheEntries"] = stats->entries;
            new_values["HashTableStatsCacheHits"] = stats->hits;
            new_values["HashTableStatsCacheMisses"] = stats->misses;
        }
    }

    /// Process process memory usage according to OS
#if defined(OS_LINUX) || defined(OS_FREEBSD)
    {
@ -169,6 +169,7 @@ public:
        if (columns.size() != float_features_count + cat_features_count)
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "Number of columns is different with number of features: columns size {} float features size {} + cat features size {}",
                columns.size(),
                float_features_count,
                cat_features_count);

@ -69,6 +69,7 @@ public:
    /// All below are parameters related to initial query.

    Interface interface = Interface::TCP;
    bool is_secure = false;

    /// For tcp
    String os_user;
@ -1092,6 +1092,17 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression)
    if (!res)
    {
        TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().get(table_expression, shared_from_this());
        if (table_function_ptr->needStructureHint())
        {
            const auto & insertion_table = getInsertionTable();
            if (!insertion_table.empty())
            {
                const auto & structure_hint
                    = DatabaseCatalog::instance().getTable(insertion_table, shared_from_this())->getInMemoryMetadataPtr()->columns;
                table_function_ptr->setStructureHint(structure_hint);
            }
        }

        res = table_function_ptr->execute(table_expression, shared_from_this(), table_function_ptr->getName());

        /// Since ITableFunction::parseArguments() may change table_expression, i.e.:

@ -15,6 +15,7 @@
#include <Common/isLocalAddress.h>
#include <base/types.h>
#include <Storages/MergeTree/ParallelReplicasReadingCoordinator.h>
#include <Storages/ColumnsDescription.h>


#include "config_core.h"
@ -233,7 +233,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
    {
        assert(!db_and_table.first && !db_and_table.second);
        if (exception)
            exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs());
            exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE);
        return {};
    }

@ -263,7 +263,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
        /// If table_id has no UUID, then the name of database was specified by user and table_id was not resolved through context.
        /// Do not allow access to TEMPORARY_DATABASE because it contains all temporary tables of all contexts and users.
        if (exception)
            exception->emplace(ErrorCodes::DATABASE_ACCESS_DENIED, "Direct access to `{}` database is not allowed", String(TEMPORARY_DATABASE));
            exception->emplace(fmt::format("Direct access to `{}` database is not allowed", TEMPORARY_DATABASE), ErrorCodes::DATABASE_ACCESS_DENIED);
        return {};
    }

@ -274,7 +274,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(
        if (databases.end() == it)
        {
            if (exception)
                exception->emplace(ErrorCodes::UNKNOWN_DATABASE, "Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName()));
                exception->emplace(fmt::format("Database {} doesn't exist", backQuoteIfNeed(table_id.getDatabaseName())), ErrorCodes::UNKNOWN_DATABASE);
            return {};
        }
        database = it->second;
@ -282,7 +282,7 @@ DatabaseAndTable DatabaseCatalog::getTableImpl(

    auto table = database->tryGetTable(table_id.table_name, context_);
    if (!table && exception)
        exception->emplace(ErrorCodes::UNKNOWN_TABLE, "Table {} doesn't exist", table_id.getNameForLogs());
        exception->emplace(fmt::format("Table {} doesn't exist", table_id.getNameForLogs()), ErrorCodes::UNKNOWN_TABLE);
    if (!table)
        database = nullptr;
@ -508,7 +509,9 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(

            default_expr_list->children.emplace_back(
                setAlias(
                    col_decl.default_expression->clone(),
                    col_decl.default_specifier == "EPHEMERAL" ? /// can be ASTLiteral::value NULL
                        std::make_shared<ASTLiteral>(data_type_ptr->getDefault()) :
                        col_decl.default_expression->clone(),
                    tmp_column_name));
        }
        else
@ -536,7 +538,11 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription(

        if (col_decl.default_expression)
        {
            ASTPtr default_expr = col_decl.default_expression->clone();
            ASTPtr default_expr =
                col_decl.default_specifier == "EPHEMERAL" && col_decl.default_expression->as<ASTLiteral>()->value.isNull() ?
                    std::make_shared<ASTLiteral>(DataTypeFactory::instance().get(col_decl.type)->getDefault()) :
                    col_decl.default_expression->clone();

            if (col_decl.type)
                column.type = name_type_it->type;
            else
@ -2082,6 +2082,12 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac

    const Settings & settings = context->getSettingsRef();

    const auto stats_collecting_params = Aggregator::Params::StatsCollectingParams(
        query_ptr,
        settings.collect_hash_table_stats_during_aggregation,
        settings.max_entries_for_hash_table_stats,
        settings.max_size_to_preallocate_for_aggregation);

    Aggregator::Params params(
        header_before_aggregation,
        keys,
@ -2099,7 +2105,9 @@
        settings.max_threads,
        settings.min_free_disk_space_for_temporary_data,
        settings.compile_aggregate_expressions,
        settings.min_count_to_compile_aggregate_expression);
        settings.min_count_to_compile_aggregate_expression,
        Block{},
        stats_collecting_params);

    SortDescription group_by_sort_description;
@ -86,6 +86,7 @@ NamesAndTypesList QueryLogElement::getNamesAndTypes()
        {"initial_query_start_time", std::make_shared<DataTypeDateTime>()},
        {"initial_query_start_time_microseconds", std::make_shared<DataTypeDateTime64>(6)},
        {"interface", std::make_shared<DataTypeUInt8>()},
        {"is_secure", std::make_shared<DataTypeUInt8>()},
        {"os_user", std::make_shared<DataTypeString>()},
        {"client_hostname", std::make_shared<DataTypeString>()},
        {"client_name", std::make_shared<DataTypeString>()},
@ -275,6 +276,7 @@ void QueryLogElement::appendClientInfo(const ClientInfo & client_info, MutableCo
    columns[i++]->insert(client_info.initial_query_start_time_microseconds);

    columns[i++]->insert(UInt64(client_info.interface));
    columns[i++]->insert(static_cast<UInt64>(client_info.is_secure));

    columns[i++]->insert(client_info.os_user);
    columns[i++]->insert(client_info.client_hostname);

@ -56,6 +56,7 @@ NamesAndTypesList QueryThreadLogElement::getNamesAndTypes()
        {"initial_query_start_time", std::make_shared<DataTypeDateTime>()},
        {"initial_query_start_time_microseconds", std::make_shared<DataTypeDateTime64>(6)},
        {"interface", std::make_shared<DataTypeUInt8>()},
        {"is_secure", std::make_shared<DataTypeUInt8>()},
        {"os_user", std::make_shared<DataTypeString>()},
        {"client_hostname", std::make_shared<DataTypeString>()},
        {"client_name", std::make_shared<DataTypeString>()},
@ -243,7 +243,7 @@ void Session::shutdownNamedSessions()
    NamedSessionsStorage::instance().shutdown();
}

Session::Session(const ContextPtr & global_context_, ClientInfo::Interface interface_)
Session::Session(const ContextPtr & global_context_, ClientInfo::Interface interface_, bool is_secure)
    : auth_id(UUIDHelpers::generateV4()),
      global_context(global_context_),
      interface(interface_),
@ -251,6 +251,7 @@ Session::Session(const ContextPtr & global_context_, ClientInfo::Interface inter
{
    prepared_client_info.emplace();
    prepared_client_info->interface = interface_;
    prepared_client_info->is_secure = is_secure;
}

Session::~Session()

@ -32,7 +32,7 @@ public:
    /// Stops using named sessions. The method must be called at the server shutdown.
    static void shutdownNamedSessions();

    Session(const ContextPtr & global_context_, ClientInfo::Interface interface_);
    Session(const ContextPtr & global_context_, ClientInfo::Interface interface_, bool is_secure = false);
    ~Session();

    Session(const Session &&) = delete;
@ -320,12 +320,13 @@ Chunk DDLQueryStatusSource::generate()
            if (throw_on_timeout)
            {
                if (!first_exception)
                    first_exception = std::make_unique<Exception>(ErrorCodes::TIMEOUT_EXCEEDED, msg_format,
                        node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts);
                    first_exception = std::make_unique<Exception>(
                        fmt::format(msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts),
                        ErrorCodes::TIMEOUT_EXCEEDED);
                return {};
            }

            LOG_INFO(log, fmt::runtime(msg_format), node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts);
            LOG_INFO(log, msg_format, node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts);

            NameSet unfinished_hosts = waiting_hosts;
            for (const auto & host_id : finished_hosts)
@ -358,9 +359,12 @@ Chunk DDLQueryStatusSource::generate()
                /// Paradoxically, this exception will be throw even in case of "never_throw" mode.

                if (!first_exception)
                    first_exception = std::make_unique<Exception>(ErrorCodes::UNFINISHED,
                        "Cannot provide query execution status. The query's node {} has been deleted by the cleaner"
                        " since it was finished (or its lifetime is expired)", node_path);
                    first_exception = std::make_unique<Exception>(
                        fmt::format(
                            "Cannot provide query execution status. The query's node {} has been deleted by the cleaner"
                            " since it was finished (or its lifetime is expired)",
                            node_path),
                        ErrorCodes::UNFINISHED);
                return {};
            }

@ -386,7 +390,8 @@ Chunk DDLQueryStatusSource::generate()
            if (status.code != 0 && !first_exception
                && context->getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW)
            {
                first_exception = std::make_unique<Exception>(status.code, "There was an error on [{}:{}]: {}", host, port, status.message);
                first_exception = std::make_unique<Exception>(
                    fmt::format("There was an error on [{}:{}]: {}", host, port, status.message), status.code);
            }

            ++num_hosts_finished;
@ -9,30 +9,31 @@ namespace DB

CallbackRunner threadPoolCallbackRunner(ThreadPool & pool)
{
    return [pool = &pool, thread_group = CurrentThread::getGroup()](auto callback)
    return [pool = &pool, thread_group = CurrentThread::getGroup()](auto callback) mutable
    {
        pool->scheduleOrThrow([callback = std::move(callback), thread_group]()
        {
            if (thread_group)
                CurrentThread::attachTo(thread_group);

            SCOPE_EXIT_SAFE({
        pool->scheduleOrThrow(
            [&, callback = std::move(callback), thread_group]()
            {
                if (thread_group)
                    CurrentThread::detachQueryIfNotDetached();
                    CurrentThread::attachTo(thread_group);

                /// After we detached from the thread_group, parent for memory_tracker inside ThreadStatus will be reset to it's parent.
                /// Typically, it may be changes from Process to User.
                /// Usually it could be ok, because thread pool task is executed before user-level memory tracker is destroyed.
                /// However, thread could stay alive inside the thread pool, and it's ThreadStatus as well.
                /// When, finally, we destroy the thread (and the ThreadStatus),
                /// it can use memory tracker in the ~ThreadStatus in order to alloc/free untracked_memory,
                /// and by this time user-level memory tracker may be already destroyed.
                ///
                /// As a work-around, reset memory tracker to total, which is always alive.
                CurrentThread::get().memory_tracker.setParent(&total_memory_tracker);
                SCOPE_EXIT_SAFE({
                    if (thread_group)
                        CurrentThread::detachQueryIfNotDetached();

                    /// After we detached from the thread_group, parent for memory_tracker inside ThreadStatus will be reset to it's parent.
                    /// Typically, it may be changes from Process to User.
                    /// Usually it could be ok, because thread pool task is executed before user-level memory tracker is destroyed.
                    /// However, thread could stay alive inside the thread pool, and it's ThreadStatus as well.
                    /// When, finally, we destroy the thread (and the ThreadStatus),
                    /// it can use memory tracker in the ~ThreadStatus in order to alloc/free untracked_memory,
                    /// and by this time user-level memory tracker may be already destroyed.
                    ///
                    /// As a work-around, reset memory tracker to total, which is always alive.
                    CurrentThread::get().memory_tracker.setParent(&total_memory_tracker);
                });
                callback();
            });
        callback();
        });
    };
}
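// Sketch (not part of this diff): the CallbackRunner shape in plain C++. A factory
// captures an executor and returns a callable that schedules work on it; ClickHouse's
// version additionally re-attaches the caller's thread group and resets the memory
// tracker, as the comments above explain. The detached std::thread is a deliberately
// naive stand-in for pool->scheduleOrThrow().
#include <functional>
#include <thread>
#include <utility>

using CallbackRunner = std::function<void(std::function<void()>)>;

inline CallbackRunner detachedThreadCallbackRunner()
{
    return [](std::function<void()> callback)
    {
        std::thread(std::move(callback)).detach();
    };
}

// Usage: auto schedule = detachedThreadCallbackRunner();
//        schedule([] { /* runs on another thread */ });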

@ -1,6 +1,7 @@
#include <Parsers/ASTColumnDeclaration.h>
#include <Common/quoteString.h>
#include <IO/Operators.h>
#include <Parsers/ASTLiteral.h>


namespace DB
@ -71,8 +72,12 @@ void ASTColumnDeclaration::formatImpl(const FormatSettings & settings, FormatSta

    if (default_expression)
    {
        settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") << default_specifier << (settings.hilite ? hilite_none : "") << ' ';
        default_expression->formatImpl(settings, state, frame);
        settings.ostr << ' ' << (settings.hilite ? hilite_keyword : "") << default_specifier << (settings.hilite ? hilite_none : "");
        if (default_specifier != "EPHEMERAL" || !default_expression->as<ASTLiteral>()->value.isNull())
        {
            settings.ostr << ' ';
            default_expression->formatImpl(settings, state, frame);
        }
    }

    if (comment)