Merge master

kssenii 2022-06-13 17:35:48 +02:00
commit 4813b90281
165 changed files with 3507 additions and 875 deletions

View File

@ -9,7 +9,7 @@ std::string errnoToString(int code, int the_errno)
char buf[buf_size];
#ifndef _GNU_SOURCE
int rc = strerror_r(the_errno, buf, buf_size);
#ifdef __APPLE__
#ifdef OS_DARWIN
if (rc != 0 && rc != EINVAL)
#else
if (rc != 0)

View File

@ -16,7 +16,7 @@ uint64_t getAvailableMemoryAmountOrZero()
{
#if defined(_SC_PHYS_PAGES) // linux
return getPageSize() * sysconf(_SC_PHYS_PAGES);
#elif defined(__FreeBSD__)
#elif defined(OS_FREEBSD)
struct vmtotal vmt;
size_t vmt_size = sizeof(vmt);
if (sysctlbyname("vm.vmtotal", &vmt, &vmt_size, NULL, 0) == 0)

View File

@ -6,7 +6,7 @@
#include <base/defines.h>
#if defined(__linux__) && !defined(THREAD_SANITIZER) && !defined(USE_MUSL)
#if defined(OS_LINUX) && !defined(THREAD_SANITIZER) && !defined(USE_MUSL)
#define USE_PHDR_CACHE 1
#endif

View File

@ -77,6 +77,7 @@ if (OS_LINUX AND NOT LINKER_NAME)
if (NOT LINKER_NAME)
if (GOLD_PATH)
message (WARNING "Linking with gold is not recommended. Please use lld.")
if (COMPILER_GCC)
set (LINKER_NAME "gold")
else ()

View File

@ -76,9 +76,7 @@ message (STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
message (STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")
# ld: unknown option: --color-diagnostics
if (APPLE)
set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
endif ()
set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
# Do not adjust RPATH in llvm, since then it will not be able to find libcxx/libcxxabi/libunwind
set (CMAKE_INSTALL_RPATH "ON")

View File

@ -21,7 +21,9 @@ By default, starting above server instance will be run as default user without p
### connect to it from a native client
```bash
$ docker run -it --rm --link some-clickhouse-server:clickhouse-server clickhouse/clickhouse-client --host clickhouse-server
$ docker run -it --rm --link some-clickhouse-server:clickhouse-server --entrypoint clickhouse-client clickhouse/clickhouse-server --host clickhouse-server
# OR
$ docker exec -it some-clickhouse-server clickhouse-client
```
More information about [ClickHouse client](https://clickhouse.com/docs/en/interfaces/cli/).
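As a quick smoke test of either approach, a single query can be passed straight to the client; the container name `some-clickhouse-server` is the one used in the examples above:
```bash
$ docker exec -it some-clickhouse-server clickhouse-client --query "SELECT version()"
```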

View File

@ -7,22 +7,12 @@ RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
python3-requests \
llvm-9
&& apt-get clean
COPY s3downloader /s3downloader
ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com"
ENV DATASETS="hits visits"
ENV EXPORT_S3_STORAGE_POLICIES=1
# Download Minio-related binaries
RUN arch=${TARGETARCH:-amd64} \
&& if [ "$arch" = "amd64" ] ; then wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.x86_64.rpm"; else wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.aarch64.rpm" ; fi \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/mc" \
&& chmod +x ./mc
ENV MINIO_ROOT_USER="clickhouse"
ENV MINIO_ROOT_PASSWORD="clickhouse"
COPY setup_minio.sh /
COPY run.sh /
CMD ["/bin/bash", "/run.sh"]

View File

@ -17,7 +17,7 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
./setup_minio.sh
./setup_minio.sh stateful
function start()
{

View File

@ -1,77 +0,0 @@
#!/bin/bash
# TODO: Make this file shared with stateless tests
#
# Usage for local run:
#
# ./docker/test/stateful/setup_minio.sh ./tests/
#
set -e -x -a -u
rpm2cpio ./minio-20220103182258.0.0.*.rpm | cpio -i --make-directories
find / -name minio
cp ./usr/local/bin/minio ./
ls -lha
mkdir -p ./minio_data
if [ ! -f ./minio ]; then
echo 'MinIO binary not found, downloading...'
BINARY_TYPE=$(uname -s | tr '[:upper:]' '[:lower:]')
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-amd64/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-amd64/mc" \
&& chmod +x ./mc
fi
MINIO_ROOT_USER=${MINIO_ROOT_USER:-clickhouse}
MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-clickhouse}
./minio --version
./minio server --address ":11111" ./minio_data &
i=0
while ! curl -v --silent http://localhost:11111 2>&1 | grep AccessDenied
do
if [[ $i == 60 ]]; then
echo "Failed to setup minio"
exit 0
fi
echo "Trying to connect to minio"
sleep 1
i=$((i + 1))
done
lsof -i :11111
sleep 5
./mc alias set clickminio http://localhost:11111 clickhouse clickhouse
./mc admin user add clickminio test testtest
./mc admin policy set clickminio readwrite user=test
./mc mb clickminio/test
# Upload data to Minio. By default, after unpacking, all tests will be in
# /usr/share/clickhouse-test/queries
TEST_PATH=${1:-/usr/share/clickhouse-test}
MINIO_DATA_PATH=${TEST_PATH}/queries/1_stateful/data_minio
# Iterating over globs would make the FILE variable a path to a file, not just a filename
# shellcheck disable=SC2045
for FILE in $(ls "${MINIO_DATA_PATH}"); do
echo "$FILE";
./mc cp "${MINIO_DATA_PATH}"/"$FILE" clickminio/test/"$FILE";
done
mkdir -p ~/.aws
cat <<EOT >> ~/.aws/credentials
[default]
aws_access_key_id=clickhouse
aws_secret_access_key=clickhouse
EOT
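The credentials written here are the MinIO root user configured earlier in the script; if an AWS CLI happens to be available, they can be sanity-checked against the local MinIO endpoint and the `test` bucket created above:
```bash
# List the test bucket through MinIO's S3-compatible API
aws --endpoint-url=http://localhost:11111 s3 ls s3://test
```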

View File

@ -0,0 +1 @@
../stateless/setup_minio.sh
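The single line above is how git records a symlink: the stateful image now reuses the stateless setup script. A sketch of creating such a link from the repository root (paths assumed from the surrounding diff):
```bash
ln -sf ../stateless/setup_minio.sh docker/test/stateful/setup_minio.sh
```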

View File

@ -5,37 +5,36 @@ FROM clickhouse/test-base:$FROM_TAG
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz"
# golang version 1.13 on Ubuntu 20 is enough for tests
RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
awscli \
brotli \
expect \
zstd \
golang \
lsof \
mysql-client=8.0* \
ncdu \
netcat-openbsd \
openjdk-11-jre-headless \
openssl \
postgresql-client \
protobuf-compiler \
python3 \
python3-lxml \
python3-pip \
python3-requests \
python3-termcolor \
python3-pip \
qemu-user-static \
sqlite3 \
sudo \
# golang version 1.13 on Ubuntu 20 is enough for tests
golang \
telnet \
tree \
unixodbc \
wget \
mysql-client=8.0* \
postgresql-client \
sqlite3 \
awscli \
openjdk-11-jre-headless \
rpm2cpio \
cpio
zstd \
&& apt-get clean
RUN pip3 install numpy scipy pandas Jinja2
@ -53,13 +52,17 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
ENV NUM_TRIES=1
ENV MAX_RUN_TIME=0
# Unrelated to vars in setup_minio.sh, but should be the same there
# to have the same binaries for local running scenario
ARG MINIO_SERVER_VERSION=2022-01-03T18-22-58Z
ARG MINIO_CLIENT_VERSION=2022-01-05T23-52-51Z
ARG TARGETARCH
# Download Minio-related binaries
RUN arch=${TARGETARCH:-amd64} \
&& if [ "$arch" = "amd64" ] ; then wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.x86_64.rpm"; else wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.aarch64.rpm" ; fi \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/mc" \
&& chmod +x ./mc
&& wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio.RELEASE.${MINIO_SERVER_VERSION}" -O ./minio \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/archive/mc.RELEASE.${MINIO_CLIENT_VERSION}" -O ./mc \
&& chmod +x ./mc ./minio
RUN wget 'https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz' \

View File

@ -18,7 +18,7 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
./setup_minio.sh
./setup_minio.sh stateless
./setup_hdfs_minicluster.sh
# For flaky check we also enable thread fuzzer

View File

@ -1,29 +1,41 @@
#!/bin/bash
# Usage for local run:
#
# ./docker/test/stateless/setup_minio.sh ./tests/
#
USAGE='Usage for local run:
./docker/test/stateless/setup_minio.sh { stateful | stateless } ./tests/
'
set -e -x -a -u
rpm2cpio ./minio-20220103182258.0.0.*.rpm | cpio -i --make-directories
find / -name minio
cp ./usr/local/bin/minio ./
TEST_TYPE="$1"
shift
case $TEST_TYPE in
stateless) QUERY_DIR=0_stateless ;;
stateful) QUERY_DIR=1_stateful ;;
*) echo "unknown test type $TEST_TYPE"; echo "${USAGE}"; exit 1 ;;
esac
ls -lha
mkdir -p ./minio_data
if [ ! -f ./minio ]; then
MINIO_SERVER_VERSION=${MINIO_SERVER_VERSION:-2022-01-03T18-22-58Z}
MINIO_CLIENT_VERSION=${MINIO_CLIENT_VERSION:-2022-01-05T23-52-51Z}
case $(uname -m) in
x86_64) BIN_ARCH=amd64 ;;
aarch64) BIN_ARCH=arm64 ;;
*) echo "unknown architecture $(uname -m)"; exit 1 ;;
esac
echo 'MinIO binary not found, downloading...'
BINARY_TYPE=$(uname -s | tr '[:upper:]' '[:lower:]')
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-amd64/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-amd64/mc" \
&& chmod +x ./mc
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-${BIN_ARCH}/archive/minio.RELEASE.${MINIO_SERVER_VERSION}" -O ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-${BIN_ARCH}/archive/mc.RELEASE.${MINIO_CLIENT_VERSION}" -O ./mc \
&& chmod +x ./mc ./minio
fi
MINIO_ROOT_USER=${MINIO_ROOT_USER:-clickhouse}
@ -52,14 +64,16 @@ sleep 5
./mc admin user add clickminio test testtest
./mc admin policy set clickminio readwrite user=test
./mc mb clickminio/test
./mc policy set public clickminio/test
if [ "$TEST_TYPE" = "stateless" ]; then
./mc policy set public clickminio/test
fi
# Upload data to Minio. By default, after unpacking, all tests will be in
# /usr/share/clickhouse-test/queries
TEST_PATH=${1:-/usr/share/clickhouse-test}
MINIO_DATA_PATH=${TEST_PATH}/queries/0_stateless/data_minio
MINIO_DATA_PATH=${TEST_PATH}/queries/${QUERY_DIR}/data_minio
# Iterating over globs would make the FILE variable a path to a file, not just a filename
# shellcheck disable=SC2045
@ -71,6 +85,6 @@ done
mkdir -p ~/.aws
cat <<EOT >> ~/.aws/credentials
[default]
aws_access_key_id=clickhouse
aws_secret_access_key=clickhouse
aws_access_key_id=${MINIO_ROOT_USER}
aws_secret_access_key=${MINIO_ROOT_PASSWORD}
EOT
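Mirroring the `USAGE` string above, a local run of the reworked script might look like this; the version variables are optional and default to the values checked inside the script:
```bash
MINIO_SERVER_VERSION=2022-01-03T18-22-58Z \
MINIO_CLIENT_VERSION=2022-01-05T23-52-51Z \
./docker/test/stateless/setup_minio.sh stateless ./tests/
```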

View File

@ -174,7 +174,7 @@ install_packages package_folder
configure
./setup_minio.sh
./setup_minio.sh stateful # to have a proper environment
start

View File

@ -19,7 +19,7 @@ The following tutorial is based on the Ubuntu Linux system. With appropriate cha
### Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja}
``` bash
sudo apt-get install git cmake python ninja-build
sudo apt-get install git cmake ccache python3 ninja-build
```
Or cmake3 instead of cmake on older systems.
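On such systems the same step would be:
``` bash
sudo apt-get install git cmake3 ccache python3 ninja-build
```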

View File

@ -1,78 +1,139 @@
---
sidebar_label: Web Analytics Data
description: Dataset consists of two tables containing anonymized web analytics data with hits and visits
description: Dataset consisting of two tables containing anonymized web analytics data with hits and visits
---
# Anonymized Web Analytics Data
Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
This dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz.
The tables can be downloaded as compressed `tsv.xz` files. In addition to the sample worked with in this document, an extended (7.5GB) version of the `hits` table containing 100 million rows is available as TSV at [https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz](https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz).
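For reference, the extended 100-million-row table mentioned above can be downloaded and decompressed the same way as the smaller files below (the download alone is about 7.5 GB, and the decompressed TSV is considerably larger):
``` bash
curl https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz | unxz --threads=`nproc` > hits_100m_obfuscated_v1.tsv
```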
## Obtaining Tables from Prepared Partitions {#obtaining-tables-from-prepared-partitions}
## Download and ingest the data
Download and import hits table:
``` bash
curl -O https://datasets.clickhouse.com/hits/partitions/hits_v1.tar
tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits:
``` bash
curl -O https://datasets.clickhouse.com/visits/partitions/visits_v1.tar
tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
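The "check permissions" comments above refer to ownership of the unpacked files: assuming the server runs as the `clickhouse` user (the default for package installs), a typical fix is:
``` bash
# Give the unpacked partitions back to the user the server runs as
sudo chown -R clickhouse:clickhouse /var/lib/clickhouse/data /var/lib/clickhouse/metadata
```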
## Obtaining Tables from Compressed TSV File {#obtaining-tables-from-compressed-tsv-file}
Download and import hits from compressed TSV file:
### Download the hits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
# Validate the checksum
md5sum hits_v1.tsv
# Checksum should be equal to: f3631b6295bf06989c1437491f7592cb
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
# for hits_v1
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# for hits_100m_obfuscated
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
# import data
### Create the database and table
```bash
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
For hits_v1
```bash
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
Or for hits_100m_obfuscated
```bash
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
### Import the hits data:
```bash
cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL"
```
Verify the count of rows
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits from compressed tsv-file:
```response
8873898
```
### Download the visits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
# Validate the checksum
md5sum visits_v1.tsv
# Checksum should be equal to: 6dafe1a0f24e59e3fc2d0fed85601de6
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
### Create the visits table
```bash
clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, 
ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# import data
```
### Import the visits data
```bash
cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL"
```
Verify the count
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
## Example Queries {#example-queries}
```response
1680609
```
[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial.
## An example JOIN
Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there).
The hits and visits dataset is used in the ClickHouse test routines; this is one of the queries from the test suite. The rest of the tests are referenced in the *Next Steps* section at the end of this page.
```sql
clickhouse-client --query "SELECT
EventDate,
hits,
visits
FROM
(
SELECT
EventDate,
count() AS hits
FROM datasets.hits_v1
GROUP BY EventDate
) ANY LEFT JOIN
(
SELECT
StartDate AS EventDate,
sum(Sign) AS visits
FROM datasets.visits_v1
GROUP BY EventDate
) USING EventDate
ORDER BY hits DESC
LIMIT 10
SETTINGS joined_subquery_requires_alias = 0
FORMAT PrettyCompact"
```
```response
┌──EventDate─┬────hits─┬─visits─┐
│ 2014-03-17 │ 1406958 │ 265108 │
│ 2014-03-19 │ 1405797 │ 261624 │
│ 2014-03-18 │ 1383658 │ 258723 │
│ 2014-03-20 │ 1353623 │ 255328 │
│ 2014-03-21 │ 1245779 │ 236232 │
│ 2014-03-23 │ 1046491 │ 202212 │
│ 2014-03-22 │ 1031592 │ 197354 │
└────────────┴─────────┴────────┘
```
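As a simpler starting point than the JOIN above, a single-table aggregation can be run the same way; the column names are taken from the `CREATE TABLE` statements earlier on this page:
```bash
clickhouse-client --query "
SELECT SearchPhrase, count() AS c
FROM datasets.hits_v1
WHERE SearchPhrase != ''
GROUP BY SearchPhrase
ORDER BY c DESC
LIMIT 10
FORMAT PrettyCompact"
```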
## Next Steps
[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) uses the hits dataset to discuss the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
Additional examples of queries to these tables can be found among the ClickHouse [stateful tests](https://github.com/ClickHouse/ClickHouse/blob/d7129855757f38ceec3e4ecc6dafacdabe9b178f/tests/queries/1_stateful/00172_parallel_join.sql).
:::note
The test suite uses a database named `test`, and its tables are named `hits` and `visits`. You can rename your database and tables, or edit the SQL from the test file.
:::

View File

@ -47,6 +47,8 @@ ClickHouse Inc does **not** maintain the libraries listed below and hasnt don
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -41,6 +41,8 @@ sidebar_label: "Клиентские библиотеки от сторонни
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -3,23 +3,21 @@ sidebar_position: 66
sidebar_label: ClickHouse Keeper
---
# [pre-production] ClickHouse Keeper {#clickHouse-keeper}
# ClickHouse Keeper {#clickHouse-keeper}
The ClickHouse server uses the [ZooKeeper](https://zookeeper.apache.org/) coordination service for data [replication](../engines/table-engines/mergetree-family/replication.md) and for executing [distributed DDL queries](../sql-reference/distributed-ddl.md). ClickHouse Keeper is an alternative coordination service compatible with ZooKeeper.
:::danger "Warning"
ClickHouse Keeper is in the pre-production stage and is being tested in the ClickHouse CI and on several internal installations.
## Implementation Details {#implementation-details}
ZooKeeper is one of the first well-known open-source coordination services. It is implemented in Java and has a fairly simple yet powerful data model. The ZooKeeper coordination algorithm is called ZAB (ZooKeeper Atomic Broadcast). It does not guarantee linearizability of reads, because every ZooKeeper node serves reads locally. Unlike ZooKeeper, ClickHouse Keeper is written in C++ and uses the [RAFT](https://raft.github.io/) algorithm ([implementation](https://github.com/eBay/NuRaft)). This algorithm provides linearizability of reads and writes and has several open-source implementations in different languages.
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, sequentially consistent reads). It has a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format that is incompatible with ZooKeeper, but ZooKeeper data can be converted into a ClickHouse Keeper snapshot with `clickhouse-keeper-converter`. The ClickHouse Keeper inter-server protocol is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
By default, ClickHouse Keeper provides the same guarantees as ZooKeeper (linearizable writes, non-linearizable reads). ClickHouse Keeper provides a compatible client-server protocol, so any standard ZooKeeper client can be used to interact with ClickHouse Keeper. Snapshots and logs have a format that is incompatible with ZooKeeper, but ZooKeeper data can be converted into a ClickHouse Keeper snapshot with `clickhouse-keeper-converter`. The ClickHouse Keeper inter-server protocol is also incompatible with ZooKeeper, so a mixed ZooKeeper / ClickHouse Keeper cluster is impossible.
The ClickHouse Keeper access control (ACL) system is implemented in the same way as in [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper supports the same set of permissions and the identical schemes: `world`, `auth`, `digest`, `host` and `ip`. The digest authentication scheme uses a `username:password` pair. The password is encoded in Base64.
The ClickHouse Keeper access control (ACL) system is implemented in the same way as in [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper supports the same set of permissions and the identical schemes: `world`, `auth`, `digest`. The digest authentication scheme uses a `username:password` pair. The password is encoded in Base64.
:::info "Note"
:::note
External integrations are not supported.
:::
## Configuration {#configuration}
@ -27,34 +25,36 @@ ClickHouse Keeper can be used as an equivalent
- `tcp_port` — the port for client connections (ZooKeeper default: `2181`).
- `tcp_port_secure` — the secure port for SSL connections between a client and the service.
- `server_id` — a unique server identifier; every cluster member must have a unique number (1, 2, 3, etc.).
- `log_storage_path` — the path to the coordination logs; as with ZooKeeper, it is best to keep them on an otherwise unused device.
- `server_id` — a unique server identifier; every cluster member must have a unique number (1, 2, 3, etc.).
- `log_storage_path` — the path to the coordination logs; as with ZooKeeper, it is best to keep them on a device that is not under load.
- `snapshot_storage_path` — the path to the coordination snapshots.
Other common parameters are inherited from the ClickHouse server configuration (`listen_host`, `logger`, etc.).
Internal coordination settings are located in `<keeper_server>.<coordination_settings>`:
- `operation_timeout_ms` — the maximum timeout for a single client operation, in milliseconds (default: 10000).
- `session_timeout_ms` — the maximum timeout for a client session, in milliseconds (default: 30000).
- `dead_session_check_period_ms` — how often ClickHouse Keeper checks for dead sessions and removes them, in milliseconds (default: 500).
- `heart_beat_interval_ms` — how often the ClickHouse Keeper leader sends heartbeats to follower nodes, in milliseconds (default: 500).
- `election_timeout_lower_bound_ms` — the time after which a follower may initiate a leader election if it has not received a heartbeat from the leader (default: 1000).
- `election_timeout_upper_bound_ms` — the time after which a follower must initiate a leader election if it has not received a heartbeat from the leader (default: 2000).
- `rotate_log_storage_interval` — how many coordination log records to store in a single file (default: 100000).
- `reserved_log_items` — the minimum number of coordination log records to keep after taking a snapshot (default: 100000).
- `snapshot_distance` — how often ClickHouse Keeper creates new snapshots (in the number of log records), in milliseconds (default: 100000).
- `snapshots_to_keep` — how many snapshots to keep (default: 3).
- `stale_log_gap` — the threshold after which the leader considers a follower stale and sends it a snapshot instead of logs (default: 10000).
- `fresh_log_gap` — the maximum lag behind the leader, in log records, at which a follower still considers itself caught up (default: 200).
- `max_requests_batch_size` — how many write requests are batched into one before being sent through RAFT (default: 100).
- `force_sync` — call `fsync` on every write to the coordination log (default: true).
- `quorum_reads` — execute read requests like write requests, through the full RAFT consensus, at the cost of performance and log size (default: false).
- `raft_logs_level` — the logging level for messages written to the text log (trace, debug, etc.) (default: information).
- `auto_forwarding` — allow forwarding write requests from followers to the leader (default: true).
- `shutdown_timeout` — how long to wait for internal connections to finish and for shutdown, in milliseconds (default: 5000).
- `dead_session_check_period_ms` — how often ClickHouse Keeper checks for dead sessions and removes them, in milliseconds (default: 500).
- `election_timeout_lower_bound_ms` — the time after which a follower may initiate a leader re-election if it has not received a heartbeat from the leader (default: 1000).
- `election_timeout_upper_bound_ms` — the time after which a follower must initiate a leader re-election if it has not received a heartbeat from the leader (default: 2000).
- `force_sync` — call `fsync` on every write to the coordination log (default: true).
- `four_letter_word_white_list` — the list of allowed four-letter commands (default: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro").
- `fresh_log_gap` — the minimum lag behind the leader, in log records, below which a follower considers itself up to date (default: 200).
- `heart_beat_interval_ms` — how often the ClickHouse Keeper leader sends heartbeats to follower nodes, in milliseconds (default: 500).
- `max_requests_batch_size` — how many write requests are batched into one before being sent through RAFT (default: 100).
- `min_session_timeout_ms` — the minimum timeout for a client session, in milliseconds (default: 10000).
- `operation_timeout_ms` — the maximum timeout for a single client operation, in milliseconds (default: 10000).
- `quorum_reads` — execute read requests like write requests, through the RAFT consensus (default: false).
- `raft_logs_level` — the logging level for messages written to the text log (trace, debug, etc.) (default: default).
- `reserved_log_items` — the minimum number of coordination log records to keep after taking a snapshot (default: 100000).
- `rotate_log_storage_interval` — how many coordination log records to store in a single file (default: 100000).
- `session_timeout_ms` — the maximum timeout for a client session, in milliseconds (default: 30000).
- `shutdown_timeout` — how long to wait for internal connections to finish during shutdown, in milliseconds (default: 5000).
- `snapshot_distance` — how often ClickHouse Keeper creates new snapshots, in the number of log records (default: 100000).
- `snapshots_to_keep` — how many snapshots to keep (default: 3).
- `stale_log_gap` — the threshold after which the leader considers a follower lagging behind and sends it a snapshot instead of logs (default: 10000).
- `startup_timeout` — how long before the server shuts down if it does not connect to the other quorum members, in milliseconds (default: 30000).
- `four_letter_word_allow_list` — the list of allowed four-letter commands (default: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro"); see the example after this list.
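Four-letter commands from the allow list above are sent over the plain client port; a minimal liveness check, assuming `nc` is available and the port configured in `tcp_port` (for example 9181) is reachable, could look like this:
```bash
# "ruok" is in the default allow list; a healthy server answers "imok"
echo ruok | nc localhost 9181
```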
The quorum configuration is located in `<keeper_server>.<raft_configuration>` and contains a description of the servers.
@ -67,6 +67,10 @@ ClickHouse Keeper can be used as an equivalent
- `port` — the port on which this server accepts connections for internal communication.
:::note
If the topology of a ClickHouse Keeper cluster changes (for example, a server is replaced), make sure you preserve the `server_id` to `hostname` mapping, do not reuse existing `server_id` values for new servers, and do not shuffle the identifiers. Such mistakes can happen if you use automation to deploy the cluster without logic for preserving the identifiers.
:::
Examples of a quorum configuration with three nodes can be found in the [integration tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) with the `test_keeper_` prefix. An example configuration for server #1:
```xml
@ -314,4 +318,31 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --
4. Copy the snapshot to the ClickHouse server nodes that have a configured `keeper`, or start ClickHouse Keeper instead of ZooKeeper. The snapshot must be present on all nodes; otherwise, empty nodes can win the leader election and the converted data may be discarded on startup.
## Recovering After Losing Quorum
Because ClickHouse Keeper is based on the Raft protocol, it can stay operational when a certain number of nodes fail, depending on the cluster size.
For example, for a 3-node cluster the quorum algorithm keeps working if at most one node fails.
The cluster configuration can be changed dynamically, with some limitations.
Reconfiguration also relies on Raft, so adding a new node to the cluster or removing an old one requires reaching a quorum within the current cluster configuration.
If more nodes have failed in your cluster than Raft allows for your current configuration and you have no way to bring them back, Raft stops working and does not let you change the configuration through the standard mechanism.
Nevertheless, ClickHouse Keeper can be started in recovery mode, which lets you reconfigure the cluster using only a single cluster node.
This mechanism should only be used as a last resort, when you cannot bring the existing cluster nodes back or start a new server with the same identifier.
Important:
- Make sure the failed nodes cannot connect to the cluster again in the future.
- Do not start any new nodes until you have completed the procedure below.
Once the above is ensured, perform the following steps.
1. Pick one Keeper node that will become the new leader. Keep in mind that the data from this node will be used by the whole cluster, so it is recommended to pick the node with the most up-to-date state.
2. Before doing anything else, back up the data from the `log_storage_path` and `snapshot_storage_path` directories.
3. Change the configuration on all cluster nodes you intend to use.
4. Send the `rcvr` command to the node you picked, or stop it and start it again with the `--force-recovery` argument. This switches the node into recovery mode.
5. Start the remaining cluster nodes one by one and, before starting the next node, verify that the `mntr` command returns `follower` in the `zk_server_state` field.
6. While the node is in recovery mode, the leader returns an error for the `mntr` command until the quorum is reached with the new nodes. Any requests from clients and followers also return an error.
7. Once the quorum is reached, the leader switches to normal operation and handles all requests through Raft. Verify that the `mntr` command returns `leader` in the `zk_server_state` field; a sketch of such a check follows below.
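A rough way to check the server role during and after this procedure, again assuming `nc` is installed and the client port configured in `tcp_port` (for example 9181) is reachable:
```bash
# The node chosen in step 1 should eventually report itself as the leader
echo mntr | nc localhost 9181 | grep zk_server_state
```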
[Original article](https://clickhouse.com/docs/en/operations/clickhouse-keeper/) <!--hide-->

View File

@ -45,7 +45,7 @@ CHECK TABLE test_table;
└───────────┴───────────┴─────────┘
```
If `check_query_single_value_result` = 0, the `CHECK TABLE` query returns the overall table status.
If `check_query_single_value_result` = 1, the `CHECK TABLE` query returns the overall table status.
```sql
SET check_query_single_value_result = 1;

View File

@ -41,6 +41,10 @@ Yandex does **not** maintain the libraries listed below, nor has it done any extensive testing
- Ruby
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)
- [RClickHouse](https://github.com/IMSMWU/RClickHouse)

View File

@ -5,7 +5,7 @@
#include <sys/stat.h>
#include <pwd.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <syscall.h>
#include <linux/capability.h>
#endif
@ -789,7 +789,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
* then attempt to run this file will end up with a cryptic "Operation not permitted" message.
*/
#if defined(__linux__)
#if defined(OS_LINUX)
fmt::print("Setting capabilities for clickhouse binary. This is optional.\n");
std::string command = fmt::format("command -v setcap >/dev/null"
" && command -v capsh >/dev/null"

View File

@ -2,7 +2,7 @@
#include <csetjmp>
#include <unistd.h>
#ifdef __linux__
#ifdef OS_LINUX
#include <sys/mman.h>
#endif
@ -339,7 +339,7 @@ struct Checker
checkRequiredInstructions();
}
} checker
#ifndef __APPLE__
#ifndef OS_DARWIN
__attribute__((init_priority(101))) /// Run before other static initializers.
#endif
;

View File

@ -22,8 +22,8 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
extern const int NUMBER_OF_DIMENSIONS_MISMATHED;
extern const int NOT_IMPLEMENTED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
}
namespace
@ -179,7 +179,7 @@ ColumnObject::Subcolumn::Subcolumn(
{
}
size_t ColumnObject::Subcolumn::Subcolumn::size() const
size_t ColumnObject::Subcolumn::size() const
{
size_t res = num_of_defaults_in_prefix;
for (const auto & part : data)
@ -187,7 +187,7 @@ size_t ColumnObject::Subcolumn::Subcolumn::size() const
return res;
}
size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const
size_t ColumnObject::Subcolumn::byteSize() const
{
size_t res = 0;
for (const auto & part : data)
@ -195,7 +195,7 @@ size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const
return res;
}
size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const
size_t ColumnObject::Subcolumn::allocatedBytes() const
{
size_t res = 0;
for (const auto & part : data)
@ -203,6 +203,37 @@ size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const
return res;
}
void ColumnObject::Subcolumn::get(size_t n, Field & res) const
{
if (isFinalized())
{
getFinalizedColumn().get(n, res);
return;
}
size_t ind = n;
if (ind < num_of_defaults_in_prefix)
{
res = least_common_type.get()->getDefault();
return;
}
ind -= num_of_defaults_in_prefix;
for (const auto & part : data)
{
if (ind < part->size())
{
part->get(ind, res);
res = convertFieldToTypeOrThrow(res, *least_common_type.get());
return;
}
ind -= part->size();
}
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for getting field is out of range", n);
}
void ColumnObject::Subcolumn::checkTypes() const
{
DataTypes prefix_types;
@ -221,7 +252,7 @@ void ColumnObject::Subcolumn::checkTypes() const
void ColumnObject::Subcolumn::insert(Field field)
{
auto info = getFieldInfo(field);
auto info = DB::getFieldInfo(field);
insert(std::move(field), std::move(info));
}
@ -244,8 +275,8 @@ static bool isConversionRequiredBetweenIntegers(const IDataType & lhs, const IDa
bool is_native_int = which_lhs.isNativeInt() && which_rhs.isNativeInt();
bool is_native_uint = which_lhs.isNativeUInt() && which_rhs.isNativeUInt();
return (is_native_int || is_native_uint)
&& lhs.getSizeOfValueInMemory() <= rhs.getSizeOfValueInMemory();
return (!is_native_int && !is_native_uint)
|| lhs.getSizeOfValueInMemory() > rhs.getSizeOfValueInMemory();
}
void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
@ -288,7 +319,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
}
else if (!least_common_base_type->equals(*base_type) && !isNothing(base_type))
{
if (!isConversionRequiredBetweenIntegers(*base_type, *least_common_base_type))
if (isConversionRequiredBetweenIntegers(*base_type, *least_common_base_type))
{
base_type = getLeastSupertype(DataTypes{std::move(base_type), least_common_base_type}, true);
type_changed = true;
@ -305,35 +336,96 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
{
assert(src.isFinalized());
const auto & src_column = src.data.back();
const auto & src_type = src.least_common_type.get();
assert(start + length <= src.size());
size_t end = start + length;
if (data.empty())
{
addNewColumnPart(src.least_common_type.get());
data.back()->insertRangeFrom(*src_column, start, length);
addNewColumnPart(src.getLeastCommonType());
}
else if (least_common_type.get()->equals(*src_type))
else if (!least_common_type.get()->equals(*src.getLeastCommonType()))
{
data.back()->insertRangeFrom(*src_column, start, length);
}
else
{
auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type.get(), src_type}, true);
auto casted_column = castColumn({src_column, src_type, ""}, new_least_common_type);
if (!least_common_type.get()->equals(*new_least_common_type))
auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type.get(), src.getLeastCommonType()}, true);
if (!new_least_common_type->equals(*least_common_type.get()))
addNewColumnPart(std::move(new_least_common_type));
}
data.back()->insertRangeFrom(*casted_column, start, length);
if (end <= src.num_of_defaults_in_prefix)
{
data.back()->insertManyDefaults(length);
return;
}
if (start < src.num_of_defaults_in_prefix)
data.back()->insertManyDefaults(src.num_of_defaults_in_prefix - start);
auto insert_from_part = [&](const auto & column, size_t from, size_t n)
{
assert(from + n <= column->size());
auto column_type = getDataTypeByColumn(*column);
if (column_type->equals(*least_common_type.get()))
{
data.back()->insertRangeFrom(*column, from, n);
return;
}
/// If we need to insert large range, there is no sense to cut part of column and cast it.
/// Casting of all column and inserting from it can be faster.
/// Threshold is just a guess.
if (n * 3 >= column->size())
{
auto casted_column = castColumn({column, column_type, ""}, least_common_type.get());
data.back()->insertRangeFrom(*casted_column, from, n);
return;
}
auto casted_column = column->cut(from, n);
casted_column = castColumn({casted_column, column_type, ""}, least_common_type.get());
data.back()->insertRangeFrom(*casted_column, 0, n);
};
size_t pos = 0;
size_t processed_rows = src.num_of_defaults_in_prefix;
/// Find the first part of the column that intersects the range.
while (pos < src.data.size() && processed_rows + src.data[pos]->size() < start)
{
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the first part of column.
if (pos < src.data.size() && processed_rows < start)
{
size_t part_start = start - processed_rows;
size_t part_length = std::min(src.data[pos]->size() - part_start, end - start);
insert_from_part(src.data[pos], part_start, part_length);
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the parts of column in the middle of range.
while (pos < src.data.size() && processed_rows + src.data[pos]->size() < end)
{
insert_from_part(src.data[pos], 0, src.data[pos]->size());
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the last part of column if needed.
if (pos < src.data.size() && processed_rows < end)
{
size_t part_end = end - processed_rows;
insert_from_part(src.data[pos], 0, part_end);
}
}
bool ColumnObject::Subcolumn::isFinalized() const
{
return data.empty() ||
(data.size() == 1 && !data[0]->isSparse() && num_of_defaults_in_prefix == 0);
return num_of_defaults_in_prefix == 0 &&
(data.empty() || (data.size() == 1 && !data[0]->isSparse()));
}
void ColumnObject::Subcolumn::finalize()
@ -432,6 +524,13 @@ void ColumnObject::Subcolumn::popBack(size_t n)
num_of_defaults_in_prefix -= n;
}
ColumnObject::Subcolumn ColumnObject::Subcolumn::cut(size_t start, size_t length) const
{
Subcolumn new_subcolumn(0, is_nullable);
new_subcolumn.insertRangeFrom(*this, start, length);
return new_subcolumn;
}
Field ColumnObject::Subcolumn::getLastField() const
{
if (data.empty())
@ -442,6 +541,18 @@ Field ColumnObject::Subcolumn::getLastField() const
return (*last_part)[last_part->size() - 1];
}
FieldInfo ColumnObject::Subcolumn::getFieldInfo() const
{
const auto & base_type = least_common_type.getBase();
return FieldInfo
{
.scalar_type = base_type,
.have_nulls = base_type->isNullable(),
.need_convert = false,
.num_dimensions = least_common_type.getNumberOfDimensions(),
};
}
ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const
{
auto scalar_type = field_info.scalar_type;
@ -479,6 +590,13 @@ const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const
return data[0];
}
ColumnObject::Subcolumn::LeastCommonType::LeastCommonType()
: type(std::make_shared<DataTypeNothing>())
, base_type(type)
, num_dimensions(0)
{
}
ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
: type(std::move(type_))
, base_type(getBaseTypeOfArray(type))
@ -525,16 +643,6 @@ size_t ColumnObject::size() const
return num_rows;
}
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
/// cloneResized with new_size == 0 is used for cloneEmpty().
if (new_size != 0)
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"ColumnObject doesn't support resize to non-zero length");
return ColumnObject::create(is_nullable);
}
size_t ColumnObject::byteSize() const
{
size_t res = 0;
@ -553,23 +661,21 @@ size_t ColumnObject::allocatedBytes() const
void ColumnObject::forEachSubcolumn(ColumnCallback callback)
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot iterate over non-finalized ColumnObject");
for (auto & entry : subcolumns)
callback(entry->data.data.back());
for (auto & part : entry->data.data)
callback(part);
}
void ColumnObject::insert(const Field & field)
{
const auto & object = field.get<const Object &>();
HashSet<StringRef, StringRefHash> inserted;
HashSet<StringRef, StringRefHash> inserted_paths;
size_t old_size = size();
for (const auto & [key_str, value] : object)
{
PathInData key(key_str);
inserted.insert(key_str);
inserted_paths.insert(key_str);
if (!hasSubcolumn(key))
addSubcolumn(key, old_size);
@ -578,8 +684,14 @@ void ColumnObject::insert(const Field & field)
}
for (auto & entry : subcolumns)
if (!inserted.has(entry->path.getPath()))
entry->data.insertDefault();
{
if (!inserted_paths.has(entry->path.getPath()))
{
bool inserted = tryInsertDefaultFromNested(entry);
if (!inserted)
entry->data.insertDefault();
}
}
++num_rows;
}
@ -594,26 +706,21 @@ void ColumnObject::insertDefault()
Field ColumnObject::operator[](size_t n) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
Object object;
for (const auto & entry : subcolumns)
object[entry->path.getPath()] = (*entry->data.data.back())[n];
Field object;
get(n, object);
return object;
}
void ColumnObject::get(size_t n, Field & res) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
assert(n < size());
res = Object();
auto & object = res.get<Object &>();
for (const auto & entry : subcolumns)
{
auto it = object.try_emplace(entry->path.getPath()).first;
entry->data.data.back()->get(n, it->second);
entry->data.get(n, it->second);
}
}
@ -626,41 +733,28 @@ void ColumnObject::insertFrom(const IColumn & src, size_t n)
void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
const auto & src_object = assert_cast<const ColumnObject &>(src);
if (!src_object.isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insertRangeFrom non-finalized ColumnObject");
for (auto & entry : subcolumns)
{
if (src_object.hasSubcolumn(entry->path))
entry->data.insertRangeFrom(src_object.getSubcolumn(entry->path), start, length);
else
entry->data.insertManyDefaults(length);
}
for (const auto & entry : src_object.subcolumns)
{
if (!hasSubcolumn(entry->path))
{
if (entry->path.hasNested())
{
const auto & base_type = entry->data.getLeastCommonTypeBase();
FieldInfo field_info
{
.scalar_type = base_type,
.have_nulls = base_type->isNullable(),
.need_convert = false,
.num_dimensions = entry->data.getNumberOfDimensions(),
};
addNestedSubcolumn(entry->path, field_info, num_rows);
}
addNestedSubcolumn(entry->path, entry->data.getFieldInfo(), num_rows);
else
{
addSubcolumn(entry->path, num_rows);
}
}
auto & subcolumn = getSubcolumn(entry->path);
subcolumn.insertRangeFrom(entry->data, start, length);
auto & subcolumn = getSubcolumn(entry->path);
subcolumn.insertRangeFrom(entry->data, start, length);
}
for (auto & entry : subcolumns)
{
if (!src_object.hasSubcolumn(entry->path))
{
bool inserted = tryInsertManyDefaultsFromNested(entry);
if (!inserted)
entry->data.insertManyDefaults(length);
}
}
@ -668,21 +762,6 @@ void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t len
finalize();
}
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replicate non-finalized ColumnObject");
auto res_column = ColumnObject::create(is_nullable);
for (const auto & entry : subcolumns)
{
auto replicated_data = entry->data.data.back()->replicate(offsets)->assumeMutable();
res_column->addSubcolumn(entry->path, std::move(replicated_data));
}
return res_column;
}
void ColumnObject::popBack(size_t length)
{
for (auto & entry : subcolumns)
@ -692,10 +771,15 @@ void ColumnObject::popBack(size_t length)
}
template <typename Func>
ColumnPtr ColumnObject::applyForSubcolumns(Func && func, std::string_view func_name) const
MutableColumnPtr ColumnObject::applyForSubcolumns(Func && func) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot {} non-finalized ColumnObject", func_name);
{
auto finalized = IColumn::mutate(getPtr());
auto & finalized_object = assert_cast<ColumnObject &>(*finalized);
finalized_object.finalize();
return finalized_object.applyForSubcolumns(std::forward<Func>(func));
}
auto res = ColumnObject::create(is_nullable);
for (const auto & subcolumn : subcolumns)
@ -703,22 +787,36 @@ ColumnPtr ColumnObject::applyForSubcolumns(Func && func, std::string_view func_n
auto new_subcolumn = func(subcolumn->data.getFinalizedColumn());
res->addSubcolumn(subcolumn->path, new_subcolumn->assumeMutable());
}
return res;
}
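The pattern above — clone the column, finalize the clone, then recurse — spares callers from having to finalize before every operation. A minimal generic sketch of that idea with toy types (not the real interface):

```cpp
#include <utility>
#include <vector>

/// Toy column made of several parts; "finalized" means it has at most one part.
struct ToyColumn
{
    std::vector<std::vector<int>> parts;

    bool isFinalized() const { return parts.size() <= 1; }

    void finalize()
    {
        if (isFinalized())
            return;
        std::vector<int> merged;
        for (const auto & part : parts)
            merged.insert(merged.end(), part.begin(), part.end());
        parts = {std::move(merged)};
    }

    template <typename Func>
    ToyColumn apply(Func && func) const
    {
        if (!isFinalized())
        {
            ToyColumn copy = *this;   /// stand-in for IColumn::mutate(getPtr())
            copy.finalize();
            return copy.apply(std::forward<Func>(func));
        }
        ToyColumn res = *this;
        func(res);
        return res;
    }
};
```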
ColumnPtr ColumnObject::permute(const Permutation & perm, size_t limit) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.permute(perm, limit); }, "permute");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.permute(perm, limit); });
}
ColumnPtr ColumnObject::filter(const Filter & filter, ssize_t result_size_hint) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.filter(filter, result_size_hint); }, "filter");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.filter(filter, result_size_hint); });
}
ColumnPtr ColumnObject::index(const IColumn & indexes, size_t limit) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.index(indexes, limit); }, "index");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.index(indexes, limit); });
}
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.replicate(offsets); });
}
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
if (new_size == 0)
return ColumnObject::create(is_nullable);
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.cloneResized(new_size); });
}
const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const
@ -810,6 +908,92 @@ void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo &
if (num_rows == 0)
num_rows = new_size;
else if (new_size != num_rows)
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
"Required size of subcolumn {} ({}) is inconsistent with column size ({})",
key.getPath(), new_size, num_rows);
}
const ColumnObject::Subcolumns::Node * ColumnObject::getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const
{
if (!entry->path.hasNested())
return nullptr;
size_t old_size = entry->data.size();
const auto * current_node = subcolumns.findLeaf(entry->path);
const Subcolumns::Node * leaf = nullptr;
while (current_node)
{
/// Try to find the first Nested type ancestor of the current node.
const auto * node_nested = subcolumns.findParent(current_node,
[](const auto & candidate) { return candidate.isNested(); });
if (!node_nested)
break;
/// Find the leaf whose subcolumn contains values
/// for the last rows.
/// If there is no such leaf, skip the current Nested node and
/// continue the search from its parent.
leaf = subcolumns.findLeaf(node_nested,
[&](const auto & candidate)
{
return candidate.data.size() > old_size;
});
if (leaf)
break;
current_node = node_nested->parent;
}
if (leaf && isNothing(leaf->data.getLeastCommonTypeBase()))
return nullptr;
return leaf;
}
bool ColumnObject::tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const
{
const auto * leaf = getLeafOfTheSameNested(entry);
if (!leaf)
return false;
size_t old_size = entry->data.size();
auto field_info = entry->data.getFieldInfo();
/// Cut the needed range from the found leaf
/// and replace its scalar values with the correct
/// default values for the given entry.
auto new_subcolumn = leaf->data
.cut(old_size, leaf->data.size() - old_size)
.recreateWithDefaultValues(field_info);
entry->data.insertRangeFrom(new_subcolumn, 0, new_subcolumn.size());
return true;
}
bool ColumnObject::tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const
{
const auto * leaf = getLeafOfTheSameNested(entry);
if (!leaf)
return false;
auto last_field = leaf->data.getLastField();
if (last_field.isNull())
return false;
size_t leaf_num_dimensions = leaf->data.getNumberOfDimensions();
size_t entry_num_dimensions = entry->data.getNumberOfDimensions();
auto default_scalar = entry_num_dimensions > leaf_num_dimensions
? createEmptyArrayField(entry_num_dimensions - leaf_num_dimensions)
: entry->data.getLeastCommonTypeBase()->getDefault();
auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, leaf_num_dimensions), last_field);
entry->data.insert(std::move(default_field));
return true;
}
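tryInsertDefaultFromNested() builds a default value that mirrors the array sizes of the sibling Nested leaf. As an illustration of the scalar-replacement step (the role played by FieldVisitorReplaceScalars here), a self-contained sketch with a toy value type:

```cpp
#include <vector>

/// Toy stand-in for Field: a node is either a scalar or an array of nodes.
struct Node
{
    bool is_scalar = true;
    int scalar = 0;               /// used when is_scalar
    std::vector<Node> children;   /// used when !is_scalar
};

/// Replace every scalar with `def` while keeping all array sizes intact,
/// so e.g. [[1, 2], [3]] becomes [[def, def], [def]].
Node replaceScalars(const Node & src, int def)
{
    if (src.is_scalar)
        return Node{true, def, {}};

    Node res{false, 0, {}};
    res.children.reserve(src.children.size());
    for (const auto & child : src.children)
        res.children.push_back(replaceScalars(child, def));
    return res;
}
```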
PathsInData ColumnObject::getKeys() const
@ -835,7 +1019,7 @@ void ColumnObject::finalize()
{
const auto & least_common_type = entry->data.getLeastCommonType();
/// Do not add subcolumns, which consists only from NULLs.
/// Do not add subcolumns that consist only of NULLs.
if (isNothing(getBaseTypeOfArray(least_common_type)))
continue;

View File

@ -65,6 +65,7 @@ public:
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
@ -84,6 +85,8 @@ public:
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
@ -91,6 +94,8 @@ public:
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
@ -101,13 +106,16 @@ public:
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObject;
private:
class LeastCommonType
{
public:
LeastCommonType() = default;
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
@ -175,6 +183,11 @@ public:
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array of default values with sizes consistent with that Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
@ -189,7 +202,6 @@ public:
TypeIndex getDataType() const override { return TypeIndex::Object; }
size_t size() const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(ColumnCallback callback) override;
@ -197,13 +209,14 @@ public:
void insertDefault() override;
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr replicate(const Offsets & offsets) const override;
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
/// All other methods throw exception.
@ -236,7 +249,11 @@ private:
}
template <typename Func>
ColumnPtr applyForSubcolumns(Func && func, std::string_view func_name) const;
MutableColumnPtr applyForSubcolumns(Func && func) const;
/// For a given subcolumn, returns a subcolumn from the same Nested type.
/// It's used to get the shared sizes of the Nested type to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
};
}

View File

@ -0,0 +1,120 @@
#include <Common/FieldVisitorsAccurateComparison.h>
#include <DataTypes/getLeastSupertype.h>
#include <Interpreters/castColumn.h>
#include <Interpreters/convertFieldToType.h>
#include <Columns/ColumnObject.h>
#include <Common/FieldVisitorToString.h>
#include <Common/randomSeed.h>
#include <fmt/core.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>
#include <random>
using namespace DB;
static pcg64 rng(randomSeed());
Field getRandomField(size_t type)
{
switch (type)
{
case 0:
return rng();
case 1:
return std::uniform_real_distribution<>(0.0, 1.0)(rng);
case 2:
return std::string(rng() % 10, 'a' + rng() % 26);
default:
return Field();
}
}
std::pair<ColumnObject::Subcolumn, std::vector<Field>> generate(size_t size)
{
bool has_defaults = rng() % 3 == 0;
size_t num_defaults = has_defaults ? rng() % size : 0;
ColumnObject::Subcolumn subcolumn(num_defaults, false);
std::vector<Field> fields;
while (subcolumn.size() < size)
{
size_t part_size = rng() % (size - subcolumn.size()) + 1;
size_t field_type = rng() % 3;
for (size_t i = 0; i < part_size; ++i)
{
fields.push_back(getRandomField(field_type));
subcolumn.insert(fields.back());
}
}
std::vector<Field> result_fields;
for (size_t i = 0; i < num_defaults; ++i)
result_fields.emplace_back();
result_fields.insert(result_fields.end(), fields.begin(), fields.end());
return {std::move(subcolumn), std::move(result_fields)};
}
void checkFieldsAreEqual(ColumnObject::Subcolumn subcolumn, const std::vector<Field> & fields)
{
ASSERT_EQ(subcolumn.size(), fields.size());
for (size_t i = 0; i < subcolumn.size(); ++i)
{
Field field;
subcolumn.get(i, field); // Also check 'get' method.
if (!applyVisitor(FieldVisitorAccurateEquals(), field, fields[i]))
{
std::cerr << fmt::format("Wrong value at position {}, expected {}, got {}",
i, applyVisitor(FieldVisitorToString(), fields[i]), applyVisitor(FieldVisitorToString(), field));
ASSERT_TRUE(false);
}
}
}
constexpr size_t T = 1000;
constexpr size_t N = 1000;
TEST(ColumnObject, InsertRangeFrom)
{
for (size_t t = 0; t < T; ++t)
{
auto [subcolumn_dst, fields_dst] = generate(N);
auto [subcolumn_src, fields_src] = generate(N);
ASSERT_EQ(subcolumn_dst.size(), fields_dst.size());
ASSERT_EQ(subcolumn_src.size(), fields_src.size());
const auto & type_dst = subcolumn_dst.getLeastCommonType();
const auto & type_src = subcolumn_src.getLeastCommonType();
auto type_res = getLeastSupertype(DataTypes{type_dst, type_src}, true);
size_t from = rng() % subcolumn_src.size();
size_t to = rng() % subcolumn_src.size();
if (from > to)
std::swap(from, to);
++to;
for (auto & field : fields_dst)
{
if (field.isNull())
field = type_res->getDefault();
else
field = convertFieldToTypeOrThrow(field, *type_res);
}
for (size_t i = from; i < to; ++i)
{
if (fields_src[i].isNull())
fields_dst.push_back(type_res->getDefault());
else
fields_dst.push_back(convertFieldToTypeOrThrow(fields_src[i], *type_res));
}
subcolumn_dst.insertRangeFrom(subcolumn_src, from, to - from);
checkFieldsAreEqual(subcolumn_dst, fields_dst);
}
}

View File

@ -11,7 +11,7 @@
#include <Common/FieldVisitors.h>
using namespace DB;
pcg64 rng(randomSeed());
static pcg64 rng(randomSeed());
std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{

View File

@ -11,7 +11,7 @@
#include <pcg_random.hpp>
#include <Common/thread_local_rng.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD)
#include <malloc.h>
#endif
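Throughout this commit the raw compiler macros (__linux__, __APPLE__, __FreeBSD__) are replaced with project-wide OS_LINUX / OS_DARWIN / OS_FREEBSD spellings. As a hedged sketch only — the exact place and form of the real definitions in this repository may differ — such macros are typically derived once from the compiler-provided ones:

```cpp
/// Hypothetical central mapping from compiler-provided platform macros to one
/// project-wide spelling, so the rest of the code checks a single convention.
#if defined(__linux__)
#    define OS_LINUX 1
#elif defined(__APPLE__)
#    define OS_DARWIN 1
#elif defined(__FreeBSD__)
#    define OS_FREEBSD 1
#elif defined(__sun)
#    define OS_SUNOS 1
#endif
```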

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
/*
* Copyright 2012-present Facebook, Inc.

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
/*
* Copyright 2012-present Facebook, Inc.

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <Common/Elf.h>
#include <Common/Exception.h>

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <IO/MMapReadBufferFromFile.h>

View File

@ -218,7 +218,7 @@ static void getNoSpaceLeftInfoMessage(std::filesystem::path path, String & msg)
formatReadableQuantity(fs.f_favail),
mount_point);
#if defined(__linux__)
#if defined(OS_LINUX)
msg += "\nFilesystem: " + getFilesystemName(mount_point);
#endif
}
@ -230,7 +230,7 @@ static void getNoSpaceLeftInfoMessage(std::filesystem::path path, String & msg)
*/
static void getNotEnoughMemoryMessage(std::string & msg)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
static constexpr size_t buf_size = 1024;
@ -261,7 +261,7 @@ static void getNotEnoughMemoryMessage(std::string & msg)
}
}
if (num_maps > max_map_count * 0.99)
if (num_maps > max_map_count * 0.90)
{
msg += fmt::format(
"\nIt looks like that the process is near the limit on number of virtual memory mappings."

View File

@ -28,6 +28,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true);
allow_remove_persistent_cache_by_default = config.getUInt64(config_prefix + ".allow_remove_persistent_cache_by_default", true);

View File

@ -12,11 +12,14 @@ struct FileCacheSettings
size_t max_size = 0;
size_t max_elements = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS;
size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
bool cache_on_write_operations = false;
bool do_not_evict_index_and_mark_files = true;
bool allow_remove_persistent_cache_by_default = true;
size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;
bool enable_filesystem_query_cache_limit = false;
bool do_not_evict_index_and_mark_files = true;
bool allow_remove_persistent_cache_by_default = true;
void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
};

View File

@ -197,6 +197,7 @@ private:
Range segment_range;
State download_state;
String downloader_id;
RemoteFileReaderPtr remote_file_reader;

View File

@ -4,6 +4,7 @@
#include <Common/CurrentThread.h>
#include <Common/SipHash.h>
#include <Common/FileCacheSettings.h>
#include <IO/ReadSettings.h>
#include <filesystem>
namespace fs = std::filesystem;
@ -23,6 +24,7 @@ IFileCache::IFileCache(
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
{
}
@ -51,11 +53,14 @@ String IFileCache::getPathInLocalCache(const Key & key) const
return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str;
}
static bool isQueryInitialized()
{
return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && CurrentThread::getQueryId().size != 0;
}
bool IFileCache::isReadOnly()
{
return !CurrentThread::isInitialized()
|| !CurrentThread::get().getQueryContext()
|| CurrentThread::getQueryId().size == 0;
return !isQueryInitialized();
}
void IFileCache::assertInitialized() const
@ -64,4 +69,116 @@ void IFileCache::assertInitialized() const
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized");
}
IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock)
{
if (!isQueryInitialized())
return nullptr;
return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock);
}
IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard<std::mutex> &)
{
auto query_iter = query_map.find(query_id);
return (query_iter == query_map.end()) ? nullptr : query_iter->second;
}
void IFileCache::removeQueryContext(const String & query_id)
{
std::lock_guard cache_lock(mutex);
auto query_iter = query_map.find(query_id);
if (query_iter == query_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to release query context that does not exist");
query_map.erase(query_iter);
}
IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> & cache_lock)
{
if (query_id.empty())
return nullptr;
auto context = getQueryContext(query_id, cache_lock);
if (!context)
{
auto query_iter = query_map.insert({query_id, std::make_shared<QueryContext>(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache)}).first;
context = query_iter->second;
}
return context;
}
IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings)
{
std::lock_guard cache_lock(mutex);
/// If enable_filesystem_query_cache_limit is true and max_query_cache_size is greater than zero,
/// create a query context for the current query.
if (enable_filesystem_query_cache_limit && settings.max_query_cache_size)
{
auto context = getOrSetQueryContext(query_id, settings, cache_lock);
return QueryContextHolder(query_id, this, context);
}
else
return QueryContextHolder();
}
void IFileCache::QueryContext::remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size < size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
{
lru_queue.remove(record->second, cache_lock);
records.erase({key, offset});
}
}
cache_size -= size;
}
void IFileCache::QueryContext::reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size + size > max_cache_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "reserved cache size exceeds the remaining cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record == records.end())
{
auto queue_iter = lru_queue.add(key, offset, 0, cache_lock);
record = records.insert({{key, offset}, queue_iter}).first;
}
record->second->size += size;
}
cache_size += size;
}
void IFileCache::QueryContext::use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock)
{
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
lru_queue.moveToEnd(record->second, cache_lock);
}
}
IFileCache::QueryContextHolder::QueryContextHolder(const String & query_id_, IFileCache * cache_, IFileCache::QueryContextPtr context_)
: query_id(query_id_), cache(cache_), context(context_)
{
}
IFileCache::QueryContextHolder::~QueryContextHolder()
{
/// If only the query_map and the current holder still hold the query context,
/// the query has completed and the query context is released.
if (context && context.use_count() == 2)
cache->removeQueryContext(query_id);
}
}
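The use_count() == 2 check works because, at destruction time, exactly two owners can remain once the query is finished: the entry in query_map and the holder itself. A self-contained sketch of the same RAII pattern with generic names, assuming the real classes only by analogy:

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct Context { size_t cache_size = 0; };

struct Registry
{
    std::unordered_map<std::string, std::shared_ptr<Context>> map;

    struct Holder
    {
        Registry * registry = nullptr;
        std::string id;
        std::shared_ptr<Context> context;

        ~Holder()
        {
            /// Only the map and this holder still own the context: the query is done.
            if (context && context.use_count() == 2)
                registry->map.erase(id);
        }
    };

    Holder acquire(const std::string & id)
    {
        auto & ctx = map[id];
        if (!ctx)
            ctx = std::make_shared<Context>();

        Holder holder;
        holder.registry = this;
        holder.id = id;
        holder.context = ctx;
        return holder;
    }
};
```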

View File

@ -5,6 +5,7 @@
#include <boost/noncopyable.hpp>
#include <list>
#include <unordered_map>
#include <functional>
@ -15,6 +16,12 @@ class FileSegment;
using FileSegmentPtr = std::shared_ptr<FileSegment>;
using FileSegments = std::list<FileSegmentPtr>;
struct FileSegmentsHolder;
struct ReadSettings;
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
@ -129,6 +136,125 @@ protected:
virtual FileSegmentPtr setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent, std::lock_guard<std::mutex> & cache_lock) = 0;
void assertInitialized() const;
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalCacheSize(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
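A bare-bones version of the queue above, for orientation: a std::list of entries plus a running size total, with splice-to-end on access so promotion is O(1) and iterators stored elsewhere stay valid. Toy names, standard library only:

```cpp
#include <cstddef>
#include <list>

struct Entry { size_t size = 0; };

struct ToyLRUQueue
{
    std::list<Entry> queue;
    size_t total_size = 0;

    std::list<Entry>::iterator add(size_t size)
    {
        total_size += size;
        return queue.insert(queue.end(), Entry{size});
    }

    /// Move an entry to the most-recently-used end; the iterator remains valid.
    void moveToEnd(std::list<Entry>::iterator it) { queue.splice(queue.end(), queue, it); }

    void incrementSize(std::list<Entry>::iterator it, size_t increment)
    {
        total_size += increment;
        it->size += increment;
    }

    void remove(std::list<Entry>::iterator it)
    {
        total_size -= it->size;
        queue.erase(it);
    }
};
```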
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first.key) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
/// Used to track and control the cache accesses of each query.
/// It allows the cache layer to handle each query's accesses separately.
struct QueryContext
{
LRUQueue lru_queue;
AccessRecord records;
size_t cache_size = 0;
size_t max_cache_size;
bool skip_download_if_exceeds_query_cache;
QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_)
: max_cache_size(max_cache_size_)
, skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {}
void remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock);
size_t getMaxCacheSize() const { return max_cache_size; }
size_t getCacheSize() const { return cache_size; }
LRUQueue & queue() { return lru_queue; }
bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; }
};
using QueryContextPtr = std::shared_ptr<QueryContext>;
using QueryContextMap = std::unordered_map<String, QueryContextPtr>;
QueryContextMap query_map;
bool enable_filesystem_query_cache_limit;
public:
QueryContextPtr getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock);
QueryContextPtr getQueryContext(const String & query_id, std::lock_guard<std::mutex> & cache_lock);
void removeQueryContext(const String & query_id);
QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> &);
/// Holds the query context information, so that different cache policies
/// can be applied to different queries through the cache layer.
struct QueryContextHolder : private boost::noncopyable
{
explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_);
QueryContextHolder() = default;
~QueryContextHolder();
String query_id {};
IFileCache * cache = nullptr;
QueryContextPtr context = nullptr;
};
QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings);
};
using FileCachePtr = std::shared_ptr<IFileCache>;

View File

@ -16,7 +16,7 @@ namespace ErrorCodes
extern const int CANNOT_UNBLOCK_SIGNAL;
}
#ifdef __APPLE__
#ifdef OS_DARWIN
// We only need to support timeout = {0, 0} at this moment
static int sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec * /*timeout*/)
{

View File

@ -9,13 +9,13 @@ namespace ErrorCodes
extern const int SYNTAX_ERROR;
}
Int32 IntervalKind::toAvgSeconds() const
Float64 IntervalKind::toAvgSeconds() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond: return 0; /// fractional parts of seconds have 0 seconds
case IntervalKind::Nanosecond: return 0.000000001;
case IntervalKind::Microsecond: return 0.000001;
case IntervalKind::Millisecond: return 0.001;
case IntervalKind::Second: return 1;
case IntervalKind::Minute: return 60;
case IntervalKind::Hour: return 3600;
@ -28,6 +28,25 @@ Int32 IntervalKind::toAvgSeconds() const
__builtin_unreachable();
}
bool IntervalKind::isFixedLength() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond:
case IntervalKind::Second:
case IntervalKind::Minute:
case IntervalKind::Hour:
case IntervalKind::Day:
case IntervalKind::Week: return true;
case IntervalKind::Month:
case IntervalKind::Quarter:
case IntervalKind::Year: return false;
}
__builtin_unreachable();
}
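Switching toAvgSeconds() to Float64 keeps sub-second interval kinds from collapsing to zero when converted to an average length in seconds. A tiny standalone illustration of the difference, using a toy enum rather than the real IntervalKind:

```cpp
#include <cstdio>

enum class Kind { Millisecond, Second, Minute };

double toAvgSeconds(Kind kind)
{
    switch (kind)
    {
        case Kind::Millisecond: return 0.001;
        case Kind::Second: return 1;
        case Kind::Minute: return 60;
    }
    return 0;
}

int main()
{
    /// With an integer return type this would print 0; with a floating type it is 0.005.
    std::printf("%g\n", 5 * toAvgSeconds(Kind::Millisecond));
}
```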
IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds)
{
if (num_seconds)

View File

@ -31,12 +31,15 @@ struct IntervalKind
/// Returns number of seconds in one interval.
/// For `Month`, `Quarter` and `Year` the function returns an average number of seconds.
Int32 toAvgSeconds() const;
Float64 toAvgSeconds() const;
/// Chooses an interval kind based on number of seconds.
/// For example, `IntervalKind::fromAvgSeconds(3600)` returns `IntervalKind::Hour`.
static IntervalKind fromAvgSeconds(Int64 num_seconds);
/// Returns whether IntervalKind has a fixed number of seconds (e.g. Day) or a non-fixed one (e.g. Month).
bool isFixedLength() const;
/// Returns an uppercased version of what `toString()` returns.
const char * toKeyword() const;

View File

@ -458,8 +458,129 @@ FileSegmentPtr LRUFileCache::setDownloading(
return cell->file_segment;
}
bool LRUFileCache::tryReserve(
const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
auto query_context = enable_filesystem_query_cache_limit ? getCurrentQueryContext(cache_lock) : nullptr;
if (!query_context)
return tryReserveForMainList(key, offset, size, nullptr, cache_lock);
/// The query's maximum cache capacity has not been reached yet, so
/// evict from the main LRU queue via tryReserveForMainList().
else if (query_context->getCacheSize() + size <= query_context->getMaxCacheSize())
return tryReserveForMainList(key, offset, size, query_context, cache_lock);
/// When skip_download_if_exceeds_query_cache is true, there is no need
/// to evict old data: skip the cache and read directly from the remote fs.
else if (query_context->isSkipDownloadIfExceed())
return false;
/// The query has reached its maximum cache size, so evict entries
/// from the cache that were previously accessed by the current query.
else
{
size_t removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
auto * cell_for_reserve = getCell(key, offset, cache_lock);
std::vector<IFileCache::LRUQueue::Iterator> ghost;
std::vector<FileSegmentCell *> trash;
std::vector<FileSegmentCell *> to_evict;
auto is_overflow = [&]
{
return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size)
|| (max_element_size != 0 && queue_size > max_element_size)
|| (query_context->getCacheSize() + size - removed_size > query_context->getMaxCacheSize());
};
/// Select entries for eviction from the LRU queue held by the query.
for (auto iter = query_context->queue().begin(); iter != query_context->queue().end(); iter++)
{
if (!is_overflow())
break;
auto * cell = getCell(iter->key, iter->offset, cache_lock);
if (!cell)
{
/// The cache entry corresponding to this record may have been evicted by
/// other queries, so the record has become invalid.
ghost.push_back(iter);
removed_size += iter->size;
}
else
{
size_t cell_size = cell->size();
assert(iter->size == cell_size);
if (cell->releasable())
{
auto & file_segment = cell->file_segment;
std::lock_guard segment_lock(file_segment->mutex);
switch (file_segment->download_state)
{
case FileSegment::State::DOWNLOADED:
{
to_evict.push_back(cell);
break;
}
default:
{
trash.push_back(cell);
break;
}
}
removed_size += cell_size;
--queue_size;
}
}
}
auto remove_file_segment = [&](FileSegmentPtr file_segment, size_t file_segment_size)
{
query_context->remove(file_segment->key(), file_segment->offset(), file_segment_size, cache_lock);
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
};
assert(trash.empty());
for (auto & cell : trash)
{
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment, cell->size());
}
for (auto & iter : ghost)
query_context->remove(iter->key, iter->offset, iter->size, cache_lock);
if (is_overflow())
return false;
if (cell_for_reserve)
{
auto queue_iterator = cell_for_reserve->queue_iterator;
if (queue_iterator)
queue.incrementSize(*queue_iterator, size, cache_lock);
else
cell_for_reserve->queue_iterator = queue.add(key, offset, size, cache_lock);
}
for (auto & cell : to_evict)
{
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment, cell->size());
}
query_context->reserve(key, offset, size, cache_lock);
return true;
}
}
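Both reservation paths share the same shape: walk the LRU queue from the cold end, collect candidates until the request fits, bail out if it still cannot fit, then actually evict and account for the new entry. A compact standalone skeleton of that loop (it assumes `used` is at least the sum of all entry sizes, as a consistent cache would guarantee):

```cpp
#include <cstddef>
#include <list>

struct Item { size_t size = 0; };

/// Evict from the cold end of the LRU queue until `wanted` bytes fit under `capacity`.
bool reserve(std::list<Item> & lru, size_t & used, size_t capacity, size_t wanted)
{
    size_t freed = 0;
    auto fits = [&] { return used + wanted - freed <= capacity; };

    std::list<std::list<Item>::iterator> victims;
    for (auto it = lru.begin(); it != lru.end() && !fits(); ++it)
    {
        freed += it->size;
        victims.push_back(it);
    }

    if (!fits())
        return false;   /// even evicting everything would not be enough

    for (auto victim : victims)
    {
        used -= victim->size;
        lru.erase(victim);
    }

    used += wanted;
    return true;
}
```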
bool LRUFileCache::tryReserveForMainList(
const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard<std::mutex> & cache_lock)
{
auto removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
@ -477,7 +598,7 @@ bool LRUFileCache::tryReserve(
auto is_overflow = [&]
{
/// max_size == 0 means unlimited cache size, max_element_size means unlimited number of cache elements.
return (max_size != 0 && queue.getTotalWeight(cache_lock) + size - removed_size > max_size)
return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size)
|| (max_element_size != 0 && queue_size > max_element_size);
};
@ -530,18 +651,19 @@ bool LRUFileCache::tryReserve(
}
}
auto remove_file_segment = [&](FileSegmentPtr file_segment)
{
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
};
/// This case is very unlikely: it can only happen if file_segment->complete()
/// throws, which would be a logical error.
assert(trash.empty());
for (auto & cell : trash)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment);
}
if (is_overflow())
@ -562,17 +684,16 @@ bool LRUFileCache::tryReserve(
for (auto & cell : to_evict)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
std::lock_guard<std::mutex> segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment);
}
if (queue.getTotalWeight(cache_lock) > (1ull << 63))
if (queue.getTotalCacheSize(cache_lock) > (1ull << 63))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache became inconsistent. There must be a bug");
if (query_context)
query_context->reserve(key, offset, size, cache_lock);
return true;
}
@ -855,7 +976,6 @@ FileSegments LRUFileCache::getSnapshot() const
for (const auto & [offset, cell] : cells_by_offset)
file_segments.push_back(FileSegment::getSnapshot(cell.file_segment, cache_lock));
}
return file_segments;
}
@ -884,7 +1004,7 @@ size_t LRUFileCache::getUsedCacheSize() const
size_t LRUFileCache::getUsedCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const
{
return queue.getTotalWeight(cache_lock);
return queue.getTotalCacheSize(cache_lock);
}
size_t LRUFileCache::getAvailableCacheSize() const
@ -941,7 +1061,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(
}
}
LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add(
const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & /* cache_lock */)
{
#ifndef NDEBUG
@ -959,30 +1079,30 @@ LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
return queue.insert(queue.end(), FileKeyAndOffset(key, offset, size));
}
void LRUFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size -= queue_it->size;
queue.erase(queue_it);
}
void LRUFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.clear();
cache_size = 0;
}
void LRUFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.splice(queue.end(), queue, queue_it);
}
void LRUFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size += size_increment;
queue_it->size += size_increment;
}
bool LRUFileCache::LRUQueue::contains(
bool IFileCache::LRUQueue::contains(
const IFileCache::Key & key, size_t offset, std::lock_guard<std::mutex> & /* cache_lock */) const
{
/// This method is used for assertions in debug mode.
@ -995,31 +1115,7 @@ bool LRUFileCache::LRUQueue::contains(
return false;
}
void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = cache->getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == cache_size);
assert(cache_size <= cache->max_size);
assert(queue.size() <= cache->max_element_size);
}
String LRUFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
String IFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
{
String result;
for (const auto & [key, offset, size, _] : queue)
@ -1068,14 +1164,38 @@ void LRUFileCache::assertCacheCellsCorrectness(
void LRUFileCache::assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock)
{
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
for (const auto & [key, cells_by_offset] : files)
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == queue.getTotalCacheSize(cache_lock));
assert(queue.getTotalCacheSize(cache_lock) <= max_size);
assert(queue.getElementsNum(cache_lock) <= max_element_size);
}
}

View File

@ -45,51 +45,6 @@ public:
size_t getFileSegmentsNum() const override;
private:
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalWeight(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
void assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
struct FileSegmentCell : private boost::noncopyable
{
FileSegmentPtr file_segment;
@ -114,23 +69,12 @@ private:
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first.key) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
@ -155,6 +99,11 @@ private:
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) override;
bool tryReserveForMainList(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
@ -197,6 +146,8 @@ public:
void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
void assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock);
void assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock);
};
}

View File

@ -27,7 +27,7 @@ void LazyPipeFDs::open()
if (fd >= 0)
throw Exception("Pipe is already opened", ErrorCodes::LOGICAL_ERROR);
#ifndef __APPLE__
#ifndef OS_DARWIN
if (0 != pipe2(fds_rw, O_CLOEXEC))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_PIPE);
#else

View File

@ -1,6 +1,6 @@
#include "ProcfsMetricsProvider.h"
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/Exception.h>
#include <IO/ReadBufferFromMemory.h>

View File

@ -4,7 +4,7 @@
#include <boost/noncopyable.hpp>
#if defined(__linux__)
#if defined(OS_LINUX)
struct taskstats;
namespace DB

View File

@ -2,7 +2,7 @@
#include <string.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD)
#include <malloc.h>
#endif
#include <algorithm>

View File

@ -33,7 +33,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, [[maybe_unused
else
error << "Address: " << info.si_addr;
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__)
#if defined(__x86_64__) && !defined(OS_FREEBSD) && !defined(OS_DARWIN) && !defined(__arm__) && !defined(__powerpc__)
auto err_mask = context.uc_mcontext.gregs[REG_ERR];
if ((err_mask & 0x02))
error << " Access: write.";
@ -173,18 +173,18 @@ static void * getCallerAddress(const ucontext_t & context)
{
#if defined(__x86_64__)
/// Get the address at the time the signal was raised from the RIP (x86-64)
# if defined(__FreeBSD__)
# if defined(OS_FREEBSD)
return reinterpret_cast<void *>(context.uc_mcontext.mc_rip);
# elif defined(__APPLE__)
# elif defined(OS_DARWIN)
return reinterpret_cast<void *>(context.uc_mcontext->__ss.__rip);
# else
return reinterpret_cast<void *>(context.uc_mcontext.gregs[REG_RIP]);
# endif
#elif defined(__APPLE__) && defined(__aarch64__)
#elif defined(OS_DARWIN) && defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext->__ss.__pc);
#elif defined(__FreeBSD__) && defined(__aarch64__)
#elif defined(OS_FREEBSD) && defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext.mc_gpregs.gp_elr);
#elif defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext.pc);
@ -201,7 +201,7 @@ void StackTrace::symbolize(
const StackTrace::FramePointers & frame_pointers, [[maybe_unused]] size_t offset,
size_t size, StackTrace::Frames & frames)
{
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
auto symbol_index_ptr = DB::SymbolIndex::instance();
const DB::SymbolIndex & symbol_index = *symbol_index_ptr;
@ -332,7 +332,7 @@ static void toStringEveryLineImpl(
if (size == 0)
return callback("<Empty trace>");
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
auto symbol_index_ptr = DB::SymbolIndex::instance();
const DB::SymbolIndex & symbol_index = *symbol_index_ptr;
std::unordered_map<std::string, DB::Dwarf> dwarfs;

View File

@ -9,7 +9,7 @@
#include <functional>
#include <signal.h>
#ifdef __APPLE__
#ifdef OS_DARWIN
// ucontext is not available without _XOPEN_SOURCE
# ifdef __clang__
# pragma clang diagnostic ignored "-Wreserved-id-macro"

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <Common/SymbolIndex.h>
#include <Common/hex.h>

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <vector>
#include <string>

View File

@ -1,6 +1,6 @@
#include "ThreadProfileEvents.h"
#if defined(__linux__)
#if defined(OS_LINUX)
#include "TaskStatsInfoGetter.h"
#include "ProcfsMetricsProvider.h"
@ -177,7 +177,7 @@ void TasksStatsCounters::incrementProfileEvents(const ::taskstats & prev, const
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
namespace DB
{

View File

@ -8,7 +8,7 @@
#include <Common/logger_useful.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <linux/taskstats.h>
#else
struct taskstats {};
@ -66,7 +66,7 @@ struct RUsageCounters
static RUsageCounters current()
{
::rusage rusage {};
#if !defined(__APPLE__)
#if !defined(OS_DARWIN)
#if defined(OS_SUNOS)
::getrusage(RUSAGE_LWP, &rusage);
#else
@ -102,7 +102,7 @@ private:
}
};
#if defined(__linux__)
#if defined(OS_LINUX)
struct PerfEventInfo
{
@ -171,7 +171,7 @@ extern PerfEventsCounters current_thread_counters;
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
class TasksStatsCounters
{

View File

@ -9,7 +9,7 @@ void write(size_t x, WriteBuffer & out)
writeBinary(x, out);
}
#ifdef __APPLE__
#ifdef OS_DARWIN
void write(uint64_t x, WriteBuffer & out)
{
x = __builtin_bswap64(x);
@ -71,7 +71,7 @@ void write(const Error & x, WriteBuffer & out)
write(static_cast<int32_t>(x), out);
}
#ifdef __APPLE__
#ifdef OS_DARWIN
void read(uint64_t & x, ReadBuffer & in)
{
readBinary(x, in);

View File

@ -16,7 +16,7 @@ using namespace DB;
void write(size_t x, WriteBuffer & out);
/// uint64_t != size_t on darwin
#ifdef __APPLE__
#ifdef OS_DARWIN
void write(uint64_t x, WriteBuffer & out);
#endif
@ -45,7 +45,7 @@ void write(const std::vector<T> & arr, WriteBuffer & out)
}
void read(size_t & x, ReadBuffer & in);
#ifdef __APPLE__
#ifdef OS_DARWIN
void read(uint64_t & x, ReadBuffer & in);
#endif
void read(int64_t & x, ReadBuffer & in);

View File

@ -21,7 +21,7 @@ namespace ErrorCodes
}
#if defined(__linux__)
#if defined(OS_LINUX)
#include <unistd.h>
#include <fcntl.h>
@ -101,7 +101,7 @@ bool supportsAtomicRename()
}
#elif defined(__APPLE__)
#elif defined(OS_DARWIN)
// Includes
#include <dlfcn.h> // For dlsym

View File

@ -5,7 +5,7 @@
#include <pthread.h>
#include <cstdint>
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
# include <pthread_np.h>
#endif
@ -48,7 +48,7 @@ size_t getStackSize(void ** out_address)
address = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(pthread_get_stackaddr_np(thread)) - size);
#else
pthread_attr_t attr;
# if defined(__FreeBSD__) || defined(OS_SUNOS)
# if defined(OS_FREEBSD) || defined(OS_SUNOS)
pthread_attr_init(&attr);
if (0 != pthread_attr_get_np(pthread_self(), &attr))
throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR);

View File

@ -16,7 +16,7 @@
static void setAffinity()
{
#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__sun)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) && !defined(__sun)
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(0, &mask);
@ -283,7 +283,7 @@ int main(int argc, char ** argv)
if (!method || method == 1) test<identity> (n, data.data(), "0: identity");
if (!method || method == 2) test<intHash32> (n, data.data(), "1: intHash32");
#if !defined(__APPLE__) /// The difference in size_t: unsigned long on Linux, unsigned long long on Mac OS.
#if !defined(OS_DARWIN) /// The difference in size_t: unsigned long on Linux, unsigned long long on Mac OS.
if (!method || method == 3) test<intHash64> (n, data.data(), "2: intHash64");
#endif
if (!method || method == 4) test<hash3> (n, data.data(), "3: two rounds");

View File

@ -1,4 +1,4 @@
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/ProcfsMetricsProvider.h>
#include <iostream>
@ -6,7 +6,7 @@
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
int main(int argc, char ** argv)
{
using namespace DB;

View File

@ -16,7 +16,7 @@ static NO_INLINE const void * getAddress()
int main(int argc, char ** argv)
{
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
using namespace DB;
if (argc < 2)

View File

@ -1,6 +1,6 @@
#include "filesystemHelpers.h"
#if defined(__linux__)
#if defined(OS_LINUX)
# include <cstdio>
# include <mntent.h>
# include <sys/sysmacros.h>
@ -64,12 +64,12 @@ std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path)
return std::make_unique<TemporaryFile>(path);
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path)
{
#if defined(__linux__)
#if defined(OS_LINUX)
struct stat sb;
if (lstat(path.c_str(), &sb))
throwFromErrnoWithPath("Cannot lstat " + path, path, ErrorCodes::CANNOT_STAT);
@ -81,12 +81,12 @@ String getBlockDeviceId([[maybe_unused]] const String & path)
#endif
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/rotational");
@ -103,12 +103,12 @@ BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id)
#endif
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/read_ahead_kb");
@ -157,12 +157,12 @@ std::filesystem::path getMountPoint(std::filesystem::path absolute_path)
}
/// Returns name of filesystem mounted to mount_point
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getFilesystemName([[maybe_unused]] const String & mount_point)
{
#if defined(__linux__)
#if defined(OS_LINUX)
FILE * mounted_filesystems = setmntent("/etc/mtab", "r");
if (!mounted_filesystems)
throw DB::Exception("Cannot open /etc/mtab to get name of filesystem", ErrorCodes::SYSTEM_ERROR);

View File

@ -19,7 +19,7 @@ bool enoughSpaceInDirectory(const std::string & path, size_t data_size);
std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path);
// Determine what block device is responsible for specified path
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path);
@ -32,13 +32,13 @@ enum class BlockDeviceType
};
// Try to determine block device type
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id);
// Get size of read-ahead in bytes for specified block device
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id);
@ -47,7 +47,7 @@ UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id);
std::filesystem::path getMountPoint(std::filesystem::path absolute_path);
/// Returns name of filesystem mounted to mount_point
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getFilesystemName([[maybe_unused]] const String & mount_point);

View File

@ -11,7 +11,7 @@ int getCurrentProcessFDCount()
{
namespace fs = std::filesystem;
int result = -1;
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
using namespace DB;
Int32 pid = getpid();

View File

@ -1,6 +1,6 @@
#include <Common/getHashOfLoadedBinary.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <link.h>
#include <array>

View File

@ -1,7 +1,7 @@
#include "getMappedArea.h"
#include <Common/Exception.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/StringUtils/StringUtils.h>
#include <Common/hex.h>

View File

@ -9,7 +9,7 @@ int getMaxFileDescriptorCount()
{
namespace fs = std::filesystem;
int result = -1;
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
using namespace DB;
if (fs::exists("/proc/sys/fs/file-max"))

View File

@ -1,4 +1,4 @@
#if defined(__linux__)
#if defined(OS_LINUX)
#include "hasLinuxCapability.h"

View File

@ -1,5 +1,5 @@
#pragma once
#if defined(__linux__)
#if defined(OS_LINUX)
#include <linux/capability.h>

View File

@ -1,6 +1,6 @@
#include "remapExecutable.h"
#if defined(__linux__) && defined(__amd64__) && defined(__SSE2__) && !defined(SANITIZER) && defined(NDEBUG) && !defined(SPLIT_SHARED_LIBRARIES)
#if defined(OS_LINUX) && defined(__amd64__) && defined(__SSE2__) && !defined(SANITIZER) && defined(NDEBUG) && !defined(SPLIT_SHARED_LIBRARIES)
#include <sys/mman.h>
#include <unistd.h>

View File

@ -1,7 +1,7 @@
#include <pthread.h>
#if defined(__APPLE__) || defined(OS_SUNOS)
#elif defined(__FreeBSD__)
#if defined(OS_DARWIN) || defined(OS_SUNOS)
#elif defined(OS_FREEBSD)
#include <pthread_np.h>
#else
#include <sys/prctl.h>
@ -55,10 +55,10 @@ const char * getThreadName()
if (thread_name[0])
return thread_name;
#if defined(__APPLE__) || defined(OS_SUNOS)
#if defined(OS_DARWIN) || defined(OS_SUNOS)
if (pthread_getname_np(pthread_self(), thread_name, THREAD_NAME_SIZE))
throw DB::Exception("Cannot get thread name with pthread_getname_np()", DB::ErrorCodes::PTHREAD_ERROR);
#elif defined(__FreeBSD__)
#elif defined(OS_FREEBSD)
// TODO: make test. freebsd will have this function soon https://freshbsd.org/commit/freebsd/r337983
// if (pthread_get_name_np(pthread_self(), thread_name, THREAD_NAME_SIZE))
// throw DB::Exception("Cannot get thread name with pthread_get_name_np()", DB::ErrorCodes::PTHREAD_ERROR);

View File

@ -94,9 +94,10 @@ TEST(LRUFileCache, get)
DB::ThreadStatus thread_status;
/// To work with the cache we need a query_id and a query context.
std::string query_id = "query_id";
auto query_context = DB::Context::createCopy(getContext().context);
query_context->makeQueryContext();
query_context->setCurrentQueryId("query_id");
query_context->setCurrentQueryId(query_id);
DB::CurrentThread::QueryScope query_scope_holder(query_context);
DB::FileCacheSettings settings;
@ -509,4 +510,5 @@ TEST(LRUFileCache, get)
assertRange(49, segments1[1], DB::FileSegment::Range(10, 19), DB::FileSegment::State::EMPTY);
assertRange(50, segments1[2], DB::FileSegment::Range(20, 24), DB::FileSegment::State::EMPTY);
}
}

View File

@ -405,7 +405,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
if (last_log_read_result->last_read_index == 0 || last_log_read_result->error) /// If it's broken log then remove it
{
LOG_INFO(log, "Removing log {} because it's empty or read finished with error", description.path);
LOG_INFO(log, "Removing chagelog {} because it's empty or read finished with error", description.path);
std::filesystem::remove(description.path);
existing_changelogs.erase(last_log_read_result->log_start_index);
std::erase_if(logs, [last_log_read_result] (const auto & item) { return item.first >= last_log_read_result->log_start_index; });

View File

@ -236,7 +236,7 @@ String MonitorCommand::run()
print(ret, "key_arena_size", state_machine.getKeyArenaSize());
print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotBufSize());
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
print(ret, "open_file_descriptor_count", getCurrentProcessFDCount());
print(ret, "max_file_descriptor_count", getMaxFileDescriptorCount());
#endif

View File

@ -12,6 +12,7 @@
#include <Coordination/pathUtils.h>
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
namespace DB
{
@ -20,6 +21,7 @@ namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int UNKNOWN_SNAPSHOT;
extern const int LOGICAL_ERROR;
}
namespace
@ -296,6 +298,25 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
}
}
for (const auto & itr : storage.container)
{
if (itr.key != "/")
{
if (itr.value.stat.numChildren != static_cast<int32_t>(itr.value.getChildren().size()))
{
#ifdef NDEBUG
/// TODO (alesapin) remove this, it should be always CORRUPTED_DATA.
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#endif
}
}
}
size_t active_sessions_size;
readBinary(active_sessions_size, in);

View File

@ -13,7 +13,7 @@
#include <iomanip>
#include <mutex>
#include <functional>
#include <Common/logger_useful.h>
#include <base/defines.h>
namespace DB
{
@ -349,7 +349,9 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
container.updateValue(parent_path, [child_path, zxid, &prev_parent_zxid,
parent_cversion, &prev_parent_cversion] (KeeperStorage::Node & parent)
{
++parent.stat.numChildren;
parent.addChild(child_path);
prev_parent_cversion = parent.stat.cversion;
prev_parent_zxid = parent.stat.pzxid;
@ -363,7 +365,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
if (zxid > parent.stat.pzxid)
parent.stat.pzxid = zxid;
++parent.stat.numChildren;
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.path_created = path_created;
@ -385,6 +387,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
undo_parent.stat.cversion = prev_parent_cversion;
undo_parent.stat.pzxid = prev_parent_zxid;
undo_parent.removeChild(child_path);
chassert(undo_parent.stat.numChildren == static_cast<int32_t>(undo_parent.getChildren().size()));
});
storage.container.erase(path_created);
@ -494,7 +497,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
{
response.error = Coordination::Error::ZBADVERSION;
}
else if (it->value.stat.numChildren)
else if (!it->value.getChildren().empty())
{
response.error = Coordination::Error::ZNOTEMPTY;
}
@ -519,6 +522,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
--parent.stat.numChildren;
++parent.stat.cversion;
parent.removeChild(child_basename);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.error = Coordination::Error::ZOK;
@ -540,6 +544,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
++parent.stat.numChildren;
--parent.stat.cversion;
parent.addChild(child_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
};
}
@ -1110,6 +1115,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina
++parent.stat.cversion;
auto base_name = getBaseName(ephemeral_path);
parent.removeChild(base_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
container.erase(ephemeral_path);

View File

@ -576,6 +576,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "", 0) \
M(Bool, filesystem_cache_do_not_evict_index_and_marks_files, true, "", 0) \
M(Bool, enable_filesystem_cache_on_lower_level, true, "", 0) \
M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if it exceeds the query cache size", 0) \
M(UInt64, max_query_cache_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be used by a single query", 0) \
\
M(Bool, use_structure_from_insertion_table_in_table_functions, false, "Use structure from insertion table instead of schema inference from data", 0) \
\

View File

@ -6,11 +6,24 @@
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Core/callOnTypeIndex.h>
#include <Core/SortDescription.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include "config_core.h"
@ -250,6 +263,36 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
}
};
template <typename ColumnType>
struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>
{
using SortCursorHelper<SpecializedSingleColumnSortCursor>::SortCursorHelper;
bool ALWAYS_INLINE greaterAt(const SortCursorHelper<SpecializedSingleColumnSortCursor> & rhs, size_t lhs_pos, size_t rhs_pos) const
{
auto & this_impl = this->impl;
auto & lhs_columns = this_impl->sort_columns;
auto & rhs_columns = rhs.impl->sort_columns;
assert(lhs_columns.size() == 1);
assert(rhs_columns.size() == 1);
const auto & lhs_column = assert_cast<const ColumnType &>(*lhs_columns[0]);
const auto & rhs_column = assert_cast<const ColumnType &>(*rhs_columns[0]);
const auto & desc = this->impl->desc[0];
int res = desc.direction * lhs_column.compareAt(lhs_pos, rhs_pos, rhs_column, desc.nulls_direction);
if (res > 0)
return true;
if (res < 0)
return false;
return this_impl->order > rhs.impl->order;
}
};
/// Separate comparator for locale-sensitive string comparisons
struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
@ -411,6 +454,124 @@ private:
}
};
/** SortQueueVariants allows specializing the sorting queue for concrete types and a sort description.
* The queue must be accessed through the callOnVariant method.
*/
class SortQueueVariants
{
public:
SortQueueVariants() = default;
SortQueueVariants(const DataTypes & sort_description_types, const SortDescription & sort_description)
{
bool has_collation = false;
for (const auto & column_description : sort_description)
{
if (column_description.collator)
{
has_collation = true;
break;
}
}
if (has_collation)
{
queue_variants = SortingHeap<SortCursorWithCollation>();
return;
}
else if (sort_description.size() == 1)
{
TypeIndex column_type_index = sort_description_types[0]->getTypeId();
bool result = callOnIndexAndDataType<void>(
column_type_index,
[&](const auto & types)
{
using Types = std::decay_t<decltype(types)>;
using ColumnDataType = typename Types::LeftType;
using ColumnType = typename ColumnDataType::ColumnType;
queue_variants = SortingHeap<SpecializedSingleColumnSortCursor<ColumnType>>();
return true;
});
if (!result)
queue_variants = SortingHeap<SimpleSortCursor>();
}
else
{
queue_variants = SortingHeap<SortCursor>();
}
}
SortQueueVariants(const Block & header, const SortDescription & sort_description)
: SortQueueVariants(extractSortDescriptionTypesFromHeader(header, sort_description), sort_description)
{
}
template <typename Func>
decltype(auto) callOnVariant(Func && func)
{
return std::visit(func, queue_variants);
}
bool variantSupportJITCompilation() const
{
return std::holds_alternative<SortingHeap<SimpleSortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursorWithCollation>>(queue_variants);
}
private:
static DataTypes extractSortDescriptionTypesFromHeader(const Block & header, const SortDescription & sort_description)
{
size_t sort_description_size = sort_description.size();
DataTypes data_types(sort_description_size);
for (size_t i = 0; i < sort_description_size; ++i)
{
const auto & column_sort_description = sort_description[i];
data_types[i] = header.getByName(column_sort_description.column_name).type;
}
return data_types;
}
std::variant<
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<DateTime64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UUID>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnString>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnFixedString>>,
SortingHeap<SimpleSortCursor>,
SortingHeap<SortCursor>,
SortingHeap<SortCursorWithCollation>>
queue_variants;
};
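
The SortQueueVariants class above stores whichever SortingHeap specialization was chosen in a std::variant and dispatches to it with std::visit via callOnVariant. The following standalone sketch (simplified stand-in types, not the real ClickHouse classes) illustrates that dispatch pattern:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <variant>
#include <vector>

// Simplified stand-ins for the specialized and generic sorting queues. The
// point is that the concrete queue type is chosen once, stored in a
// std::variant, and the hot loop then runs inside std::visit against that
// concrete type.
template <typename T>
struct SpecializedQueueSketch
{
    std::vector<T> values;
    const char * name() const { return "specialized single-column queue"; }
};

struct GenericQueueSketch
{
    const char * name() const { return "generic queue"; }
};

class QueueVariantsSketch
{
public:
    explicit QueueVariantsSketch(bool single_numeric_column)
    {
        if (single_numeric_column)
            queue = SpecializedQueueSketch<std::uint64_t>{};
        else
            queue = GenericQueueSketch{};
    }

    template <typename Func>
    decltype(auto) callOnVariant(Func && func) { return std::visit(func, queue); }

private:
    std::variant<SpecializedQueueSketch<std::uint64_t>, SpecializedQueueSketch<std::string>, GenericQueueSketch> queue;
};

int main()
{
    QueueVariantsSketch variants(/*single_numeric_column=*/ true);
    variants.callOnVariant([](auto & q)
    {
        // Inside the lambda `q` has its concrete alternative type.
        std::cout << q.name() << '\n';
    });
}
```

The real class picks among the SortingHeap<...> alternatives with callOnIndexAndDataType, but the dispatch mechanism is the same: the hot merge loop runs against a concrete cursor type instead of going through virtual calls.
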
template <typename TLeftColumns, typename TRightColumns>
bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescriptionWithPositions & descr)
{

View File

@ -17,6 +17,11 @@ namespace std
using namespace experimental::coroutines_v1;
}
#if __has_warning("-Wdeprecated-experimental-coroutine")
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-experimental-coroutine"
#endif
#else
#include <coroutine>
#pragma GCC diagnostic push

View File

@ -10,7 +10,7 @@
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/resource.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <sys/prctl.h>
#endif
#include <cerrno>
@ -858,7 +858,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
signal_listener = std::make_unique<SignalListener>(*this);
signal_listener_thread.start(*signal_listener);
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
String build_id_hex = DB::SymbolIndex::instance()->getBuildIDHex();
if (build_id_hex.empty())
build_id_info = "no build id";
@ -868,7 +868,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
build_id_info = "no build id";
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
std::string executable_path = getExecutablePath();
if (!executable_path.empty())
@ -986,7 +986,7 @@ void BaseDaemon::setupWatchdog()
if (0 == pid)
{
logger().information("Forked a child process to watch");
#if defined(__linux__)
#if defined(OS_LINUX)
if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL))
logger().warning("Cannot do prctl to ask termination with parent.");
#endif

View File

@ -149,7 +149,7 @@ void SentryWriter::onFault(int sig, const std::string & error_message, const Sta
sentry_set_tag("signal", strsignal(sig));
sentry_set_extra("signal_number", sentry_value_new_int32(sig));
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
const String & build_id_hex = DB::SymbolIndex::instance()->getBuildIDHex();
sentry_set_tag("build_id", build_id_hex.c_str());
#endif

View File

@ -718,9 +718,9 @@ void replaceMissedSubcolumnsByConstants(
addConstantToWithClause(query, name, type);
}
void finalizeObjectColumns(MutableColumns & columns)
void finalizeObjectColumns(const MutableColumns & columns)
{
for (auto & column : columns)
for (const auto & column : columns)
if (auto * column_object = typeid_cast<ColumnObject *>(column.get()))
column_object->finalize();
}

View File

@ -51,7 +51,7 @@ void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescript
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list);
bool hasObjectColumns(const ColumnsDescription & columns);
void finalizeObjectColumns(MutableColumns & columns);
void finalizeObjectColumns(const MutableColumns & columns);
/// Updates types of objects in @object_columns inplace
/// according to types in new_columns.

View File

@ -33,71 +33,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
namespace
{
using Node = typename ColumnObject::Subcolumns::Node;
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(
const std::shared_ptr<Node> & entry, const ColumnObject::Subcolumns & subcolumns)
{
if (!entry->path.hasNested())
return false;
const Node * current_node = subcolumns.findLeaf(entry->path);
const Node * leaf = nullptr;
size_t num_skipped_nested = 0;
while (current_node)
{
/// Try to find the first Nested up to the current node.
const auto * node_nested = subcolumns.findParent(current_node,
[](const auto & candidate) { return candidate.isNested(); });
if (!node_nested)
break;
/// If there are no leaves, skip current node and find
/// the next node up to the current.
leaf = subcolumns.findLeaf(node_nested,
[&](const auto & candidate)
{
return candidate.data.size() == entry->data.size() + 1;
});
if (leaf)
break;
current_node = node_nested->parent;
++num_skipped_nested;
}
if (!leaf)
return false;
auto last_field = leaf->data.getLastField();
if (last_field.isNull())
return false;
const auto & least_common_type = entry->data.getLeastCommonType();
size_t num_dimensions = getNumberOfDimensions(*least_common_type);
assert(num_skipped_nested < num_dimensions);
/// Replace scalars to default values with consistent array sizes.
size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested;
auto default_scalar = num_skipped_nested
? createEmptyArrayField(num_skipped_nested)
: getBaseTypeOfArray(least_common_type)->getDefault();
auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field);
entry->data.insert(std::move(default_field));
return true;
}
}
template <typename Parser>
template <typename Reader>
void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
@ -159,7 +94,7 @@ void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader &
{
if (!paths_set.has(entry->path.getPath()))
{
bool inserted = tryInsertDefaultFromNested(entry, subcolumns);
bool inserted = column_object.tryInsertDefaultFromNested(entry);
if (!inserted)
entry->data.insertDefault();
}

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__linux__) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#include <chrono>
@ -507,7 +507,7 @@ public:
iocb write_request{};
iocb * write_request_ptr{&write_request};
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
write_request.aio.aio_lio_opcode = LIO_WRITE;
write_request.aio.aio_fildes = file.fd;
write_request.aio.aio_buf = reinterpret_cast<volatile void *>(const_cast<char *>(buffer));
@ -576,7 +576,7 @@ public:
iocb request{};
iocb * request_ptr = &request;
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
request.aio.aio_lio_opcode = LIO_READ;
request.aio.aio_fildes = file.fd;
request.aio.aio_buf = reinterpret_cast<volatile void *>(reinterpret_cast<UInt64>(read_buffer_memory.data()));
@ -656,7 +656,7 @@ public:
char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
request.aio.aio_lio_opcode = LIO_READ;
request.aio.aio_fildes = file.fd;
request.aio.aio_buf = reinterpret_cast<volatile void *>(reinterpret_cast<UInt64>(buffer_place));
@ -785,7 +785,7 @@ private:
inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
{
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
return posix_fallocate(fd, offset, len);
#else
return fallocate(fd, 0, offset, len);
@ -796,7 +796,7 @@ private:
{
char * result = nullptr;
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
result = reinterpret_cast<char *>(reinterpret_cast<UInt64>(request.aio.aio_buf));
#else
result = reinterpret_cast<char *>(request.aio_buf);
@ -809,7 +809,7 @@ private:
{
ssize_t bytes_written;
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
bytes_written = aio_return(reinterpret_cast<struct aiocb *>(event.udata));
#else
bytes_written = event.res;

View File

@ -41,7 +41,7 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration(
return storage_configuration;
}
#if defined(OS_LINUX) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration(
const Poco::Util::AbstractConfiguration & config,
@ -209,7 +209,7 @@ DictionaryPtr createCacheDictionaryLayout(
auto storage_configuration = parseCacheStorageConfiguration(config, full_name, layout_type, dictionary_layout_prefix, dict_lifetime);
storage = std::make_shared<CacheDictionaryStorage<dictionary_key_type>>(dict_struct, storage_configuration);
}
#if defined(OS_LINUX) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
else
{
auto storage_configuration = parseSSDCacheStorageConfiguration(config, full_name, layout_type, dictionary_layout_prefix, dict_lifetime);
@ -261,7 +261,7 @@ void registerDictionaryCache(DictionaryFactory & factory)
factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true);
#if defined(OS_LINUX) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
auto create_simple_ssd_cache_layout = [=](const std::string & full_name,
const DictionaryStructure & dict_struct,

View File

@ -1,4 +1,4 @@
#if defined(__linux__) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
#include <gtest/gtest.h>

View File

@ -97,10 +97,10 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::hasPendingDataToRead()
}
std::future<IAsynchronousReader::Result> AsynchronousReadIndirectBufferFromRemoteFS::readInto(char * data, size_t size)
std::future<IAsynchronousReader::Result> AsynchronousReadIndirectBufferFromRemoteFS::asyncReadInto(char * data, size_t size)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<ThreadPoolRemoteFSReader::RemoteFSFileDescriptor>(impl);
request.descriptor = std::make_shared<RemoteFSFileDescriptor>(impl);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
@ -125,7 +125,7 @@ void AsynchronousReadIndirectBufferFromRemoteFS::prefetch()
return;
/// Prefetch even in case hasPendingData() == true.
prefetch_future = readInto(prefetch_buffer.data(), prefetch_buffer.size());
prefetch_future = asyncReadInto(prefetch_buffer.data(), prefetch_buffer.size());
ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches);
}
@ -192,7 +192,7 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
{
ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads);
auto result = readInto(memory.data(), memory.size()).get();
auto result = asyncReadInto(memory.data(), memory.size()).get();
size = result.size;
auto offset = result.offset;

View File

@ -62,7 +62,7 @@ private:
bool hasPendingDataToRead();
std::future<IAsynchronousReader::Result> readInto(char * data, size_t size);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size);
AsynchronousReaderPtr reader;

View File

@ -64,6 +64,7 @@ CachedReadBufferFromFile::CachedReadBufferFromFile(
, current_buffer_id(getRandomASCIIString(8))
, allow_seeks(allow_seeks_)
, use_external_buffer(use_external_buffer_)
, query_context_holder(cache_->getQueryContextHolder(query_id, settings_))
{
}

View File

@ -138,6 +138,8 @@ private:
[[maybe_unused]]bool use_external_buffer;
CurrentMetrics::Increment metric_increment{CurrentMetrics::FilesystemCacheReadBuffers};
ProfileEvents::Counters current_file_segment_counters;
IFileCache::QueryContextHolder query_context_holder;
};
}

View File

@ -160,7 +160,7 @@ void ReadBufferFromRemoteFSGather::appendFilesystemCacheLog()
}
ReadBufferFromRemoteFSGather::ReadResult ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore)
IAsynchronousReader::Result ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore)
{
/**
* Set `data` to current working and internal buffers.

View File

@ -3,6 +3,7 @@
#include <Common/config.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadSettings.h>
#include <IO/AsynchronousReader.h>
#include <Disks/ObjectStorages/IObjectStorage.h>
#if USE_AZURE_BLOB_STORAGE
@ -38,13 +39,7 @@ public:
void setReadUntilPosition(size_t position) override;
struct ReadResult
{
size_t size = 0;
size_t offset = 0;
};
ReadResult readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore) override;
size_t getFileSize() const;

View File

@ -13,7 +13,7 @@
#include <unistd.h>
#include <fcntl.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <sys/syscall.h>
#include <sys/uio.h>
@ -84,7 +84,7 @@ std::future<IAsynchronousReader::Result> ThreadPoolReader::submit(Request reques
int fd = assert_cast<const LocalFileDescriptor &>(*request.descriptor).fd;
#if defined(__linux__)
#if defined(OS_LINUX)
/// Check if data is already in page cache with preadv2 syscall.
/// We don't want to depend on new Linux kernel.

View File

@ -7,11 +7,10 @@
#include <Common/assert_cast.h>
#include <Common/setThreadName.h>
#include <Common/CurrentThread.h>
#include <Common/config.h>
#include <IO/SeekableReadBuffer.h>
#include <future>
#include <iostream>
namespace ProfileEvents
@ -27,8 +26,7 @@ namespace CurrentMetrics
namespace DB
{
ReadBufferFromRemoteFSGather::ReadResult ThreadPoolRemoteFSReader::RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore)
IAsynchronousReader::Result RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore)
{
return reader->readInto(data, size, offset, ignore);
}
@ -69,7 +67,7 @@ std::future<IAsynchronousReader::Result> ThreadPoolRemoteFSReader::submit(Reques
Stopwatch watch(CLOCK_MONOTONIC);
ReadBufferFromRemoteFSGather::ReadResult result;
Result result;
try
{
result = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore);
@ -98,4 +96,5 @@ std::future<IAsynchronousReader::Result> ThreadPoolRemoteFSReader::submit(Reques
return future;
}
}

View File

@ -1,38 +1,32 @@
#pragma once
#include <IO/AsynchronousReader.h>
#include <IO/SeekableReadBuffer.h>
#include <IO/ReadBuffer.h>
#include <Common/ThreadPool.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
namespace DB
{
class ThreadPoolRemoteFSReader : public IAsynchronousReader
{
private:
ThreadPool pool;
public:
ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_);
std::future<Result> submit(Request request) override;
struct RemoteFSFileDescriptor;
};
struct ThreadPoolRemoteFSReader::RemoteFSFileDescriptor : public IFileDescriptor
{
public:
explicit RemoteFSFileDescriptor(std::shared_ptr<ReadBufferFromRemoteFSGather> reader_) : reader(reader_) {}
ReadBufferFromRemoteFSGather::ReadResult readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
std::future<IAsynchronousReader::Result> submit(Request request) override;
private:
std::shared_ptr<ReadBufferFromRemoteFSGather> reader;
ThreadPool pool;
};
class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor
{
public:
explicit RemoteFSFileDescriptor(ReadBufferPtr reader_) : reader(std::move(reader_)) { }
IAsynchronousReader::Result readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
private:
ReadBufferPtr reader;
};
}

View File

@ -96,7 +96,7 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
if (flags == -1)
flags = O_RDONLY | O_CLOEXEC;
#if defined(OS_LINUX) || defined(__FreeBSD__)
#if defined(OS_LINUX) || defined(OS_FREEBSD)
if (settings.direct_io_threshold && estimated_size >= settings.direct_io_threshold)
{
/** O_DIRECT

View File

@ -0,0 +1,15 @@
#include <Disks/ObjectStorages/S3/S3Capabilities.h>
namespace DB
{
S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
return S3Capabilities
{
.support_batch_delete = config.getBool(config_prefix + ".support_batch_delete", true),
.support_proxy = config.getBool(config_prefix + ".support_proxy", config.has(config_prefix + ".proxy")),
};
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include <string>
#include <Poco/Util/AbstractConfiguration.h>
namespace DB
{
/// Supported/unsupported features by different S3 implementations
/// Only useful for S3 implementations that are almost compatible with AWS S3.
struct S3Capabilities
{
/// Google S3 implementation doesn't support batch delete
/// TODO: possibly we have to use Google SDK https://github.com/googleapis/google-cloud-cpp/tree/main/google/cloud/storage
/// because it looks like it is missing a lot of features, such as:
/// 1) batch delete
/// 2) list_v2
/// 3) multipart upload works differently
bool support_batch_delete{true};
/// The Y.Cloud S3 implementation supports a proxy for connections
bool support_proxy{false};
};
S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
}
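
The struct above only records what the backend can do; the delete code paths in S3ObjectStorage (shown further below) branch on it. A small standalone sketch (hypothetical simplified types, not the AWS SDK or ClickHouse classes) of that fallback:

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for S3Capabilities: only the flag relevant to deletes.
struct CapabilitiesSketch
{
    bool support_batch_delete{true};
};

// Mirrors the fallback used by S3ObjectStorage::removeObjects: one request per
// key for backends without batch delete (e.g. GCS), a single batched
// DeleteObjects request otherwise.
void removeObjectsSketch(const CapabilitiesSketch & caps, const std::vector<std::string> & paths)
{
    if (!caps.support_batch_delete)
    {
        for (const auto & path : paths)
            std::cout << "DeleteObject " << path << '\n';
    }
    else
    {
        std::cout << "DeleteObjects with " << paths.size() << " keys in one request\n";
    }
}

int main()
{
    removeObjectsSketch(CapabilitiesSketch{.support_batch_delete = false}, {"data/1", "data/2"});
    removeObjectsSketch(CapabilitiesSketch{.support_batch_delete = true}, {"data/1", "data/2"});
}
```

With support_batch_delete set to false (as for GCS), every key becomes its own request; otherwise all keys go into one batched DeleteObjects request, chunked by objects_chunk_size_to_delete in the real code.
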

View File

@ -17,6 +17,7 @@
#include <aws/s3/model/CopyObjectRequest.h>
#include <aws/s3/model/ListObjectsV2Request.h>
#include <aws/s3/model/HeadObjectRequest.h>
#include <aws/s3/model/DeleteObjectRequest.h>
#include <aws/s3/model/DeleteObjectsRequest.h>
#include <aws/s3/model/CreateMultipartUploadRequest.h>
#include <aws/s3/model/CompleteMultipartUploadRequest.h>
@ -202,18 +203,34 @@ void S3ObjectStorage::listPrefix(const std::string & path, BlobsPathToSize & chi
void S3ObjectStorage::removeObject(const std::string & path)
{
auto client_ptr = client.get();
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(path);
auto settings_ptr = s3_settings.get();
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects({obj});
// If batch delete is not supported, fall back to a single-object delete request.
// This allows us to work with GCS, which doesn't support DeleteObjects.
if (!s3_capabilities.support_batch_delete)
{
Aws::S3::Model::DeleteObjectRequest request;
request.SetBucket(bucket);
request.SetKey(path);
auto outcome = client_ptr->DeleteObject(request);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
else
{
/// TODO: For AWS we prefer to use multiobject operation even for single object
/// maybe we shouldn't?
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(path);
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects({obj});
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
throwIfError(outcome);
}
}
void S3ObjectStorage::removeObjects(const std::vector<std::string> & paths)
@ -224,31 +241,39 @@ void S3ObjectStorage::removeObjects(const std::vector<std::string> & paths)
auto client_ptr = client.get();
auto settings_ptr = s3_settings.get();
size_t chunk_size_limit = settings_ptr->objects_chunk_size_to_delete;
size_t current_position = 0;
while (current_position < paths.size())
if (!s3_capabilities.support_batch_delete)
{
std::vector<Aws::S3::Model::ObjectIdentifier> current_chunk;
String keys;
for (; current_position < paths.size() && current_chunk.size() < chunk_size_limit; ++current_position)
for (const auto & path : paths)
removeObject(path);
}
else
{
size_t chunk_size_limit = settings_ptr->objects_chunk_size_to_delete;
size_t current_position = 0;
while (current_position < paths.size())
{
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(paths[current_position]);
current_chunk.push_back(obj);
std::vector<Aws::S3::Model::ObjectIdentifier> current_chunk;
String keys;
for (; current_position < paths.size() && current_chunk.size() < chunk_size_limit; ++current_position)
{
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(paths[current_position]);
current_chunk.push_back(obj);
if (!keys.empty())
keys += ", ";
keys += paths[current_position];
if (!keys.empty())
keys += ", ";
keys += paths[current_position];
}
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects(current_chunk);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects(current_chunk);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
}
@ -483,7 +508,7 @@ std::unique_ptr<IObjectStorage> S3ObjectStorage::cloneObjectStorage(
return std::make_unique<S3ObjectStorage>(
getClient(config, config_prefix, context),
getSettings(config, config_prefix, context),
version_id, new_namespace);
version_id, s3_capabilities, new_namespace);
}
}

View File

@ -5,6 +5,7 @@
#if USE_AWS_S3
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <Disks/ObjectStorages/S3/S3Capabilities.h>
#include <memory>
#include <aws/s3/S3Client.h>
#include <aws/s3/model/HeadObjectResult.h>
@ -45,10 +46,12 @@ public:
std::unique_ptr<Aws::S3::S3Client> && client_,
std::unique_ptr<S3ObjectStorageSettings> && s3_settings_,
String version_id_,
const S3Capabilities & s3_capabilities_,
String bucket_)
: bucket(bucket_)
, client(std::move(client_))
, s3_settings(std::move(s3_settings_))
, s3_capabilities(s3_capabilities_)
, version_id(std::move(version_id_))
{}
@ -134,6 +137,7 @@ private:
MultiVersion<Aws::S3::S3Client> client;
MultiVersion<S3ObjectStorageSettings> s3_settings;
const S3Capabilities s3_capabilities;
const String version_id;
};

Some files were not shown because too many files have changed in this diff.