Merge branch 'master' into stripping

mergify[bot] 2022-06-14 20:48:52 +00:00 committed by GitHub
commit 4bd61950db
148 changed files with 2915 additions and 921 deletions

View File

@ -4,7 +4,7 @@
**[ClickHouse release v22.3-lts, 2022-03-17](#223)**<br>
**[ClickHouse release v22.2, 2022-02-17](#222)**<br>
**[ClickHouse release v22.1, 2022-01-18](#221)**<br>
**[Changelog for 2021](https://github.com/ClickHouse/ClickHouse/blob/master/docs/en/whats-new/changelog/2021.md)**<br>
**[Changelog for 2021](https://clickhouse.com/docs/en/whats-new/changelog/2021/)**<br>
### <a id="225"></a> ClickHouse release 22.5, 2022-05-19
@ -172,7 +172,7 @@
#### Backward Incompatible Change
* Do not allow SETTINGS after FORMAT for INSERT queries (there is compatibility setting `parser_settings_after_format_compact` to accept such queries, but it is turned OFF by default). [#35883](https://github.com/ClickHouse/ClickHouse/pull/35883) ([Azat Khuzhin](https://github.com/azat)).
* Do not allow SETTINGS after FORMAT for INSERT queries (there is compatibility setting `allow_settings_after_format_in_insert` to accept such queries, but it is turned OFF by default). [#35883](https://github.com/ClickHouse/ClickHouse/pull/35883) ([Azat Khuzhin](https://github.com/azat)).
* Function `yandexConsistentHash` (consistent hashing algorithm by Konstantin "kostik" Oblakov) is renamed to `kostikConsistentHash`. The old name is left as an alias for compatibility. Although this change is backward compatible, we may remove the alias in subsequent releases, that's why it's recommended to update the usages of this function in your apps. [#35553](https://github.com/ClickHouse/ClickHouse/pull/35553) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
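A minimal sketch of how these two changes look in practice; the table name `t`, the setting value, and the hash arguments are illustrative and not taken from the changelog:

```sql
-- Old placement of SETTINGS (after FORMAT), rejected by default since this release:
--   INSERT INTO t FORMAT TSV SETTINGS max_insert_block_size = 65536
-- Preferred placement (before FORMAT):
--   INSERT INTO t SETTINGS max_insert_block_size = 65536 FORMAT TSV
-- Compatibility switch to accept the old placement again:
SET allow_settings_after_format_in_insert = 1;

-- The renamed consistent-hashing function; the old name remains as an alias for now:
SELECT kostikConsistentHash(16045690984833335023, 2) = yandexConsistentHash(16045690984833335023, 2);
```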
#### New Feature

View File

@ -13,7 +13,3 @@ ClickHouse® is an open-source column-oriented database management system that a
* [Code Browser (Woboq)](https://clickhouse.com/codebrowser/ClickHouse/index.html) with syntax highlight and navigation.
* [Code Browser (github.dev)](https://github.dev/ClickHouse/ClickHouse) with syntax highlight, powered by github.dev.
* [Contacts](https://clickhouse.com/company/#contact) can help to get your questions answered if there are any.
## Upcoming Events
* [ClickHouse Meetup Amsterdam (in-person and online)](https://www.meetup.com/clickhouse-netherlands-user-group/events/286017044/) on June 8th, 2022

View File

@ -77,6 +77,7 @@ if (OS_LINUX AND NOT LINKER_NAME)
if (NOT LINKER_NAME)
if (GOLD_PATH)
message (WARNING "Linking with gold is not recommended. Please use lld.")
if (COMPILER_GCC)
set (LINKER_NAME "gold")
else ()

View File

@ -76,9 +76,7 @@ message (STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
message (STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")
# ld: unknown option: --color-diagnostics
if (APPLE)
set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
endif ()
set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
# Do not adjust RPATH in llvm, since then it will not be able to find libcxx/libcxxabi/libunwind
set (CMAKE_INSTALL_RPATH "ON")

View File

@ -21,7 +21,9 @@ By default, starting above server instance will be run as default user without p
### connect to it from a native client
```bash
$ docker run -it --rm --link some-clickhouse-server:clickhouse-server clickhouse/clickhouse-client --host clickhouse-server
$ docker run -it --rm --link some-clickhouse-server:clickhouse-server --entrypoint clickhouse-client clickhouse/clickhouse-server --host clickhouse-server
# OR
$ docker exec -it some-clickhouse-server clickhouse-client
```
More information about [ClickHouse client](https://clickhouse.com/docs/en/interfaces/cli/).

View File

@ -7,22 +7,12 @@ RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
python3-requests \
llvm-9
&& apt-get clean
COPY s3downloader /s3downloader
ENV S3_URL="https://clickhouse-datasets.s3.amazonaws.com"
ENV DATASETS="hits visits"
ENV EXPORT_S3_STORAGE_POLICIES=1
# Download Minio-related binaries
RUN arch=${TARGETARCH:-amd64} \
&& if [ "$arch" = "amd64" ] ; then wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.x86_64.rpm"; else wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.aarch64.rpm" ; fi \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/mc" \
&& chmod +x ./mc
ENV MINIO_ROOT_USER="clickhouse"
ENV MINIO_ROOT_PASSWORD="clickhouse"
COPY setup_minio.sh /
COPY run.sh /
CMD ["/bin/bash", "/run.sh"]

View File

@ -17,7 +17,7 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
./setup_minio.sh
./setup_minio.sh stateful
function start()
{

View File

@ -1,77 +0,0 @@
#!/bin/bash
# TODO: Make this file shared with stateless tests
#
# Usage for local run:
#
# ./docker/test/stateful/setup_minio.sh ./tests/
#
set -e -x -a -u
rpm2cpio ./minio-20220103182258.0.0.*.rpm | cpio -i --make-directories
find / -name minio
cp ./usr/local/bin/minio ./
ls -lha
mkdir -p ./minio_data
if [ ! -f ./minio ]; then
echo 'MinIO binary not found, downloading...'
BINARY_TYPE=$(uname -s | tr '[:upper:]' '[:lower:]')
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-amd64/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-amd64/mc" \
&& chmod +x ./mc
fi
MINIO_ROOT_USER=${MINIO_ROOT_USER:-clickhouse}
MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD:-clickhouse}
./minio --version
./minio server --address ":11111" ./minio_data &
i=0
while ! curl -v --silent http://localhost:11111 2>&1 | grep AccessDenied
do
if [[ $i == 60 ]]; then
echo "Failed to setup minio"
exit 0
fi
echo "Trying to connect to minio"
sleep 1
i=$((i + 1))
done
lsof -i :11111
sleep 5
./mc alias set clickminio http://localhost:11111 clickhouse clickhouse
./mc admin user add clickminio test testtest
./mc admin policy set clickminio readwrite user=test
./mc mb clickminio/test
# Upload data to Minio. By default after unpacking all tests will be in
# /usr/share/clickhouse-test/queries
TEST_PATH=${1:-/usr/share/clickhouse-test}
MINIO_DATA_PATH=${TEST_PATH}/queries/1_stateful/data_minio
# Iterating over globs will cause redundant FILE variable to be a path to a file, not a filename
# shellcheck disable=SC2045
for FILE in $(ls "${MINIO_DATA_PATH}"); do
echo "$FILE";
./mc cp "${MINIO_DATA_PATH}"/"$FILE" clickminio/test/"$FILE";
done
mkdir -p ~/.aws
cat <<EOT >> ~/.aws/credentials
[default]
aws_access_key_id=clickhouse
aws_secret_access_key=clickhouse
EOT

View File

@ -0,0 +1 @@
../stateless/setup_minio.sh

View File

@ -5,37 +5,36 @@ FROM clickhouse/test-base:$FROM_TAG
ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz"
# golang version 1.13 on Ubuntu 20 is enough for tests
RUN apt-get update -y \
&& env DEBIAN_FRONTEND=noninteractive \
apt-get install --yes --no-install-recommends \
awscli \
brotli \
expect \
zstd \
golang \
lsof \
mysql-client=8.0* \
ncdu \
netcat-openbsd \
openjdk-11-jre-headless \
openssl \
postgresql-client \
protobuf-compiler \
python3 \
python3-lxml \
python3-pip \
python3-requests \
python3-termcolor \
python3-pip \
qemu-user-static \
sqlite3 \
sudo \
# golang version 1.13 on Ubuntu 20 is enough for tests
golang \
telnet \
tree \
unixodbc \
wget \
mysql-client=8.0* \
postgresql-client \
sqlite3 \
awscli \
openjdk-11-jre-headless \
rpm2cpio \
cpio
zstd \
&& apt-get clean
RUN pip3 install numpy scipy pandas Jinja2
@ -53,13 +52,17 @@ RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
ENV NUM_TRIES=1
ENV MAX_RUN_TIME=0
# Unrelated to the vars in setup_minio.sh, but should be kept the same there
# so that the local running scenario uses the same binaries
ARG MINIO_SERVER_VERSION=2022-01-03T18-22-58Z
ARG MINIO_CLIENT_VERSION=2022-01-05T23-52-51Z
ARG TARGETARCH
# Download Minio-related binaries
RUN arch=${TARGETARCH:-amd64} \
&& if [ "$arch" = "amd64" ] ; then wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.x86_64.rpm"; else wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio-20220103182258.0.0.aarch64.rpm" ; fi \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/mc" \
&& chmod +x ./mc
&& wget "https://dl.min.io/server/minio/release/linux-${arch}/archive/minio.RELEASE.${MINIO_SERVER_VERSION}" -O ./minio \
&& wget "https://dl.min.io/client/mc/release/linux-${arch}/archive/mc.RELEASE.${MINIO_CLIENT_VERSION}" -O ./mc \
&& chmod +x ./mc ./minio
RUN wget 'https://dlcdn.apache.org/hadoop/common/hadoop-3.3.1/hadoop-3.3.1.tar.gz' \

View File

@ -18,7 +18,7 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
# install test configs
/usr/share/clickhouse-test/config/install.sh
./setup_minio.sh
./setup_minio.sh stateless
./setup_hdfs_minicluster.sh
# For flaky check we also enable thread fuzzer

View File

@ -1,29 +1,41 @@
#!/bin/bash
# Usage for local run:
#
# ./docker/test/stateless/setup_minio.sh ./tests/
#
USAGE='Usage for local run:
./docker/test/stateless/setup_minio.sh { stateful | stateless } ./tests/
'
set -e -x -a -u
rpm2cpio ./minio-20220103182258.0.0.*.rpm | cpio -i --make-directories
find / -name minio
cp ./usr/local/bin/minio ./
TEST_TYPE="$1"
shift
case $TEST_TYPE in
stateless) QUERY_DIR=0_stateless ;;
stateful) QUERY_DIR=1_stateful ;;
*) echo "unknown test type $TEST_TYPE"; echo "${USAGE}"; exit 1 ;;
esac
ls -lha
mkdir -p ./minio_data
if [ ! -f ./minio ]; then
MINIO_SERVER_VERSION=${MINIO_SERVER_VERSION:-2022-01-03T18-22-58Z}
MINIO_CLIENT_VERSION=${MINIO_CLIENT_VERSION:-2022-01-05T23-52-51Z}
case $(uname -m) in
x86_64) BIN_ARCH=amd64 ;;
aarch64) BIN_ARCH=arm64 ;;
*) echo "unknown architecture $(uname -m)"; exit 1 ;;
esac
echo 'MinIO binary not found, downloading...'
BINARY_TYPE=$(uname -s | tr '[:upper:]' '[:lower:]')
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-amd64/minio" \
&& chmod +x ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-amd64/mc" \
&& chmod +x ./mc
wget "https://dl.min.io/server/minio/release/${BINARY_TYPE}-${BIN_ARCH}/archive/minio.RELEASE.${MINIO_SERVER_VERSION}" -O ./minio \
&& wget "https://dl.min.io/client/mc/release/${BINARY_TYPE}-${BIN_ARCH}/archive/mc.RELEASE.${MINIO_CLIENT_VERSION}" -O ./mc \
&& chmod +x ./mc ./minio
fi
MINIO_ROOT_USER=${MINIO_ROOT_USER:-clickhouse}
@ -52,14 +64,16 @@ sleep 5
./mc admin user add clickminio test testtest
./mc admin policy set clickminio readwrite user=test
./mc mb clickminio/test
./mc policy set public clickminio/test
if [ "$TEST_TYPE" = "stateless" ]; then
./mc policy set public clickminio/test
fi
# Upload data to Minio. By default after unpacking all tests will be in
# /usr/share/clickhouse-test/queries
TEST_PATH=${1:-/usr/share/clickhouse-test}
MINIO_DATA_PATH=${TEST_PATH}/queries/0_stateless/data_minio
MINIO_DATA_PATH=${TEST_PATH}/queries/${QUERY_DIR}/data_minio
# Iterating over globs will cause redundant FILE variable to be a path to a file, not a filename
# shellcheck disable=SC2045
@ -71,6 +85,6 @@ done
mkdir -p ~/.aws
cat <<EOT >> ~/.aws/credentials
[default]
aws_access_key_id=clickhouse
aws_secret_access_key=clickhouse
aws_access_key_id=${MINIO_ROOT_USER}
aws_secret_access_key=${MINIO_ROOT_PASSWORD}
EOT

View File

@ -174,7 +174,7 @@ install_packages package_folder
configure
./setup_minio.sh
./setup_minio.sh stateful # to have a proper environment
start

View File

@ -19,7 +19,7 @@ The following tutorial is based on the Ubuntu Linux system. With appropriate cha
### Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja}
``` bash
sudo apt-get install git cmake python ninja-build
sudo apt-get install git cmake ccache python3 ninja-build
```
Or cmake3 instead of cmake on older systems.

View File

@ -1,78 +1,139 @@
---
sidebar_label: Web Analytics Data
description: Dataset consists of two tables containing anonymized web analytics data with hits and visits
description: Dataset consisting of two tables containing anonymized web analytics data with hits and visits
---
# Anonymized Web Analytics Data
Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
This dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz.
The tables can be downloaded as compressed `tsv.xz` files. In addition to the sample worked with in this document, an extended (7.5GB) version of the `hits` table containing 100 million rows is available as TSV at [https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz](https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz).
## Obtaining Tables from Prepared Partitions {#obtaining-tables-from-prepared-partitions}
## Download and ingest the data
Download and import hits table:
``` bash
curl -O https://datasets.clickhouse.com/hits/partitions/hits_v1.tar
tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits:
``` bash
curl -O https://datasets.clickhouse.com/visits/partitions/visits_v1.tar
tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
## Obtaining Tables from Compressed TSV File {#obtaining-tables-from-compressed-tsv-file}
Download and import hits from compressed TSV file:
### Download the hits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
# Validate the checksum
md5sum hits_v1.tsv
# Checksum should be equal to: f3631b6295bf06989c1437491f7592cb
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
# for hits_v1
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# for hits_100m_obfuscated
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
# import data
### Create the database and table
```bash
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
For hits_v1
```bash
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
Or for hits_100m_obfuscated
```bash
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
### Import the hits data:
```bash
cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL"
```
Verify the count of rows
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits from compressed tsv-file:
```response
8873898
```
### Download the visits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
# Validate the checksum
md5sum visits_v1.tsv
# Checksum should be equal to: 6dafe1a0f24e59e3fc2d0fed85601de6
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
### Create the visits table
```bash
clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, 
ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# import data
```
### Import the visits data
```bash
cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL"
```
Verify the count
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
## Example Queries {#example-queries}
```response
1680609
```
[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial.
## An example JOIN
Additional examples of queries to these tables can be found among [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there).
The hits and visits dataset is used in the ClickHouse test routines; this is one of the queries from the test suite. The rest of the tests are referenced in the *Next Steps* section at the end of this page.
```bash
clickhouse-client --query "SELECT
EventDate,
hits,
visits
FROM
(
SELECT
EventDate,
count() AS hits
FROM datasets.hits_v1
GROUP BY EventDate
) ANY LEFT JOIN
(
SELECT
StartDate AS EventDate,
sum(Sign) AS visits
FROM datasets.visits_v1
GROUP BY EventDate
) USING EventDate
ORDER BY hits DESC
LIMIT 10
SETTINGS joined_subquery_requires_alias = 0
FORMAT PrettyCompact"
```
```response
┌──EventDate─┬────hits─┬─visits─┐
│ 2014-03-17 │ 1406958 │ 265108 │
│ 2014-03-19 │ 1405797 │ 261624 │
│ 2014-03-18 │ 1383658 │ 258723 │
│ 2014-03-20 │ 1353623 │ 255328 │
│ 2014-03-21 │ 1245779 │ 236232 │
│ 2014-03-23 │ 1046491 │ 202212 │
│ 2014-03-22 │ 1031592 │ 197354 │
└────────────┴─────────┴────────┘
```
## Next Steps
[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) uses the hits dataset to discuss the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
Additional examples of queries to these tables can be found among the ClickHouse [stateful tests](https://github.com/ClickHouse/ClickHouse/blob/d7129855757f38ceec3e4ecc6dafacdabe9b178f/tests/queries/1_stateful/00172_parallel_join.sql).
:::note
The test suite uses a database named `test`, and the tables are named `hits` and `visits`. You can rename your database and tables, or edit the SQL from the test file; one way to align the names is sketched below.
:::
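A rough sketch of the renaming approach, assuming the tables were loaded into the `datasets` database as above and that the database uses the Atomic engine (the default):

```sql
-- Align names with those used by the stateful tests.
RENAME DATABASE datasets TO test;
RENAME TABLE test.hits_v1 TO test.hits, test.visits_v1 TO test.visits;
```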

View File

@ -47,6 +47,8 @@ ClickHouse Inc does **not** maintain the libraries listed below and hasn't done
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -252,12 +252,14 @@ This is an experimental feature that may change in backwards-incompatible ways i
:::
``` sql
CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... GROUP BY time_window_function
CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [INNER ENGINE engine] [ENGINE engine] [WATERMARK strategy] [ALLOWED_LATENESS interval_function] [POPULATE] AS SELECT ... GROUP BY time_window_function
```
Window view can aggregate data by time window and output the results when the window is ready to fire. It stores the partial aggregation results in an inner (or specified) table to reduce latency and can push the processing result to a specified table or push notifications using the WATCH query.
Creating a window view is similar to creating `MATERIALIZED VIEW`. Window view needs an inner storage engine to store intermediate data. The inner storage will use `AggregatingMergeTree` as the default engine.
Creating a window view is similar to creating `MATERIALIZED VIEW`. Window view needs an inner storage engine to store intermediate data. The inner storage can be specified by using the `INNER ENGINE` clause; the window view will use `AggregatingMergeTree` as the default inner engine.
When creating a window view without `TO [db].[table]`, you must specify `ENGINE`, the table engine for storing data.
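A minimal sketch of the syntax described above; the table, view, and column names (`data`, `wv`, `timestamp`, `w_end`) are illustrative only:

```sql
-- Hypothetical source table.
CREATE TABLE data (id UInt64, timestamp DateTime) ENGINE = MergeTree ORDER BY timestamp;

-- No TO [db].[table] is given, so ENGINE (the storage for the results) must be specified;
-- the inner storage defaults to AggregatingMergeTree unless INNER ENGINE is used.
CREATE WINDOW VIEW wv ENGINE Memory WATERMARK=ASCENDING
AS SELECT count(id) AS cnt, tumbleEnd(wid) AS w_end
FROM data
GROUP BY tumble(timestamp, INTERVAL '10' SECOND) AS wid;
```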
### Time Window Functions
@ -297,6 +299,8 @@ CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTE
Note that elements emitted by a late firing should be treated as updated results of a previous computation. Instead of firing at the end of windows, the window view will fire immediately when the late event arrives. Thus, it will result in multiple outputs for the same window. Users need to take these duplicated results into account or deduplicate them.
You can modify the `SELECT` query that was specified in the window view by using the `ALTER TABLE … MODIFY QUERY` statement. The data structure resulting from the new `SELECT` query should be the same as that of the original `SELECT` query, with or without the `TO [db.]name` clause. Note that the data in the current window will be lost because the intermediate state cannot be reused.
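For illustration, continuing the hypothetical `wv` view from the sketch above, a query modification that keeps the same result structure could look like this:

```sql
-- The intermediate state of the current window is discarded by this statement.
ALTER TABLE wv MODIFY QUERY
SELECT count(id) AS cnt, tumbleEnd(wid) AS w_end
FROM data
WHERE id % 2 = 0
GROUP BY tumble(timestamp, INTERVAL '10' SECOND) AS wid;
```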
### Monitoring New Windows
Window view supports the [WATCH](../../../sql-reference/statements/watch.md) query to monitor changes, or the `TO` syntax to output the results to a table.
@ -314,6 +318,7 @@ WATCH [db.]window_view
- `window_view_clean_interval`: The clean interval of window view in seconds to free outdated data. The system will retain the windows that have not been fully triggered according to the system time or `WATERMARK` configuration, and the other data will be deleted.
- `window_view_heartbeat_interval`: The heartbeat interval in seconds to indicate the watch query is alive.
- `wait_for_window_view_fire_signal_timeout`: Timeout for waiting for window view fire signal in event time processing.
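A hedged sketch of applying these settings in a session before watching a window view; the values and the view name `wv` are arbitrary examples, not recommended defaults:

```sql
SET window_view_clean_interval = 60;                 -- clean outdated window data every 60 seconds
SET window_view_heartbeat_interval = 15;             -- heartbeat every 15 seconds while watching
SET wait_for_window_view_fire_signal_timeout = 10;   -- event-time fire signal timeout, in seconds
WATCH wv LIMIT 3;
```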
### Example

View File

@ -32,6 +32,7 @@ The list of available `SYSTEM` statements:
- [START TTL MERGES](#query_language-start-ttl-merges)
- [STOP MOVES](#query_language-stop-moves)
- [START MOVES](#query_language-start-moves)
- [SYSTEM UNFREEZE](#query_language-system-unfreeze)
- [STOP FETCHES](#query_language-system-stop-fetches)
- [START FETCHES](#query_language-system-start-fetches)
- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends)
@ -239,6 +240,14 @@ Returns `Ok.` even if table does not exist. Returns error when database does not
SYSTEM START MOVES [[db.]merge_tree_family_table_name]
```
### SYSTEM UNFREEZE {#query_language-system-unfreeze}
Clears a frozen backup with the specified name from all disks. See more about unfreezing separate parts in [ALTER TABLE table_name UNFREEZE WITH NAME](alter/partition.md#alter_unfreeze-partition).
``` sql
SYSTEM UNFREEZE WITH NAME <backup_name>
```
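As a hedged illustration of how `SYSTEM UNFREEZE` relates to `ALTER TABLE ... FREEZE/UNFREEZE`; the table and backup names are made up:

```sql
-- Create a named frozen backup of a table's partitions.
ALTER TABLE t FREEZE WITH NAME 'backup_name';
-- Remove that backup for a single table...
ALTER TABLE t UNFREEZE WITH NAME 'backup_name';
-- ...or clear the backup with that name from all disks at once.
SYSTEM UNFREEZE WITH NAME 'backup_name';
```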
## Managing ReplicatedMergeTree Tables
ClickHouse can manage background replication related processes in [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md#table_engines-replication) tables.

View File

@ -41,6 +41,8 @@ sidebar_label: "Клиентские библиотеки от сторонни
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -30,6 +30,7 @@ sidebar_label: SYSTEM
- [START TTL MERGES](#query_language-start-ttl-merges)
- [STOP MOVES](#query_language-stop-moves)
- [START MOVES](#query_language-start-moves)
- [SYSTEM UNFREEZE](#query_language-system-unfreeze)
- [STOP FETCHES](#query_language-system-stop-fetches)
- [START FETCHES](#query_language-system-start-fetches)
- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends)
@ -235,6 +236,14 @@ SYSTEM STOP MOVES [[db.]merge_tree_family_table_name]
SYSTEM START MOVES [[db.]merge_tree_family_table_name]
```
### SYSTEM UNFREEZE {#query_language-system-unfreeze}
Removes from disk all "frozen" partitions of the given backup. See more about unfreezing partitions individually in [ALTER TABLE table_name UNFREEZE WITH NAME](alter/partition.md#alter_unfreeze-partition)
``` sql
SYSTEM UNFREEZE WITH NAME <backup_name>
```
## Managing ReplicatedMergeTree Tables {#query-language-system-replicated}
ClickHouse can manage background processes related to replication in tables of the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replacingmergetree.md) family.

View File

@ -5,4 +5,4 @@ sidebar_position: 82
# What's New in ClickHouse?
The development plans are briefly outlined [here](https://github.com/ClickHouse/ClickHouse/issues/17623), and news about previous releases is described in detail in the [changelog](./changelog/).
The development plans are briefly outlined [here](https://github.com/ClickHouse/ClickHouse/issues/32513), and news about previous releases is described in detail in the [changelog](./changelog/).

View File

@ -41,6 +41,10 @@ Yandex does **not** maintain the libraries listed below and has not done any extensive testing
- Ruby
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)
- [RClickHouse](https://github.com/IMSMWU/RClickHouse)

View File

@ -250,12 +250,14 @@ Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table defa
`set allow_experimental_window_view = 1`
``` sql
CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [ENGINE = engine] [WATERMARK = strategy] [ALLOWED_LATENESS = interval_function] AS SELECT ... GROUP BY time_window_function
CREATE WINDOW VIEW [IF NOT EXISTS] [db.]table_name [TO [db.]table_name] [INNER ENGINE engine] [ENGINE engine] [WATERMARK strategy] [ALLOWED_LATENESS interval_function] [POPULATE] AS SELECT ... GROUP BY time_window_function
```
Window view can aggregate data by time window and automatically trigger the corresponding window computation when the window firing condition is met. By storing partial computation state it reduces processing latency, and it supports pushing the processing results to a target table or to the terminal via the `WATCH` query.
Creating a window view is similar to creating a materialized view. Window view stores intermediate computation state in an inner storage engine, which defaults to `AggregatingMergeTree`.
Creating a window view is similar to creating a materialized view. Window view uses the `INNER ENGINE` clause to specify the inner storage engine for intermediate window computation state; `AggregatingMergeTree` is used as the default inner engine.
When creating a window view without `TO [db].[table]`, you must specify `ENGINE`, the table engine for storing data.
### Time Window Functions {#window-view-shi-jian-chuang-kou-han-shu}
@ -295,6 +297,10 @@ CREATE WINDOW VIEW test.wv TO test.dst WATERMARK=ASCENDING ALLOWED_LATENESS=INTE
Note that late messages need to update the results of previous computations. Unlike firing at the end of the window, the window view fires immediately when a late message arrives, so the same window may produce output multiple times. Users need to be aware of this and deduplicate the results.
### Modifying the Query {#window-view-cha-xun-yu-ju-xiu-gai}
Users can modify the window view's `SELECT` query with the `ALTER TABLE ... MODIFY QUERY` statement. Whether or not the `TO [db.]name` clause is used, the data structure of the new `SELECT` query must be the same as that of the old one. Note that, because the intermediate window state cannot be reused, the data in the current window is lost when the query is modified.
### Monitoring New Windows {#window-view-xin-chuang-kou-jian-kong}
Window view can push processing results to the terminal via the `WATCH` query, or push them to a table via the `TO` clause.
@ -309,6 +315,7 @@ WATCH [db.]name [LIMIT n]
- `window_view_clean_interval`: The interval, in seconds, at which the window view cleans up outdated data. The system periodically clears outdated data; data of windows that have not yet fired is not cleared.
- `window_view_heartbeat_interval`: The heartbeat interval, in seconds, used to indicate that the watch query is alive.
- `wait_for_window_view_fire_signal_timeout`: Timeout for waiting for the window view fire signal in event time processing.
### Example {#window-view-shi-li}

View File

@ -26,6 +26,7 @@ sidebar_label: SYSTEM
- [START TTL MERGES](#query_language-start-ttl-merges)
- [STOP MOVES](#query_language-stop-moves)
- [START MOVES](#query_language-start-moves)
- [SYSTEM UNFREEZE](#query_language-system-unfreeze)
- [STOP FETCHES](#query_language-system-stop-fetches)
- [START FETCHES](#query_language-system-start-fetches)
- [STOP REPLICATED SENDS](#query_language-system-start-replicated-sends)
@ -203,6 +204,14 @@ SYSTEM STOP MOVES [[db.]merge_tree_family_table_name]
SYSTEM STOP MOVES [[db.]merge_tree_family_table_name]
```
### SYSTEM UNFREEZE {#query_language-system-unfreeze}
Clears a frozen backup with the specified name from all disks. See more about unfreezing individual parts in [ALTER TABLE table_name UNFREEZE WITH NAME](alter/partition.md#alter_unfreeze-partition)
``` sql
SYSTEM UNFREEZE WITH NAME <backup_name>
```
## Managing ReplicatedMergeTree Tables {#query-language-system-replicated}
Manages background replication-related processes for [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replacingmergetree.md) tables.

View File

@ -164,6 +164,7 @@ enum class AccessType
M(SYSTEM_FLUSH_LOGS, "FLUSH LOGS", GLOBAL, SYSTEM_FLUSH) \
M(SYSTEM_FLUSH, "", GROUP, SYSTEM) \
M(SYSTEM_THREAD_FUZZER, "SYSTEM START THREAD FUZZER, SYSTEM STOP THREAD FUZZER, START THREAD FUZZER, STOP THREAD FUZZER", GLOBAL, SYSTEM) \
M(SYSTEM_UNFREEZE, "SYSTEM UNFREEZE", GLOBAL, SYSTEM) \
M(SYSTEM, "", GROUP, ALL) /* allows to execute SYSTEM {SHUTDOWN|RELOAD CONFIG|...} */ \
\
M(dictGet, "dictHas, dictGetHierarchy, dictIsIn", DICTIONARY, ALL) /* allows to execute functions dictGet(), dictHas(), dictGetHierarchy(), dictIsIn() */\

View File

@ -120,6 +120,7 @@ namespace
AccessRights res = access;
res.modifyFlags(modifier);
res.modifyFlagsWithGrantOption(modifier);
/// Anyone has access to the "system" and "information_schema" database.
res.grant(AccessType::SELECT, DatabaseCatalog::SYSTEM_DATABASE);

View File

@ -326,7 +326,7 @@ Strings BackupCoordinationDistributed::listFiles(const String & prefix, const St
elements.push_back(String{new_element});
}
std::sort(elements.begin(), elements.end());
::sort(elements.begin(), elements.end());
return elements;
}

View File

@ -84,7 +84,7 @@ namespace
return true;
});
std::sort(res.begin(), res.end());
::sort(res.begin(), res.end());
res.erase(std::unique(res.begin(), res.end()), res.end());
return res;
}
@ -113,7 +113,7 @@ namespace
return true;
});
std::sort(res.begin(), res.end());
::sort(res.begin(), res.end());
res.erase(std::unique(res.begin(), res.end()), res.end());
return res;
}

View File

@ -22,8 +22,8 @@ namespace ErrorCodes
extern const int ILLEGAL_COLUMN;
extern const int DUPLICATE_COLUMN;
extern const int NUMBER_OF_DIMENSIONS_MISMATHED;
extern const int NOT_IMPLEMENTED;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int ARGUMENT_OUT_OF_BOUND;
}
namespace
@ -179,7 +179,7 @@ ColumnObject::Subcolumn::Subcolumn(
{
}
size_t ColumnObject::Subcolumn::Subcolumn::size() const
size_t ColumnObject::Subcolumn::size() const
{
size_t res = num_of_defaults_in_prefix;
for (const auto & part : data)
@ -187,7 +187,7 @@ size_t ColumnObject::Subcolumn::Subcolumn::size() const
return res;
}
size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const
size_t ColumnObject::Subcolumn::byteSize() const
{
size_t res = 0;
for (const auto & part : data)
@ -195,7 +195,7 @@ size_t ColumnObject::Subcolumn::Subcolumn::byteSize() const
return res;
}
size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const
size_t ColumnObject::Subcolumn::allocatedBytes() const
{
size_t res = 0;
for (const auto & part : data)
@ -203,6 +203,37 @@ size_t ColumnObject::Subcolumn::Subcolumn::allocatedBytes() const
return res;
}
void ColumnObject::Subcolumn::get(size_t n, Field & res) const
{
if (isFinalized())
{
getFinalizedColumn().get(n, res);
return;
}
size_t ind = n;
if (ind < num_of_defaults_in_prefix)
{
res = least_common_type.get()->getDefault();
return;
}
ind -= num_of_defaults_in_prefix;
for (const auto & part : data)
{
if (ind < part->size())
{
part->get(ind, res);
res = convertFieldToTypeOrThrow(res, *least_common_type.get());
return;
}
ind -= part->size();
}
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Index ({}) for getting field is out of range", n);
}
void ColumnObject::Subcolumn::checkTypes() const
{
DataTypes prefix_types;
@ -221,7 +252,7 @@ void ColumnObject::Subcolumn::checkTypes() const
void ColumnObject::Subcolumn::insert(Field field)
{
auto info = getFieldInfo(field);
auto info = DB::getFieldInfo(field);
insert(std::move(field), std::move(info));
}
@ -244,8 +275,8 @@ static bool isConversionRequiredBetweenIntegers(const IDataType & lhs, const IDa
bool is_native_int = which_lhs.isNativeInt() && which_rhs.isNativeInt();
bool is_native_uint = which_lhs.isNativeUInt() && which_rhs.isNativeUInt();
return (is_native_int || is_native_uint)
&& lhs.getSizeOfValueInMemory() <= rhs.getSizeOfValueInMemory();
return (!is_native_int && !is_native_uint)
|| lhs.getSizeOfValueInMemory() > rhs.getSizeOfValueInMemory();
}
void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
@ -288,7 +319,7 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
}
else if (!least_common_base_type->equals(*base_type) && !isNothing(base_type))
{
if (!isConversionRequiredBetweenIntegers(*base_type, *least_common_base_type))
if (isConversionRequiredBetweenIntegers(*base_type, *least_common_base_type))
{
base_type = getLeastSupertype(DataTypes{std::move(base_type), least_common_base_type}, true);
type_changed = true;
@ -305,35 +336,96 @@ void ColumnObject::Subcolumn::insert(Field field, FieldInfo info)
void ColumnObject::Subcolumn::insertRangeFrom(const Subcolumn & src, size_t start, size_t length)
{
assert(src.isFinalized());
const auto & src_column = src.data.back();
const auto & src_type = src.least_common_type.get();
assert(start + length <= src.size());
size_t end = start + length;
if (data.empty())
{
addNewColumnPart(src.least_common_type.get());
data.back()->insertRangeFrom(*src_column, start, length);
addNewColumnPart(src.getLeastCommonType());
}
else if (least_common_type.get()->equals(*src_type))
else if (!least_common_type.get()->equals(*src.getLeastCommonType()))
{
data.back()->insertRangeFrom(*src_column, start, length);
}
else
{
auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type.get(), src_type}, true);
auto casted_column = castColumn({src_column, src_type, ""}, new_least_common_type);
if (!least_common_type.get()->equals(*new_least_common_type))
auto new_least_common_type = getLeastSupertype(DataTypes{least_common_type.get(), src.getLeastCommonType()}, true);
if (!new_least_common_type->equals(*least_common_type.get()))
addNewColumnPart(std::move(new_least_common_type));
}
data.back()->insertRangeFrom(*casted_column, start, length);
if (end <= src.num_of_defaults_in_prefix)
{
data.back()->insertManyDefaults(length);
return;
}
if (start < src.num_of_defaults_in_prefix)
data.back()->insertManyDefaults(src.num_of_defaults_in_prefix - start);
auto insert_from_part = [&](const auto & column, size_t from, size_t n)
{
assert(from + n <= column->size());
auto column_type = getDataTypeByColumn(*column);
if (column_type->equals(*least_common_type.get()))
{
data.back()->insertRangeFrom(*column, from, n);
return;
}
/// If we need to insert large range, there is no sense to cut part of column and cast it.
/// Casting of all column and inserting from it can be faster.
/// Threshold is just a guess.
if (n * 3 >= column->size())
{
auto casted_column = castColumn({column, column_type, ""}, least_common_type.get());
data.back()->insertRangeFrom(*casted_column, from, n);
return;
}
auto casted_column = column->cut(from, n);
casted_column = castColumn({casted_column, column_type, ""}, least_common_type.get());
data.back()->insertRangeFrom(*casted_column, 0, n);
};
size_t pos = 0;
size_t processed_rows = src.num_of_defaults_in_prefix;
/// Find the first part of the column that intersects the range.
while (pos < src.data.size() && processed_rows + src.data[pos]->size() < start)
{
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the first part of column.
if (pos < src.data.size() && processed_rows < start)
{
size_t part_start = start - processed_rows;
size_t part_length = std::min(src.data[pos]->size() - part_start, end - start);
insert_from_part(src.data[pos], part_start, part_length);
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the parts of column in the middle of range.
while (pos < src.data.size() && processed_rows + src.data[pos]->size() < end)
{
insert_from_part(src.data[pos], 0, src.data[pos]->size());
processed_rows += src.data[pos]->size();
++pos;
}
/// Insert from the last part of column if needed.
if (pos < src.data.size() && processed_rows < end)
{
size_t part_end = end - processed_rows;
insert_from_part(src.data[pos], 0, part_end);
}
}
bool ColumnObject::Subcolumn::isFinalized() const
{
return data.empty() ||
(data.size() == 1 && !data[0]->isSparse() && num_of_defaults_in_prefix == 0);
return num_of_defaults_in_prefix == 0 &&
(data.empty() || (data.size() == 1 && !data[0]->isSparse()));
}
void ColumnObject::Subcolumn::finalize()
@ -432,6 +524,13 @@ void ColumnObject::Subcolumn::popBack(size_t n)
num_of_defaults_in_prefix -= n;
}
ColumnObject::Subcolumn ColumnObject::Subcolumn::cut(size_t start, size_t length) const
{
Subcolumn new_subcolumn(0, is_nullable);
new_subcolumn.insertRangeFrom(*this, start, length);
return new_subcolumn;
}
Field ColumnObject::Subcolumn::getLastField() const
{
if (data.empty())
@ -442,6 +541,18 @@ Field ColumnObject::Subcolumn::getLastField() const
return (*last_part)[last_part->size() - 1];
}
FieldInfo ColumnObject::Subcolumn::getFieldInfo() const
{
const auto & base_type = least_common_type.getBase();
return FieldInfo
{
.scalar_type = base_type,
.have_nulls = base_type->isNullable(),
.need_convert = false,
.num_dimensions = least_common_type.getNumberOfDimensions(),
};
}
ColumnObject::Subcolumn ColumnObject::Subcolumn::recreateWithDefaultValues(const FieldInfo & field_info) const
{
auto scalar_type = field_info.scalar_type;
@ -479,6 +590,13 @@ const ColumnPtr & ColumnObject::Subcolumn::getFinalizedColumnPtr() const
return data[0];
}
ColumnObject::Subcolumn::LeastCommonType::LeastCommonType()
: type(std::make_shared<DataTypeNothing>())
, base_type(type)
, num_dimensions(0)
{
}
ColumnObject::Subcolumn::LeastCommonType::LeastCommonType(DataTypePtr type_)
: type(std::move(type_))
, base_type(getBaseTypeOfArray(type))
@ -525,16 +643,6 @@ size_t ColumnObject::size() const
return num_rows;
}
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
/// cloneResized with new_size == 0 is used for cloneEmpty().
if (new_size != 0)
throw Exception(ErrorCodes::NOT_IMPLEMENTED,
"ColumnObject doesn't support resize to non-zero length");
return ColumnObject::create(is_nullable);
}
size_t ColumnObject::byteSize() const
{
size_t res = 0;
@ -553,23 +661,21 @@ size_t ColumnObject::allocatedBytes() const
void ColumnObject::forEachSubcolumn(ColumnCallback callback)
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot iterate over non-finalized ColumnObject");
for (auto & entry : subcolumns)
callback(entry->data.data.back());
for (auto & part : entry->data.data)
callback(part);
}
void ColumnObject::insert(const Field & field)
{
const auto & object = field.get<const Object &>();
HashSet<StringRef, StringRefHash> inserted;
HashSet<StringRef, StringRefHash> inserted_paths;
size_t old_size = size();
for (const auto & [key_str, value] : object)
{
PathInData key(key_str);
inserted.insert(key_str);
inserted_paths.insert(key_str);
if (!hasSubcolumn(key))
addSubcolumn(key, old_size);
@ -578,8 +684,14 @@ void ColumnObject::insert(const Field & field)
}
for (auto & entry : subcolumns)
if (!inserted.has(entry->path.getPath()))
entry->data.insertDefault();
{
if (!inserted_paths.has(entry->path.getPath()))
{
bool inserted = tryInsertDefaultFromNested(entry);
if (!inserted)
entry->data.insertDefault();
}
}
++num_rows;
}
@ -594,26 +706,21 @@ void ColumnObject::insertDefault()
Field ColumnObject::operator[](size_t n) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
Object object;
for (const auto & entry : subcolumns)
object[entry->path.getPath()] = (*entry->data.data.back())[n];
Field object;
get(n, object);
return object;
}
void ColumnObject::get(size_t n, Field & res) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get Field from non-finalized ColumnObject");
assert(n < size());
res = Object();
auto & object = res.get<Object &>();
for (const auto & entry : subcolumns)
{
auto it = object.try_emplace(entry->path.getPath()).first;
entry->data.data.back()->get(n, it->second);
entry->data.get(n, it->second);
}
}
@ -626,41 +733,28 @@ void ColumnObject::insertFrom(const IColumn & src, size_t n)
void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t length)
{
const auto & src_object = assert_cast<const ColumnObject &>(src);
if (!src_object.isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insertRangeFrom non-finalized ColumnObject");
for (auto & entry : subcolumns)
{
if (src_object.hasSubcolumn(entry->path))
entry->data.insertRangeFrom(src_object.getSubcolumn(entry->path), start, length);
else
entry->data.insertManyDefaults(length);
}
for (const auto & entry : src_object.subcolumns)
{
if (!hasSubcolumn(entry->path))
{
if (entry->path.hasNested())
{
const auto & base_type = entry->data.getLeastCommonTypeBase();
FieldInfo field_info
{
.scalar_type = base_type,
.have_nulls = base_type->isNullable(),
.need_convert = false,
.num_dimensions = entry->data.getNumberOfDimensions(),
};
addNestedSubcolumn(entry->path, field_info, num_rows);
}
addNestedSubcolumn(entry->path, entry->data.getFieldInfo(), num_rows);
else
{
addSubcolumn(entry->path, num_rows);
}
}
auto & subcolumn = getSubcolumn(entry->path);
subcolumn.insertRangeFrom(entry->data, start, length);
auto & subcolumn = getSubcolumn(entry->path);
subcolumn.insertRangeFrom(entry->data, start, length);
}
for (auto & entry : subcolumns)
{
if (!src_object.hasSubcolumn(entry->path))
{
bool inserted = tryInsertManyDefaultsFromNested(entry);
if (!inserted)
entry->data.insertManyDefaults(length);
}
}
@ -668,21 +762,6 @@ void ColumnObject::insertRangeFrom(const IColumn & src, size_t start, size_t len
finalize();
}
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot replicate non-finalized ColumnObject");
auto res_column = ColumnObject::create(is_nullable);
for (const auto & entry : subcolumns)
{
auto replicated_data = entry->data.data.back()->replicate(offsets)->assumeMutable();
res_column->addSubcolumn(entry->path, std::move(replicated_data));
}
return res_column;
}
void ColumnObject::popBack(size_t length)
{
for (auto & entry : subcolumns)
@ -692,10 +771,15 @@ void ColumnObject::popBack(size_t length)
}
template <typename Func>
ColumnPtr ColumnObject::applyForSubcolumns(Func && func, std::string_view func_name) const
MutableColumnPtr ColumnObject::applyForSubcolumns(Func && func) const
{
if (!isFinalized())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot {} non-finalized ColumnObject", func_name);
{
auto finalized = IColumn::mutate(getPtr());
auto & finalized_object = assert_cast<ColumnObject &>(*finalized);
finalized_object.finalize();
return finalized_object.applyForSubcolumns(std::forward<Func>(func));
}
auto res = ColumnObject::create(is_nullable);
for (const auto & subcolumn : subcolumns)
@ -703,22 +787,36 @@ ColumnPtr ColumnObject::applyForSubcolumns(Func && func, std::string_view func_n
auto new_subcolumn = func(subcolumn->data.getFinalizedColumn());
res->addSubcolumn(subcolumn->path, new_subcolumn->assumeMutable());
}
return res;
}
ColumnPtr ColumnObject::permute(const Permutation & perm, size_t limit) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.permute(perm, limit); }, "permute");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.permute(perm, limit); });
}
ColumnPtr ColumnObject::filter(const Filter & filter, ssize_t result_size_hint) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.filter(filter, result_size_hint); }, "filter");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.filter(filter, result_size_hint); });
}
ColumnPtr ColumnObject::index(const IColumn & indexes, size_t limit) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.index(indexes, limit); }, "index");
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.index(indexes, limit); });
}
ColumnPtr ColumnObject::replicate(const Offsets & offsets) const
{
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.replicate(offsets); });
}
MutableColumnPtr ColumnObject::cloneResized(size_t new_size) const
{
if (new_size == 0)
return ColumnObject::create(is_nullable);
return applyForSubcolumns([&](const auto & subcolumn) { return subcolumn.cloneResized(new_size); });
}
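
The permute, filter, index, replicate and cloneResized overrides above all go through a single applyForSubcolumns helper that applies one transformation to every subcolumn and rebuilds the composite column from the results. A minimal standalone sketch of that pattern (the container types are stand-ins, not the ClickHouse classes):

```cpp
#include <map>
#include <string>
#include <vector>

using Column = std::vector<int>;
using ObjectColumn = std::map<std::string, Column>;

// Apply the same transformation to every subcolumn and rebuild the composite column.
template <typename Func>
ObjectColumn applyForSubcolumns(const ObjectColumn & src, Func && func)
{
    ObjectColumn res;
    for (const auto & [path, column] : src)
        res[path] = func(column);   // e.g. a permute/filter/replicate/cloneResized analogue
    return res;
}

int main()
{
    ObjectColumn obj{{"a.b", {1, 2, 3}}, {"a.c", {4, 5, 6}}};
    // A cloneResized(2) analogue: keep the first two rows of each subcolumn.
    auto resized = applyForSubcolumns(obj, [](const Column & c) { return Column(c.begin(), c.begin() + 2); });
    return resized.at("a.b").size() == 2 ? 0 : 1;
}
```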
const ColumnObject::Subcolumn & ColumnObject::getSubcolumn(const PathInData & key) const
@ -810,6 +908,92 @@ void ColumnObject::addNestedSubcolumn(const PathInData & key, const FieldInfo &
if (num_rows == 0)
num_rows = new_size;
else if (new_size != num_rows)
throw Exception(ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH,
"Required size of subcolumn {} ({}) is inconsistent with column size ({})",
key.getPath(), new_size, num_rows);
}
const ColumnObject::Subcolumns::Node * ColumnObject::getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const
{
if (!entry->path.hasNested())
return nullptr;
size_t old_size = entry->data.size();
const auto * current_node = subcolumns.findLeaf(entry->path);
const Subcolumns::Node * leaf = nullptr;
while (current_node)
{
/// Try to find the first Nested up to the current node.
const auto * node_nested = subcolumns.findParent(current_node,
[](const auto & candidate) { return candidate.isNested(); });
if (!node_nested)
break;
/// Find the leaf with a subcolumn that contains values
/// for the last rows.
/// If there are no such leaves, skip the current node and
/// continue the search from the next Nested parent above it.
leaf = subcolumns.findLeaf(node_nested,
[&](const auto & candidate)
{
return candidate.data.size() > old_size;
});
if (leaf)
break;
current_node = node_nested->parent;
}
if (leaf && isNothing(leaf->data.getLeastCommonTypeBase()))
return nullptr;
return leaf;
}
bool ColumnObject::tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const
{
const auto * leaf = getLeafOfTheSameNested(entry);
if (!leaf)
return false;
size_t old_size = entry->data.size();
auto field_info = entry->data.getFieldInfo();
/// Cut the needed range from the found leaf
/// and replace its scalar values with the correct
/// default values for the given entry.
auto new_subcolumn = leaf->data
.cut(old_size, leaf->data.size() - old_size)
.recreateWithDefaultValues(field_info);
entry->data.insertRangeFrom(new_subcolumn, 0, new_subcolumn.size());
return true;
}
bool ColumnObject::tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const
{
const auto * leaf = getLeafOfTheSameNested(entry);
if (!leaf)
return false;
auto last_field = leaf->data.getLastField();
if (last_field.isNull())
return false;
size_t leaf_num_dimensions = leaf->data.getNumberOfDimensions();
size_t entry_num_dimensions = entry->data.getNumberOfDimensions();
auto default_scalar = entry_num_dimensions > leaf_num_dimensions
? createEmptyArrayField(entry_num_dimensions - leaf_num_dimensions)
: entry->data.getLeastCommonTypeBase()->getDefault();
auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, leaf_num_dimensions), last_field);
entry->data.insert(std::move(default_field));
return true;
}
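
tryInsertDefaultFromNested above fills the missing leaf by taking the last value of a sibling leaf from the same Nested subtree and swapping its scalars for defaults, so array sizes stay consistent across the Nested columns. A standalone sketch of that idea (a simplified value type, not the real Field/visitor machinery):

```cpp
#include <vector>

// A leaf (scalar) has no children; an array node has children.
struct Value
{
    bool is_scalar = true;
    long long scalar = 0;
    std::vector<Value> children;   // used only when !is_scalar
};

// Keep the array structure of `src` but replace every scalar with `default_scalar`.
static Value replaceScalars(const Value & src, long long default_scalar)
{
    if (src.is_scalar)
        return Value{true, default_scalar, {}};
    Value out;
    out.is_scalar = false;
    out.children.reserve(src.children.size());
    for (const auto & child : src.children)
        out.children.push_back(replaceScalars(child, default_scalar));
    return out;
}

int main()
{
    // Sibling Nested leaf's last value: [[1, 2], [3]].
    Value last{false, 0, {
        Value{false, 0, {Value{true, 1, {}}, Value{true, 2, {}}}},
        Value{false, 0, {Value{true, 3, {}}}}}};

    // The new leaf gets [[0, 0], [0]]: same array sizes, default scalars.
    Value filled = replaceScalars(last, 0);
    return filled.children.size() == 2 ? 0 : 1;
}
```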
PathsInData ColumnObject::getKeys() const
@ -835,7 +1019,7 @@ void ColumnObject::finalize()
{
const auto & least_common_type = entry->data.getLeastCommonType();
/// Do not add subcolumns, which consists only from NULLs.
/// Do not add subcolumns, which consist only of NULLs.
if (isNothing(getBaseTypeOfArray(least_common_type)))
continue;

View File

@ -65,6 +65,7 @@ public:
size_t size() const;
size_t byteSize() const;
size_t allocatedBytes() const;
void get(size_t n, Field & res) const;
bool isFinalized() const;
const DataTypePtr & getLeastCommonType() const { return least_common_type.get(); }
@ -84,6 +85,8 @@ public:
void insertRangeFrom(const Subcolumn & src, size_t start, size_t length);
void popBack(size_t n);
Subcolumn cut(size_t start, size_t length) const;
/// Converts all column's parts to the common type and
/// creates a single column that stores all values.
void finalize();
@ -91,6 +94,8 @@ public:
/// Returns last inserted field.
Field getLastField() const;
FieldInfo getFieldInfo() const;
/// Recreates subcolumn with default scalar values and keeps sizes of arrays.
/// Used to create columns of type Nested with consistent array sizes.
Subcolumn recreateWithDefaultValues(const FieldInfo & field_info) const;
@ -101,13 +106,16 @@ public:
const IColumn & getFinalizedColumn() const;
const ColumnPtr & getFinalizedColumnPtr() const;
const std::vector<WrappedPtr> & getData() const { return data; }
size_t getNumberOfDefaultsInPrefix() const { return num_of_defaults_in_prefix; }
friend class ColumnObject;
private:
class LeastCommonType
{
public:
LeastCommonType() = default;
LeastCommonType();
explicit LeastCommonType(DataTypePtr type_);
const DataTypePtr & get() const { return type; }
@ -175,6 +183,11 @@ public:
/// It cares about consistency of sizes of Nested arrays.
void addNestedSubcolumn(const PathInData & key, const FieldInfo & field_info, size_t new_size);
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array of default values with sizes consistent with the Nested type.
bool tryInsertDefaultFromNested(const Subcolumns::NodePtr & entry) const;
bool tryInsertManyDefaultsFromNested(const Subcolumns::NodePtr & entry) const;
const Subcolumns & getSubcolumns() const { return subcolumns; }
Subcolumns & getSubcolumns() { return subcolumns; }
PathsInData getKeys() const;
@ -189,7 +202,6 @@ public:
TypeIndex getDataType() const override { return TypeIndex::Object; }
size_t size() const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
size_t byteSize() const override;
size_t allocatedBytes() const override;
void forEachSubcolumn(ColumnCallback callback) override;
@ -197,13 +209,14 @@ public:
void insertDefault() override;
void insertFrom(const IColumn & src, size_t n) override;
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
ColumnPtr replicate(const Offsets & offsets) const override;
void popBack(size_t length) override;
Field operator[](size_t n) const override;
void get(size_t n, Field & res) const override;
ColumnPtr permute(const Permutation & perm, size_t limit) const override;
ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;
ColumnPtr index(const IColumn & indexes, size_t limit) const override;
ColumnPtr replicate(const Offsets & offsets) const override;
MutableColumnPtr cloneResized(size_t new_size) const override;
/// All other methods throw exception.
@ -236,7 +249,11 @@ private:
}
template <typename Func>
ColumnPtr applyForSubcolumns(Func && func, std::string_view func_name) const;
MutableColumnPtr applyForSubcolumns(Func && func) const;
/// For a given subcolumn, returns a subcolumn from the same Nested type.
/// It's used to get the shared sizes of Nested arrays to insert correct default values.
const Subcolumns::Node * getLeafOfTheSameNested(const Subcolumns::NodePtr & entry) const;
};
}

View File

@ -0,0 +1,120 @@
#include <Common/FieldVisitorsAccurateComparison.h>
#include <DataTypes/getLeastSupertype.h>
#include <Interpreters/castColumn.h>
#include <Interpreters/convertFieldToType.h>
#include <Columns/ColumnObject.h>
#include <Common/FieldVisitorToString.h>
#include <Common/randomSeed.h>
#include <fmt/core.h>
#include <pcg_random.hpp>
#include <gtest/gtest.h>
#include <random>
using namespace DB;
static pcg64 rng(randomSeed());
Field getRandomField(size_t type)
{
switch (type)
{
case 0:
return rng();
case 1:
return std::uniform_real_distribution<>(0.0, 1.0)(rng);
case 2:
return std::string(rng() % 10, 'a' + rng() % 26);
default:
return Field();
}
}
std::pair<ColumnObject::Subcolumn, std::vector<Field>> generate(size_t size)
{
bool has_defaults = rng() % 3 == 0;
size_t num_defaults = has_defaults ? rng() % size : 0;
ColumnObject::Subcolumn subcolumn(num_defaults, false);
std::vector<Field> fields;
while (subcolumn.size() < size)
{
size_t part_size = rng() % (size - subcolumn.size()) + 1;
size_t field_type = rng() % 3;
for (size_t i = 0; i < part_size; ++i)
{
fields.push_back(getRandomField(field_type));
subcolumn.insert(fields.back());
}
}
std::vector<Field> result_fields;
for (size_t i = 0; i < num_defaults; ++i)
result_fields.emplace_back();
result_fields.insert(result_fields.end(), fields.begin(), fields.end());
return {std::move(subcolumn), std::move(result_fields)};
}
void checkFieldsAreEqual(ColumnObject::Subcolumn subcolumn, const std::vector<Field> & fields)
{
ASSERT_EQ(subcolumn.size(), fields.size());
for (size_t i = 0; i < subcolumn.size(); ++i)
{
Field field;
subcolumn.get(i, field); // Also check 'get' method.
if (!applyVisitor(FieldVisitorAccurateEquals(), field, fields[i]))
{
std::cerr << fmt::format("Wrong value at position {}, expected {}, got {}",
i, applyVisitor(FieldVisitorToString(), fields[i]), applyVisitor(FieldVisitorToString(), field));
ASSERT_TRUE(false);
}
}
}
constexpr size_t T = 1000;
constexpr size_t N = 1000;
TEST(ColumnObject, InsertRangeFrom)
{
for (size_t t = 0; t < T; ++t)
{
auto [subcolumn_dst, fields_dst] = generate(N);
auto [subcolumn_src, fields_src] = generate(N);
ASSERT_EQ(subcolumn_dst.size(), fields_dst.size());
ASSERT_EQ(subcolumn_src.size(), fields_src.size());
const auto & type_dst = subcolumn_dst.getLeastCommonType();
const auto & type_src = subcolumn_src.getLeastCommonType();
auto type_res = getLeastSupertype(DataTypes{type_dst, type_src}, true);
size_t from = rng() % subcolumn_src.size();
size_t to = rng() % subcolumn_src.size();
if (from > to)
std::swap(from, to);
++to;
for (auto & field : fields_dst)
{
if (field.isNull())
field = type_res->getDefault();
else
field = convertFieldToTypeOrThrow(field, *type_res);
}
for (size_t i = from; i < to; ++i)
{
if (fields_src[i].isNull())
fields_dst.push_back(type_res->getDefault());
else
fields_dst.push_back(convertFieldToTypeOrThrow(fields_src[i], *type_res));
}
subcolumn_dst.insertRangeFrom(subcolumn_src, from, to - from);
checkFieldsAreEqual(subcolumn_dst, fields_dst);
}
}

View File

@ -11,7 +11,7 @@
#include <Common/FieldVisitors.h>
using namespace DB;
pcg64 rng(randomSeed());
static pcg64 rng(randomSeed());
std::pair<MutableColumnPtr, MutableColumnPtr> createColumns(size_t n, size_t k)
{

View File

@ -261,7 +261,7 @@ static void getNotEnoughMemoryMessage(std::string & msg)
}
}
if (num_maps > max_map_count * 0.99)
if (num_maps > max_map_count * 0.90)
{
msg += fmt::format(
"\nIt looks like that the process is near the limit on number of virtual memory mappings."

View File

@ -30,6 +30,11 @@ namespace
}
}
static bool isQueryInitialized()
{
return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && CurrentThread::getQueryId().size != 0;
}
IFileCache::IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_)
@ -37,6 +42,7 @@ IFileCache::IFileCache(
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
{
}
@ -59,9 +65,7 @@ String IFileCache::getPathInLocalCache(const Key & key)
bool IFileCache::isReadOnly()
{
return !CurrentThread::isInitialized()
|| !CurrentThread::get().getQueryContext()
|| CurrentThread::getQueryId().size == 0;
return (!isQueryInitialized());
}
void IFileCache::assertInitialized() const
@ -70,6 +74,73 @@ void IFileCache::assertInitialized() const
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized");
}
IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock)
{
if (!isQueryInitialized())
return nullptr;
return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock);
}
IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard<std::mutex> &)
{
auto query_iter = query_map.find(query_id);
return (query_iter == query_map.end()) ? nullptr : query_iter->second;
}
void IFileCache::removeQueryContext(const String & query_id)
{
std::lock_guard cache_lock(mutex);
auto query_iter = query_map.find(query_id);
if (query_iter == query_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to release query context that does not exist");
query_map.erase(query_iter);
}
IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> & cache_lock)
{
if (query_id.empty())
return nullptr;
auto context = getQueryContext(query_id, cache_lock);
if (!context)
{
auto query_iter = query_map.insert({query_id, std::make_shared<QueryContext>(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache)}).first;
context = query_iter->second;
}
return context;
}
IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings)
{
std::lock_guard cache_lock(mutex);
/// If enable_filesystem_query_cache_limit is true and max_query_cache_size is larger than zero,
/// we create a query context for the current query.
if (enable_filesystem_query_cache_limit && settings.max_query_cache_size)
{
auto context = getOrSetQueryContext(query_id, settings, cache_lock);
return QueryContextHolder(query_id, this, context);
}
else
return QueryContextHolder();
}
IFileCache::QueryContextHolder::QueryContextHolder(const String & query_id_, IFileCache * cache_, IFileCache::QueryContextPtr context_)
: query_id(query_id_), cache(cache_), context(context_)
{
}
IFileCache::QueryContextHolder::~QueryContextHolder()
{
/// If only the query_map and the current holder hold the query context,
/// the query has been completed and the query context is released.
if (context && context.use_count() == 2)
cache->removeQueryContext(query_id);
}
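
QueryContextHolder ties the lifetime of a per-query cache context to the query: when only the registry (query_map) and the holder still reference the shared context, use_count() == 2 and the destructor removes it from the map. A minimal standalone sketch of that ownership scheme (the names are illustrative, not the ClickHouse API):

```cpp
#include <memory>
#include <string>
#include <unordered_map>

struct QueryContext { size_t cache_size = 0; };
using QueryContextPtr = std::shared_ptr<QueryContext>;

struct Registry
{
    std::unordered_map<std::string, QueryContextPtr> query_map;

    QueryContextPtr getOrSet(const std::string & query_id)
    {
        auto & ctx = query_map[query_id];
        if (!ctx)
            ctx = std::make_shared<QueryContext>();
        return ctx;
    }
};

struct ContextHolder
{
    std::string query_id;
    Registry * registry = nullptr;
    QueryContextPtr context;

    ~ContextHolder()
    {
        // Registry + this holder == 2 owners: the query finished, release the context.
        if (context && context.use_count() == 2)
            registry->query_map.erase(query_id);
    }
};

int main()
{
    Registry registry;
    {
        ContextHolder holder{"query_1", &registry, registry.getOrSet("query_1")};
        // ... the query runs; reservations are accounted in holder.context ...
    } // holder destroyed -> context removed from the registry
    return registry.query_map.count("query_1"); // 0
}
```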
LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_)
: IFileCache(cache_base_path_, cache_settings_)
, max_stash_element_size(cache_settings_.max_elements)
@ -480,8 +551,129 @@ FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset,
return FileSegmentsHolder(std::move(file_segments));
}
bool LRUFileCache::tryReserve(
const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
auto query_context = enable_filesystem_query_cache_limit ? getCurrentQueryContext(cache_lock) : nullptr;
if (!query_context)
return tryReserveForMainList(key, offset, size, nullptr, cache_lock);
/// The maximum cache capacity for the query is not reached, so reserve
/// in the main LRU queue (tryReserveForMainList() evicts from it if needed).
else if (query_context->getCacheSize() + size <= query_context->getMaxCacheSize())
return tryReserveForMainList(key, offset, size, query_context, cache_lock);
/// When skip_download_if_exceeds_query_cache is true, there is no need
/// to evict old data, skip the cache and read directly from remote fs.
else if (query_context->isSkipDownloadIfExceed())
return false;
/// The maximum cache size for the query is reached, so entries will be
/// evicted from the history of blocks accessed by the current query.
else
{
size_t removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
auto * cell_for_reserve = getCell(key, offset, cache_lock);
std::vector<IFileCache::LRUQueue::Iterator> ghost;
std::vector<FileSegmentCell *> trash;
std::vector<FileSegmentCell *> to_evict;
auto is_overflow = [&]
{
return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size)
|| (max_element_size != 0 && queue_size > max_element_size)
|| (query_context->getCacheSize() + size - removed_size > query_context->getMaxCacheSize());
};
/// Select entries for eviction from the LRU queue held by the query.
for (auto iter = query_context->queue().begin(); iter != query_context->queue().end(); iter++)
{
if (!is_overflow())
break;
auto * cell = getCell(iter->key, iter->offset, cache_lock);
if (!cell)
{
/// The cache cell corresponding to this record may have been evicted by
/// other queries, so the record has become invalid.
ghost.push_back(iter);
removed_size += iter->size;
}
else
{
size_t cell_size = cell->size();
assert(iter->size == cell_size);
if (cell->releasable())
{
auto & file_segment = cell->file_segment;
std::lock_guard segment_lock(file_segment->mutex);
switch (file_segment->download_state)
{
case FileSegment::State::DOWNLOADED:
{
to_evict.push_back(cell);
break;
}
default:
{
trash.push_back(cell);
break;
}
}
removed_size += cell_size;
--queue_size;
}
}
}
auto remove_file_segment = [&](FileSegmentPtr file_segment, size_t file_segment_size)
{
query_context->remove(file_segment->key(), file_segment->offset(), file_segment_size, cache_lock);
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
};
assert(trash.empty());
for (auto & cell : trash)
{
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment, cell->size());
}
for (auto & iter : ghost)
query_context->remove(iter->key, iter->offset, iter->size, cache_lock);
if (is_overflow())
return false;
if (cell_for_reserve)
{
auto queue_iterator = cell_for_reserve->queue_iterator;
if (queue_iterator)
queue.incrementSize(*queue_iterator, size, cache_lock);
else
cell_for_reserve->queue_iterator = queue.add(key, offset, size, cache_lock);
}
for (auto & cell : to_evict)
{
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment, cell->size());
}
query_context->reserve(key, offset, size, cache_lock);
return true;
}
}
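
The branch order in tryReserve above can be read as a small decision function: stay in the main LRU list while the query is under its budget, optionally skip caching entirely when over budget, otherwise evict from the query's own access history. A compact sketch of just that decision (assumed names, not the real API):

```cpp
#include <cstddef>

enum class ReserveDecision { UseMainList, SkipCache, EvictFromQueryQueue };

ReserveDecision decide(size_t query_cache_size, size_t query_max_cache_size,
                       size_t size, bool skip_download_if_exceeds_query_cache)
{
    if (query_cache_size + size <= query_max_cache_size)
        return ReserveDecision::UseMainList;        // budget not reached
    if (skip_download_if_exceeds_query_cache)
        return ReserveDecision::SkipCache;          // read directly from the remote fs
    return ReserveDecision::EvictFromQueryQueue;    // free space from this query's history
}

int main()
{
    // Over budget with the skip flag set -> bypass the cache for this segment.
    return decide(900, 1000, 200, /*skip_download_if_exceeds_query_cache=*/ true)
        == ReserveDecision::SkipCache ? 0 : 1;
}
```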
bool LRUFileCache::tryReserveForMainList(
const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard<std::mutex> & cache_lock)
{
auto removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
@ -499,7 +691,7 @@ bool LRUFileCache::tryReserve(
auto is_overflow = [&]
{
/// max_size == 0 means unlimited cache size, max_element_size means unlimited number of cache elements.
return (max_size != 0 && queue.getTotalWeight(cache_lock) + size - removed_size > max_size)
return (max_size != 0 && queue.getTotalCacheSize(cache_lock) + size - removed_size > max_size)
|| (max_element_size != 0 && queue_size > max_element_size);
};
@ -552,18 +744,19 @@ bool LRUFileCache::tryReserve(
}
}
auto remove_file_segment = [&](FileSegmentPtr file_segment)
{
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
};
/// This case is very unlikely, can happen in case of exception from
/// file_segment->complete(), which would be a logical error.
assert(trash.empty());
for (auto & cell : trash)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment);
}
if (is_overflow())
@ -584,17 +777,16 @@ bool LRUFileCache::tryReserve(
for (auto & cell : to_evict)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
std::lock_guard<std::mutex> segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
if (auto file_segment = cell->file_segment)
remove_file_segment(file_segment);
}
if (queue.getTotalWeight(cache_lock) > (1ull << 63))
if (queue.getTotalCacheSize(cache_lock) > (1ull << 63))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache became inconsistent. There must be a bug");
if (query_context)
query_context->reserve(key, offset, size, cache_lock);
return true;
}
@ -852,7 +1044,6 @@ FileSegments LRUFileCache::getSnapshot() const
for (const auto & [offset, cell] : cells_by_offset)
file_segments.push_back(FileSegment::getSnapshot(cell.file_segment, cache_lock));
}
return file_segments;
}
@ -881,7 +1072,7 @@ size_t LRUFileCache::getUsedCacheSize() const
size_t LRUFileCache::getUsedCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const
{
return queue.getTotalWeight(cache_lock);
return queue.getTotalCacheSize(cache_lock);
}
size_t LRUFileCache::getAvailableCacheSize() const
@ -938,7 +1129,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(
}
}
LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add(
const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & /* cache_lock */)
{
#ifndef NDEBUG
@ -956,30 +1147,30 @@ LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
return queue.insert(queue.end(), FileKeyAndOffset(key, offset, size));
}
void LRUFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size -= queue_it->size;
queue.erase(queue_it);
}
void LRUFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.clear();
cache_size = 0;
}
void LRUFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.splice(queue.end(), queue, queue_it);
}
void LRUFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size += size_increment;
queue_it->size += size_increment;
}
bool LRUFileCache::LRUQueue::contains(
bool IFileCache::LRUQueue::contains(
const IFileCache::Key & key, size_t offset, std::lock_guard<std::mutex> & /* cache_lock */) const
{
/// This method is used for assertions in debug mode.
@ -992,31 +1183,7 @@ bool LRUFileCache::LRUQueue::contains(
return false;
}
void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = cache->getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == cache_size);
assert(cache_size <= cache->max_size);
assert(queue.size() <= cache->max_element_size);
}
String LRUFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
String IFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
{
String result;
for (const auto & [key, offset, size, _] : queue)
@ -1065,14 +1232,38 @@ void LRUFileCache::assertCacheCellsCorrectness(
void LRUFileCache::assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock)
{
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
for (const auto & [key, cells_by_offset] : files)
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == queue.getTotalCacheSize(cache_lock));
assert(queue.getTotalCacheSize(cache_lock) <= max_size);
assert(queue.getElementsNum(cache_lock) <= max_element_size);
}
}

View File

@ -12,6 +12,7 @@
#include <map>
#include "FileCache_fwd.h"
#include <IO/ReadSettings.h>
#include <Common/logger_useful.h>
#include <Common/FileSegment.h>
#include <Core/Types.h>
@ -20,6 +21,14 @@
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class IFileCache;
using FileCachePtr = std::shared_ptr<IFileCache>;
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
*/
@ -106,6 +115,146 @@ protected:
mutable std::mutex mutex;
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalCacheSize(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
/// Used to track and control the cache access of each query.
/// Through it, the cache layer can apply different policies to different queries.
struct QueryContext
{
LRUQueue lru_queue;
AccessRecord records;
size_t cache_size = 0;
size_t max_cache_size;
bool skip_download_if_exceeds_query_cache;
QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_)
: max_cache_size(max_cache_size_)
, skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {}
void remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size < size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
{
lru_queue.remove(record->second, cache_lock);
records.erase({key, offset});
}
}
cache_size -= size;
}
void reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size + size > max_cache_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record == records.end())
{
auto queue_iter = lru_queue.add(key, offset, 0, cache_lock);
record = records.insert({{key, offset}, queue_iter}).first;
}
record->second->size += size;
}
cache_size += size;
}
void use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock)
{
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
lru_queue.moveToEnd(record->second, cache_lock);
}
}
size_t getMaxCacheSize() { return max_cache_size; }
size_t getCacheSize() { return cache_size; }
LRUQueue & queue() { return lru_queue; }
bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; }
};
using QueryContextPtr = std::shared_ptr<QueryContext>;
using QueryContextMap = std::unordered_map<String, QueryContextPtr>;
QueryContextMap query_map;
bool enable_filesystem_query_cache_limit;
QueryContextPtr getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock);
QueryContextPtr getQueryContext(const String & query_id, std::lock_guard<std::mutex> & cache_lock);
void removeQueryContext(const String & query_id);
QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> &);
virtual bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) = 0;
@ -128,9 +277,25 @@ protected:
std::lock_guard<std::mutex> & segment_lock) = 0;
void assertInitialized() const;
};
using FileCachePtr = std::shared_ptr<IFileCache>;
public:
/// Saves query context information and adopts different cache policies
/// for different queries through the query context layer.
struct QueryContextHolder : private boost::noncopyable
{
explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_);
QueryContextHolder() = default;
~QueryContextHolder();
String query_id {};
IFileCache * cache = nullptr;
QueryContextPtr context = nullptr;
};
QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings);
};
class LRUFileCache final : public IFileCache
{
@ -158,51 +323,6 @@ public:
size_t getFileSegmentsNum() const override;
private:
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalWeight(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
void assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
struct FileSegmentCell : private boost::noncopyable
{
FileSegmentPtr file_segment;
@ -227,23 +347,12 @@ private:
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
@ -266,6 +375,11 @@ private:
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) override;
bool tryReserveForMainList(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
@ -309,6 +423,8 @@ public:
void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
void assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock);
void assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock);
};
}
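
The LRUQueue moved into IFileCache is essentially a std::list of (key, offset, size) records with a running total: touching an entry splices it to the back in O(1), and incremental reservations bump both the entry size and the total. A standalone sketch of those mechanics (simplified key type, not the ClickHouse classes):

```cpp
#include <cstddef>
#include <list>
#include <string>

struct Entry { std::string key; size_t offset; size_t size; };

class LruQueue
{
public:
    using Iterator = std::list<Entry>::iterator;

    Iterator add(std::string key, size_t offset, size_t size)
    {
        cache_size += size;
        return queue.insert(queue.end(), Entry{std::move(key), offset, size});
    }

    void remove(Iterator it)
    {
        cache_size -= it->size;
        queue.erase(it);
    }

    // O(1) "touch": move the entry to the most-recently-used end.
    void moveToEnd(Iterator it) { queue.splice(queue.end(), queue, it); }

    // Space reservation is incremental, so entry sizes can grow after insertion.
    void incrementSize(Iterator it, size_t delta) { it->size += delta; cache_size += delta; }

    size_t totalCacheSize() const { return cache_size; }

private:
    std::list<Entry> queue;
    size_t cache_size = 0;
};

int main()
{
    LruQueue q;
    auto a = q.add("file_a", 0, 100);
    q.add("file_b", 0, 50);
    q.moveToEnd(a);          // "file_a" becomes most recently used
    q.incrementSize(a, 28);  // incremental reservation for the same segment
    return q.totalCacheSize() == 178 ? 0 : 1;
}
```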

View File

@ -11,6 +11,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
max_elements = config.getUInt64(config_prefix + ".data_cache_max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
}
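
loadFromConfig reads each cache setting from `<config_prefix>.<name>` with a compile-time default, so the new enable_filesystem_query_cache_limit key is simply absent-means-disabled. A tiny standalone sketch of that read-with-default pattern (a plain map stands in for the Poco configuration):

```cpp
#include <cstdint>
#include <map>
#include <string>

using Config = std::map<std::string, uint64_t>;

// Read "<prefix>.<name>" if present, otherwise fall back to the default.
static uint64_t getUInt64(const Config & config, const std::string & key, uint64_t default_value)
{
    auto it = config.find(key);
    return it == config.end() ? default_value : it->second;
}

int main()
{
    Config config{{"cache.max_size", uint64_t{1} << 30}};
    bool enable_limit = getUInt64(config, "cache.enable_filesystem_query_cache_limit", 0) != 0;
    return enable_limit ? 1 : 0;   // key absent -> default 0 -> limit disabled
}
```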

View File

@ -13,6 +13,7 @@ struct FileCacheSettings
size_t max_elements = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS;
size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
bool cache_on_write_operations = false;
bool enable_filesystem_query_cache_limit = false;
size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;

View File

@ -200,6 +200,7 @@ private:
const Range segment_range;
State download_state;
String downloader_id;
RemoteFileReaderPtr remote_file_reader;

View File

@ -9,13 +9,13 @@ namespace ErrorCodes
extern const int SYNTAX_ERROR;
}
Int32 IntervalKind::toAvgSeconds() const
Float64 IntervalKind::toAvgSeconds() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond: return 0; /// fractional parts of seconds have 0 seconds
case IntervalKind::Nanosecond: return 0.000000001;
case IntervalKind::Microsecond: return 0.000001;
case IntervalKind::Millisecond: return 0.001;
case IntervalKind::Second: return 1;
case IntervalKind::Minute: return 60;
case IntervalKind::Hour: return 3600;
@ -28,6 +28,25 @@ Int32 IntervalKind::toAvgSeconds() const
__builtin_unreachable();
}
bool IntervalKind::isFixedLength() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond:
case IntervalKind::Second:
case IntervalKind::Minute:
case IntervalKind::Hour:
case IntervalKind::Day:
case IntervalKind::Week: return true;
case IntervalKind::Month:
case IntervalKind::Quarter:
case IntervalKind::Year: return false;
}
__builtin_unreachable();
}
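
Switching toAvgSeconds() from Int32 to Float64 matters because the sub-second kinds used to collapse to 0, which broke any arithmetic built on the average interval length. A standalone sketch showing the difference (the values are copied from the switch above; the rest is illustrative):

```cpp
#include <iostream>

// Values copied from the switch above; only a subset of kinds is shown.
static double toAvgSecondsSketch(int kind)
{
    switch (kind)
    {
        case 0: return 0.000000001;  // Nanosecond
        case 1: return 0.000001;     // Microsecond
        case 2: return 0.001;        // Millisecond
        case 3: return 1;            // Second
        default: return 60;          // Minute and coarser kinds omitted
    }
}

int main()
{
    // With the old Int32 return type all three sub-second kinds were 0,
    // so they were indistinguishable and any length-based math degenerated.
    std::cout << "millisecond avg length: " << toAvgSecondsSketch(2) << " s\n";
    return (toAvgSecondsSketch(0) > 0 && toAvgSecondsSketch(2) < toAvgSecondsSketch(3)) ? 0 : 1;
}
```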
IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds)
{
if (num_seconds)

View File

@ -31,12 +31,15 @@ struct IntervalKind
/// Returns number of seconds in one interval.
/// For `Month`, `Quarter` and `Year` the function returns an average number of seconds.
Int32 toAvgSeconds() const;
Float64 toAvgSeconds() const;
/// Chooses an interval kind based on number of seconds.
/// For example, `IntervalKind::fromAvgSeconds(3600)` returns `IntervalKind::Hour`.
static IntervalKind fromAvgSeconds(Int64 num_seconds);
/// Returns whether IntervalKind has a fixed number of seconds (e.g. Day) or a non-fixed one (e.g. Month).
bool isFixedLength() const;
/// Returns an uppercased version of what `toString()` returns.
const char * toKeyword() const;

View File

@ -7,6 +7,7 @@
#include <filesystem>
#include <base/find_symbols.h>
#include <base/sort.h>
#include <base/getFQDNOrHostName.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/Exception.h>
@ -169,7 +170,7 @@ std::vector<ShuffleHost> ZooKeeper::shuffleHosts() const
shuffle_hosts.emplace_back(shuffle_host);
}
std::sort(
::sort(
shuffle_hosts.begin(), shuffle_hosts.end(),
[](const ShuffleHost & lhs, const ShuffleHost & rhs)
{

View File

@ -280,7 +280,15 @@ time_t getModificationTime(const std::string & path)
struct stat st;
if (stat(path.c_str(), &st) == 0)
return st.st_mtime;
DB::throwFromErrnoWithPath("Cannot check modification time for file: " + path, path, DB::ErrorCodes::PATH_ACCESS_DENIED);
DB::throwFromErrnoWithPath("Cannot check modification time for file: " + path, path, DB::ErrorCodes::CANNOT_STAT);
}
time_t getChangeTime(const std::string & path)
{
struct stat st;
if (stat(path.c_str(), &st) == 0)
return st.st_ctime;
DB::throwFromErrnoWithPath("Cannot check change time for file: " + path, path, DB::ErrorCodes::CANNOT_STAT);
}
Poco::Timestamp getModificationTimestamp(const std::string & path)

View File

@ -75,7 +75,10 @@ bool canRead(const std::string & path);
bool canWrite(const std::string & path);
bool canExecute(const std::string & path);
/// st_mtime
time_t getModificationTime(const std::string & path);
Poco::Timestamp getModificationTimestamp(const std::string & path);
void setModificationTime(const std::string & path, time_t time);
/// st_ctime
time_t getChangeTime(const std::string & path);
}
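
getChangeTime mirrors getModificationTime but reads st_ctime, which also advances on metadata updates (chmod, rename, link-count changes), not just content writes. A standalone POSIX sketch comparing the two fields (the file path is only an example):

```cpp
#include <sys/stat.h>
#include <cstdio>

int main()
{
    struct stat st{};
    if (stat("/etc/hostname", &st) != 0)   // example path; any readable file works
        return 1;
    // st_mtime: last content modification; st_ctime: last status (metadata) change.
    std::printf("mtime=%lld ctime=%lld\n",
                static_cast<long long>(st.st_mtime),
                static_cast<long long>(st.st_ctime));
    return 0;
}
```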

View File

@ -98,9 +98,10 @@ TEST(LRUFileCache, get)
DB::ThreadStatus thread_status;
/// To work with the cache, a query_id and a query context are needed.
std::string query_id = "query_id";
auto query_context = DB::Context::createCopy(getContext().context);
query_context->makeQueryContext();
query_context->setCurrentQueryId("query_id");
query_context->setCurrentQueryId(query_id);
DB::CurrentThread::QueryScope query_scope_holder(query_context);
DB::FileCacheSettings settings;
@ -513,4 +514,5 @@ TEST(LRUFileCache, get)
assertRange(49, segments1[1], DB::FileSegment::Range(10, 19), DB::FileSegment::State::EMPTY);
assertRange(50, segments1[2], DB::FileSegment::Range(20, 24), DB::FileSegment::State::EMPTY);
}
}

View File

@ -36,21 +36,20 @@ std::string formatChangelogPath(const std::string & prefix, const ChangelogFileD
return path;
}
ChangelogFileDescription getChangelogFileDescription(const std::string & path_str)
ChangelogFileDescription getChangelogFileDescription(const std::filesystem::path & path)
{
std::filesystem::path path(path_str);
std::string filename = path.stem();
Strings filename_parts;
boost::split(filename_parts, filename, boost::is_any_of("_"));
if (filename_parts.size() < 3)
throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str);
throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path.generic_string());
ChangelogFileDescription result;
result.prefix = filename_parts[0];
result.from_log_index = parse<uint64_t>(filename_parts[1]);
result.to_log_index = parse<uint64_t>(filename_parts[2]);
result.extension = path.extension();
result.path = path_str;
result.path = path.generic_string();
return result;
}
@ -276,6 +275,7 @@ Changelog::Changelog(
Poco::Logger * log_,
bool compress_logs_)
: changelogs_dir(changelogs_dir_)
, changelogs_detached_dir(changelogs_dir / "detached")
, rotate_interval(rotate_interval_)
, force_sync(force_sync_)
, log(log_)
@ -288,12 +288,15 @@ Changelog::Changelog(
for (const auto & p : fs::directory_iterator(changelogs_dir))
{
if (p == changelogs_detached_dir)
continue;
auto file_description = getChangelogFileDescription(p.path());
existing_changelogs[file_description.from_log_index] = file_description;
}
if (existing_changelogs.empty())
LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", changelogs_dir);
LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", changelogs_dir.generic_string());
clean_log_thread = ThreadFromGlobalPool([this] { cleanLogThread(); });
}
@ -328,7 +331,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
/// entries from leader.
if (changelog_description.from_log_index > last_commited_log_index && (changelog_description.from_log_index - last_commited_log_index) > 1)
{
LOG_ERROR(log, "Some records was lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index);
LOG_ERROR(log, "Some records were lost, last committed log index {}, smallest available log index on disk {}. Hopefully will receive missing records from leader.", last_commited_log_index, changelog_description.from_log_index);
/// Nothing to do with our more fresh log, leader will overwrite them, so remove everything and just start from last_commited_index
removeAllLogs();
min_log_id = last_commited_log_index;
@ -342,6 +345,12 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
LOG_WARNING(log, "Don't have required amount of reserved log records. Need to read from {}, smallest available log index on disk {}.", start_to_read_from, changelog_description.from_log_index);
}
}
else if ((changelog_description.from_log_index - last_log_read_result->last_read_index) > 1)
{
LOG_ERROR(log, "Some records were lost, last found log index {}, while the next log index on disk is {}. Hopefully will receive missing records from leader.", last_log_read_result->last_read_index, changelog_description.from_log_index);
removeAllLogsAfter(last_log_read_result->log_start_index);
break;
}
ChangelogReader reader(changelog_description.path);
last_log_read_result = reader.readChangelog(logs, start_to_read_from, log);
@ -405,7 +414,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
if (last_log_read_result->last_read_index == 0 || last_log_read_result->error) /// If it's broken log then remove it
{
LOG_INFO(log, "Removing log {} because it's empty or read finished with error", description.path);
LOG_INFO(log, "Removing chagelog {} because it's empty or read finished with error", description.path);
std::filesystem::remove(description.path);
existing_changelogs.erase(last_log_read_result->log_start_index);
std::erase_if(logs, [last_log_read_result] (const auto & item) { return item.first >= last_log_read_result->log_start_index; });
@ -431,6 +440,44 @@ void Changelog::initWriter(const ChangelogFileDescription & description)
current_writer = std::make_unique<ChangelogWriter>(description.path, WriteMode::Append, description.from_log_index);
}
namespace
{
std::string getCurrentTimestampFolder()
{
const auto timestamp = LocalDateTime{std::time(nullptr)};
return fmt::format(
"{:02}{:02}{:02}T{:02}{:02}{:02}",
timestamp.year(),
timestamp.month(),
timestamp.day(),
timestamp.hour(),
timestamp.minute(),
timestamp.second());
}
}
void Changelog::removeExistingLogs(ChangelogIter begin, ChangelogIter end)
{
const auto timestamp_folder = changelogs_detached_dir / getCurrentTimestampFolder();
for (auto itr = begin; itr != end;)
{
if (!std::filesystem::exists(timestamp_folder))
{
LOG_WARNING(log, "Moving broken logs to {}", timestamp_folder.generic_string());
std::filesystem::create_directories(timestamp_folder);
}
LOG_WARNING(log, "Removing changelog {}", itr->second.path);
const std::filesystem::path path = itr->second.path;
const auto new_path = timestamp_folder / path.filename();
std::filesystem::rename(path, new_path);
itr = existing_changelogs.erase(itr);
}
}
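
Instead of deleting suspicious changelogs outright, removeExistingLogs now moves them into a detached/<timestamp> folder so they can be inspected later. A standalone sketch of that move-to-detached step (simplified timestamp folder, illustrative paths):

```cpp
#include <chrono>
#include <filesystem>
#include <fstream>

namespace fs = std::filesystem;

// Move a file under detached_root/<timestamp>/ instead of deleting it.
void detachFile(const fs::path & file, const fs::path & detached_root)
{
    const auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
    const fs::path folder = detached_root / std::to_string(now);   // simplified timestamp folder
    fs::create_directories(folder);
    fs::rename(file, folder / file.filename());
}

int main()
{
    fs::create_directories("./logs");
    std::ofstream("./logs/changelog_16_20.bin").put('\0');          // stand-in for a broken log
    detachFile("./logs/changelog_16_20.bin", "./logs/detached");
    return fs::exists("./logs/changelog_16_20.bin") ? 1 : 0;        // 0: the file was moved away
}
```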
void Changelog::removeAllLogsAfter(uint64_t remove_after_log_start_index)
{
auto start_to_remove_from_itr = existing_changelogs.upper_bound(remove_after_log_start_index);
@ -440,12 +487,8 @@ void Changelog::removeAllLogsAfter(uint64_t remove_after_log_start_index)
size_t start_to_remove_from_log_id = start_to_remove_from_itr->first;
/// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them.
for (auto itr = start_to_remove_from_itr; itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
LOG_WARNING(log, "Removing changelogs that go after broken changelog entry");
removeExistingLogs(start_to_remove_from_itr, existing_changelogs.end());
std::erase_if(logs, [start_to_remove_from_log_id] (const auto & item) { return item.first >= start_to_remove_from_log_id; });
}
@ -453,12 +496,7 @@ void Changelog::removeAllLogsAfter(uint64_t remove_after_log_start_index)
void Changelog::removeAllLogs()
{
LOG_WARNING(log, "Removing all changelogs");
for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();)
{
LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path);
std::filesystem::remove(itr->second.path);
itr = existing_changelogs.erase(itr);
}
removeExistingLogs(existing_changelogs.begin(), existing_changelogs.end());
logs.clear();
}

View File

@ -138,6 +138,13 @@ private:
/// Starts new file [new_start_log_index, new_start_log_index + rotate_interval]
void rotate(uint64_t new_start_log_index);
/// Currently existing changelogs
std::map<uint64_t, ChangelogFileDescription> existing_changelogs;
using ChangelogIter = decltype(existing_changelogs)::iterator;
void removeExistingLogs(ChangelogIter begin, ChangelogIter end);
static void removeLog(const std::filesystem::path & path, const std::filesystem::path & detached_folder);
/// Remove all changelogs from disk with start_index bigger than start_to_remove_from_id
void removeAllLogsAfter(uint64_t remove_after_log_start_index);
/// Remove all logs from disk
@ -148,14 +155,13 @@ private:
/// Clean useless log files in a background thread
void cleanLogThread();
const std::string changelogs_dir;
const std::filesystem::path changelogs_dir;
const std::filesystem::path changelogs_detached_dir;
const uint64_t rotate_interval;
const bool force_sync;
Poco::Logger * log;
bool compress_logs;
/// Currently existing changelogs
std::map<uint64_t, ChangelogFileDescription> existing_changelogs;
/// Current writer for changelog file
std::unique_ptr<ChangelogWriter> current_writer;

View File

@ -12,6 +12,7 @@
#include <Coordination/pathUtils.h>
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
namespace DB
{
@ -20,6 +21,7 @@ namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int UNKNOWN_SNAPSHOT;
extern const int LOGICAL_ERROR;
}
namespace
@ -151,7 +153,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
/// Better to sort before serialization, otherwise snapshots can be different on different replicas
std::vector<std::pair<int64_t, Coordination::ACLs>> sorted_acl_map(snapshot.acl_map.begin(), snapshot.acl_map.end());
std::sort(sorted_acl_map.begin(), sorted_acl_map.end());
::sort(sorted_acl_map.begin(), sorted_acl_map.end());
/// Serialize ACLs map
writeBinary(sorted_acl_map.size(), out);
for (const auto & [acl_id, acls] : sorted_acl_map)
@ -193,7 +195,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
/// Session must be saved in a sorted order,
/// otherwise snapshots will be different
std::vector<std::pair<int64_t, int64_t>> sorted_session_and_timeout(snapshot.session_and_timeout.begin(), snapshot.session_and_timeout.end());
std::sort(sorted_session_and_timeout.begin(), sorted_session_and_timeout.end());
::sort(sorted_session_and_timeout.begin(), sorted_session_and_timeout.end());
/// Serialize sessions
size_t size = sorted_session_and_timeout.size();
@ -296,6 +298,25 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
}
}
for (const auto & itr : storage.container)
{
if (itr.key != "/")
{
if (itr.value.stat.numChildren != static_cast<int32_t>(itr.value.getChildren().size()))
{
#ifdef NDEBUG
/// TODO (alesapin) remove this, it should be always CORRUPTED_DATA.
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#endif
}
}
}
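
Both the snapshot check above and the chassert() calls added to KeeperStorage enforce the same invariant: stat.numChildren must always equal the size of the node's child set. A standalone sketch of maintaining and asserting that invariant (simplified node type, not the Keeper classes):

```cpp
#include <cassert>
#include <cstdint>
#include <set>
#include <string>

struct Node
{
    std::set<std::string> children;
    int32_t num_children = 0;

    void addChild(const std::string & name)
    {
        children.insert(name);
        ++num_children;
        assert(num_children == static_cast<int32_t>(children.size()));
    }

    void removeChild(const std::string & name)
    {
        children.erase(name);
        --num_children;
        assert(num_children == static_cast<int32_t>(children.size()));
    }
};

int main()
{
    Node parent;
    parent.addChild("a");
    parent.addChild("b");
    parent.removeChild("a");
    return parent.num_children == 1 ? 0 : 1;
}
```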
size_t active_sessions_size;
readBinary(active_sessions_size, in);

View File

@ -13,7 +13,7 @@
#include <iomanip>
#include <mutex>
#include <functional>
#include <Common/logger_useful.h>
#include <base/defines.h>
namespace DB
{
@ -349,7 +349,9 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
container.updateValue(parent_path, [child_path, zxid, &prev_parent_zxid,
parent_cversion, &prev_parent_cversion] (KeeperStorage::Node & parent)
{
++parent.stat.numChildren;
parent.addChild(child_path);
prev_parent_cversion = parent.stat.cversion;
prev_parent_zxid = parent.stat.pzxid;
@ -363,7 +365,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
if (zxid > parent.stat.pzxid)
parent.stat.pzxid = zxid;
++parent.stat.numChildren;
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.path_created = path_created;
@ -385,6 +387,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
undo_parent.stat.cversion = prev_parent_cversion;
undo_parent.stat.pzxid = prev_parent_zxid;
undo_parent.removeChild(child_path);
chassert(undo_parent.stat.numChildren == static_cast<int32_t>(undo_parent.getChildren().size()));
});
storage.container.erase(path_created);
@ -494,7 +497,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
{
response.error = Coordination::Error::ZBADVERSION;
}
else if (it->value.stat.numChildren)
else if (!it->value.getChildren().empty())
{
response.error = Coordination::Error::ZNOTEMPTY;
}
@ -519,6 +522,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
--parent.stat.numChildren;
++parent.stat.cversion;
parent.removeChild(child_basename);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.error = Coordination::Error::ZOK;
@ -540,6 +544,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
++parent.stat.numChildren;
--parent.stat.cversion;
parent.addChild(child_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
};
}
@ -1110,6 +1115,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina
++parent.stat.cversion;
auto base_name = getBaseName(ephemeral_path);
parent.removeChild(base_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
container.erase(ephemeral_path);

View File

@ -698,13 +698,32 @@ TEST_P(CoordinationTest, ChangelogTestStartNewLogAfterRead)
EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin" + params.extension));
}
namespace
{
void assertBrokenLogRemoved(const fs::path & log_folder, const fs::path & filename)
{
EXPECT_FALSE(fs::exists(log_folder / filename));
// Broken logs are moved to the detached/{timestamp} folder.
// We don't know the timestamp, so we iterate over all of them.
for (const auto & dir_entry : fs::recursive_directory_iterator(log_folder / "detached"))
{
if (dir_entry.path().filename() == filename)
return;
}
FAIL() << "Broken log " << filename << " was not moved to the detached folder";
}
}
TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
{
auto params = GetParam();
ChangelogDirTest test("./logs");
static const fs::path log_folder{"./logs"};
DB::KeeperLogStore changelog("./logs", 5, true, params.enable_compression);
auto params = GetParam();
ChangelogDirTest test(log_folder);
DB::KeeperLogStore changelog(log_folder, 5, true, params.enable_compression);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
@ -736,10 +755,10 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin" + params.extension));
assertBrokenLogRemoved(log_folder, "changelog_16_20.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_21_25.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_26_30.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_31_35.bin" + params.extension);
auto entry = getLogEntry("h", 7777);
changelog_reader.append(entry);
@ -751,10 +770,10 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate)
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin" + params.extension));
assertBrokenLogRemoved(log_folder, "changelog_16_20.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_21_25.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_26_30.bin" + params.extension);
assertBrokenLogRemoved(log_folder, "changelog_31_35.bin" + params.extension);
DB::KeeperLogStore changelog_reader2("./logs", 5, true, params.enable_compression);
changelog_reader2.init(1, 0);
@ -788,14 +807,13 @@ TEST_P(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2)
EXPECT_EQ(changelog_reader.size(), 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin" + params.extension));
assertBrokenLogRemoved("./logs", "changelog_21_40.bin" + params.extension);
auto entry = getLogEntry("hello_world", 7777);
changelog_reader.append(entry);
changelog_reader.end_of_append_batch(0, 0);
EXPECT_EQ(changelog_reader.size(), 1);
EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777);
DB::KeeperLogStore changelog_reader2("./logs", 1, true, params.enable_compression);
changelog_reader2.init(1, 0);
EXPECT_EQ(changelog_reader2.size(), 1);
@ -825,10 +843,40 @@ TEST_P(CoordinationTest, ChangelogTestLostFiles)
DB::KeeperLogStore changelog_reader("./logs", 20, true, params.enable_compression);
/// It should print an error message, but still be able to start
changelog_reader.init(5, 0);
EXPECT_FALSE(fs::exists("./logs/changelog_1_20.bin" + params.extension));
EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin" + params.extension));
assertBrokenLogRemoved("./logs", "changelog_21_40.bin" + params.extension);
}
TEST_P(CoordinationTest, ChangelogTestLostFiles2)
{
auto params = GetParam();
ChangelogDirTest test("./logs");
DB::KeeperLogStore changelog("./logs", 10, true, params.enable_compression);
changelog.init(1, 0);
for (size_t i = 0; i < 35; ++i)
{
auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10);
changelog.append(entry);
}
changelog.end_of_append_batch(0, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_10.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_11_20.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_21_30.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_31_40.bin" + params.extension));
// We have a gap in our logs, so we need to remove all the logs after the gap
fs::remove("./logs/changelog_21_30.bin" + params.extension);
DB::KeeperLogStore changelog_reader("./logs", 10, true, params.enable_compression);
/// It should print an error message, but still be able to start
changelog_reader.init(5, 0);
EXPECT_TRUE(fs::exists("./logs/changelog_1_10.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_11_20.bin" + params.extension));
assertBrokenLogRemoved("./logs", "changelog_31_40.bin" + params.extension);
}
struct IntNode
{
int value;

View File

@ -574,6 +574,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \
M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \
M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "", 0) \
M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if exceeds query cache size", 0) \
M(UInt64, max_query_cache_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be used by a single query", 0) \
\
M(Bool, use_structure_from_insertion_table_in_table_functions, false, "Use structure from insertion table instead of schema inference from data", 0) \
\

View File

@ -6,11 +6,24 @@
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Core/callOnTypeIndex.h>
#include <Core/SortDescription.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include "config_core.h"
@ -250,6 +263,36 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
}
};
template <typename ColumnType>
struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>
{
using SortCursorHelper<SpecializedSingleColumnSortCursor>::SortCursorHelper;
bool ALWAYS_INLINE greaterAt(const SortCursorHelper<SpecializedSingleColumnSortCursor> & rhs, size_t lhs_pos, size_t rhs_pos) const
{
auto & this_impl = this->impl;
auto & lhs_columns = this_impl->sort_columns;
auto & rhs_columns = rhs.impl->sort_columns;
assert(lhs_columns.size() == 1);
assert(rhs_columns.size() == 1);
const auto & lhs_column = assert_cast<const ColumnType &>(*lhs_columns[0]);
const auto & rhs_column = assert_cast<const ColumnType &>(*rhs_columns[0]);
const auto & desc = this->impl->desc[0];
int res = desc.direction * lhs_column.compareAt(lhs_pos, rhs_pos, rhs_column, desc.nulls_direction);
if (res > 0)
return true;
if (res < 0)
return false;
return this_impl->order > rhs.impl->order;
}
};
/// Separate comparator for locale-sensitive string comparisons
struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
@ -411,6 +454,124 @@ private:
}
};
/** SortQueueVariants allows specializing the sorting queue for concrete types and a sort description.
* To access the queue, the callOnVariant method must be used.
*/
class SortQueueVariants
{
public:
SortQueueVariants() = default;
SortQueueVariants(const DataTypes & sort_description_types, const SortDescription & sort_description)
{
bool has_collation = false;
for (const auto & column_description : sort_description)
{
if (column_description.collator)
{
has_collation = true;
break;
}
}
if (has_collation)
{
queue_variants = SortingHeap<SortCursorWithCollation>();
return;
}
else if (sort_description.size() == 1)
{
TypeIndex column_type_index = sort_description_types[0]->getTypeId();
bool result = callOnIndexAndDataType<void>(
column_type_index,
[&](const auto & types)
{
using Types = std::decay_t<decltype(types)>;
using ColumnDataType = typename Types::LeftType;
using ColumnType = typename ColumnDataType::ColumnType;
queue_variants = SortingHeap<SpecializedSingleColumnSortCursor<ColumnType>>();
return true;
});
if (!result)
queue_variants = SortingHeap<SimpleSortCursor>();
}
else
{
queue_variants = SortingHeap<SortCursor>();
}
}
SortQueueVariants(const Block & header, const SortDescription & sort_description)
: SortQueueVariants(extractSortDescriptionTypesFromHeader(header, sort_description), sort_description)
{
}
template <typename Func>
decltype(auto) callOnVariant(Func && func)
{
return std::visit(func, queue_variants);
}
bool variantSupportJITCompilation() const
{
return std::holds_alternative<SortingHeap<SimpleSortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursorWithCollation>>(queue_variants);
}
private:
static DataTypes extractSortDescriptionTypesFromHeader(const Block & header, const SortDescription & sort_description)
{
size_t sort_description_size = sort_description.size();
DataTypes data_types(sort_description_size);
for (size_t i = 0; i < sort_description_size; ++i)
{
const auto & column_sort_description = sort_description[i];
data_types[i] = header.getByName(column_sort_description.column_name).type;
}
return data_types;
}
std::variant<
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<DateTime64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UUID>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnString>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnFixedString>>,
SortingHeap<SimpleSortCursor>,
SortingHeap<SortCursor>,
SortingHeap<SortCursorWithCollation>>
queue_variants;
};
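A minimal usage sketch of the class above (not part of this diff; `header`, `sort_description` and a prepared `cursors` are assumed, mirroring how MergingSortedAlgorithm and MergeSorter use it later in this change): construct the variants once, then dispatch on the concrete heap type through `callOnVariant`.

```cpp
// Hedged sketch: `header`, `sort_description` and `cursors` are assumed to exist,
// as in the merging transforms further down in this diff.
SortQueueVariants queue_variants(header, sort_description);
queue_variants.callOnVariant([&](auto & queue)
{
    using QueueType = std::decay_t<decltype(queue)>;
    queue = QueueType(cursors);   // the heap is specialized for the detected single-column type when possible
});
```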
template <typename TLeftColumns, typename TRightColumns>
bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescriptionWithPositions & descr)
{

View File

@ -17,6 +17,11 @@ namespace std
using namespace experimental::coroutines_v1;
}
#if __has_warning("-Wdeprecated-experimental-coroutine")
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-experimental-coroutine"
#endif
#else
#include <coroutine>
#pragma GCC diagnostic push

View File

@ -455,7 +455,7 @@ ColumnWithTypeAndDimensions createTypeFromNode(const Node * node)
}
/// Sort to always create the same type for the same set of subcolumns.
std::sort(tuple_elements.begin(), tuple_elements.end(),
::sort(tuple_elements.begin(), tuple_elements.end(),
[](const auto & lhs, const auto & rhs) { return std::get<0>(lhs) < std::get<0>(rhs); });
auto tuple_names = extractVector<0>(tuple_elements);
@ -692,7 +692,7 @@ void replaceMissedSubcolumnsByConstants(
res.emplace_back(full_name, types[i]);
}
std::sort(res.begin(), res.end());
::sort(res.begin(), res.end());
return res;
};
@ -718,9 +718,9 @@ void replaceMissedSubcolumnsByConstants(
addConstantToWithClause(query, name, type);
}
void finalizeObjectColumns(MutableColumns & columns)
void finalizeObjectColumns(const MutableColumns & columns)
{
for (auto & column : columns)
for (const auto & column : columns)
if (auto * column_object = typeid_cast<ColumnObject *>(column.get()))
column_object->finalize();
}

View File

@ -51,7 +51,7 @@ void extendObjectColumns(NamesAndTypesList & columns_list, const ColumnsDescript
NameSet getNamesOfObjectColumns(const NamesAndTypesList & columns_list);
bool hasObjectColumns(const ColumnsDescription & columns);
void finalizeObjectColumns(MutableColumns & columns);
void finalizeObjectColumns(const MutableColumns & columns);
/// Updates types of objects in @object_columns inplace
/// according to types in new_columns.

View File

@ -33,71 +33,6 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
namespace
{
using Node = typename ColumnObject::Subcolumns::Node;
/// Finds a subcolumn from the same Nested type as @entry and inserts
/// an array with default values with consistent sizes as in Nested type.
bool tryInsertDefaultFromNested(
const std::shared_ptr<Node> & entry, const ColumnObject::Subcolumns & subcolumns)
{
if (!entry->path.hasNested())
return false;
const Node * current_node = subcolumns.findLeaf(entry->path);
const Node * leaf = nullptr;
size_t num_skipped_nested = 0;
while (current_node)
{
/// Try to find the first Nested up to the current node.
const auto * node_nested = subcolumns.findParent(current_node,
[](const auto & candidate) { return candidate.isNested(); });
if (!node_nested)
break;
/// If there are no leaves, skip current node and find
/// the next node up to the current.
leaf = subcolumns.findLeaf(node_nested,
[&](const auto & candidate)
{
return candidate.data.size() == entry->data.size() + 1;
});
if (leaf)
break;
current_node = node_nested->parent;
++num_skipped_nested;
}
if (!leaf)
return false;
auto last_field = leaf->data.getLastField();
if (last_field.isNull())
return false;
const auto & least_common_type = entry->data.getLeastCommonType();
size_t num_dimensions = getNumberOfDimensions(*least_common_type);
assert(num_skipped_nested < num_dimensions);
/// Replace scalars to default values with consistent array sizes.
size_t num_dimensions_to_keep = num_dimensions - num_skipped_nested;
auto default_scalar = num_skipped_nested
? createEmptyArrayField(num_skipped_nested)
: getBaseTypeOfArray(least_common_type)->getDefault();
auto default_field = applyVisitor(FieldVisitorReplaceScalars(default_scalar, num_dimensions_to_keep), last_field);
entry->data.insert(std::move(default_field));
return true;
}
}
template <typename Parser>
template <typename Reader>
void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader && reader) const
@ -159,7 +94,7 @@ void SerializationObject<Parser>::deserializeTextImpl(IColumn & column, Reader &
{
if (!paths_set.has(entry->path.getPath()))
{
bool inserted = tryInsertDefaultFromNested(entry, subcolumns);
bool inserted = column_object.tryInsertDefaultFromNested(entry);
if (!inserted)
entry->data.insertDefault();
}

View File

@ -83,7 +83,7 @@ void DiskDecorator::moveDirectory(const String & from_path, const String & to_pa
delegate->moveDirectory(from_path, to_path);
}
DirectoryIteratorPtr DiskDecorator::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskDecorator::iterateDirectory(const String & path) const
{
return delegate->iterateDirectory(path);
}
@ -113,7 +113,7 @@ void DiskDecorator::copyDirectoryContent(const String & from_dir, const std::sha
delegate->copyDirectoryContent(from_dir, to_disk, to_dir);
}
void DiskDecorator::listFiles(const String & path, std::vector<String> & file_names)
void DiskDecorator::listFiles(const String & path, std::vector<String> & file_names) const
{
delegate->listFiles(path, file_names);
}
@ -171,11 +171,16 @@ void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp &
delegate->setLastModified(path, timestamp);
}
Poco::Timestamp DiskDecorator::getLastModified(const String & path)
Poco::Timestamp DiskDecorator::getLastModified(const String & path) const
{
return delegate->getLastModified(path);
}
time_t DiskDecorator::getLastChanged(const String & path) const
{
return delegate->getLastChanged(path);
}
void DiskDecorator::setReadOnly(const String & path)
{
delegate->setReadOnly(path);

View File

@ -28,13 +28,13 @@ public:
void createDirectories(const String & path) override;
void clearDirectory(const String & path) override;
void moveDirectory(const String & from_path, const String & to_path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) const override;
void createFile(const String & path) override;
void moveFile(const String & from_path, const String & to_path) override;
void replaceFile(const String & from_path, const String & to_path) override;
void copy(const String & from_path, const std::shared_ptr<IDisk> & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
void listFiles(const String & path, std::vector<String> & file_names) const override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
@ -56,7 +56,8 @@ public:
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
time_t getLastChanged(const String & path) const override;
Poco::Timestamp getLastModified(const String & path) const override;
void setReadOnly(const String & path) override;
void createHardLink(const String & src_path, const String & dst_path) override;
void truncateFile(const String & path, size_t size) override;

View File

@ -83,7 +83,7 @@ public:
delegate->moveDirectory(wrapped_from_path, wrapped_to_path);
}
DirectoryIteratorPtr iterateDirectory(const String & path) override
DirectoryIteratorPtr iterateDirectory(const String & path) const override
{
auto wrapped_path = wrappedPath(path);
return delegate->iterateDirectory(wrapped_path);
@ -109,7 +109,7 @@ public:
delegate->replaceFile(wrapped_from_path, wrapped_to_path);
}
void listFiles(const String & path, std::vector<String> & file_names) override
void listFiles(const String & path, std::vector<String> & file_names) const override
{
auto wrapped_path = wrappedPath(path);
delegate->listFiles(wrapped_path, file_names);
@ -192,12 +192,18 @@ public:
delegate->setLastModified(wrapped_path, timestamp);
}
Poco::Timestamp getLastModified(const String & path) override
Poco::Timestamp getLastModified(const String & path) const override
{
auto wrapped_path = wrappedPath(path);
return delegate->getLastModified(wrapped_path);
}
time_t getLastChanged(const String & path) const override
{
auto wrapped_path = wrappedPath(path);
return delegate->getLastChanged(wrapped_path);
}
void setReadOnly(const String & path) override
{
auto wrapped_path = wrappedPath(path);

View File

@ -325,7 +325,7 @@ void DiskLocal::moveDirectory(const String & from_path, const String & to_path)
fs::rename(fs::path(disk_path) / from_path, fs::path(disk_path) / to_path);
}
DirectoryIteratorPtr DiskLocal::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskLocal::iterateDirectory(const String & path) const
{
fs::path meta_path = fs::path(disk_path) / path;
if (!broken && fs::exists(meta_path) && fs::is_directory(meta_path))
@ -387,7 +387,7 @@ void DiskLocal::removeRecursive(const String & path)
fs::remove_all(fs::path(disk_path) / path);
}
void DiskLocal::listFiles(const String & path, std::vector<String> & file_names)
void DiskLocal::listFiles(const String & path, std::vector<String> & file_names) const
{
file_names.clear();
for (const auto & entry : fs::directory_iterator(fs::path(disk_path) / path))
@ -399,11 +399,16 @@ void DiskLocal::setLastModified(const String & path, const Poco::Timestamp & tim
FS::setModificationTime(fs::path(disk_path) / path, timestamp.epochTime());
}
Poco::Timestamp DiskLocal::getLastModified(const String & path)
Poco::Timestamp DiskLocal::getLastModified(const String & path) const
{
return FS::getModificationTimestamp(fs::path(disk_path) / path);
}
time_t DiskLocal::getLastChanged(const String & path) const
{
return FS::getChangeTime(fs::path(disk_path) / path);
}
void DiskLocal::createHardLink(const String & src_path, const String & dst_path)
{
DB::createHardLink(fs::path(disk_path) / src_path, fs::path(disk_path) / dst_path);

View File

@ -58,7 +58,7 @@ public:
void moveDirectory(const String & from_path, const String & to_path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) const override;
void createFile(const String & path) override;
@ -70,7 +70,7 @@ public:
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
void listFiles(const String & path, std::vector<String> & file_names) const override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
@ -91,7 +91,9 @@ public:
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
Poco::Timestamp getLastModified(const String & path) const override;
time_t getLastChanged(const String & path) const override;
void setReadOnly(const String & path) override;

View File

@ -262,7 +262,7 @@ void DiskMemory::moveDirectory(const String & /*from_path*/, const String & /*to
throw Exception("Method moveDirectory is not implemented for memory disks", ErrorCodes::NOT_IMPLEMENTED);
}
DirectoryIteratorPtr DiskMemory::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskMemory::iterateDirectory(const String & path) const
{
std::lock_guard lock(mutex);
@ -409,7 +409,7 @@ void DiskMemory::removeRecursive(const String & path)
}
}
void DiskMemory::listFiles(const String & path, std::vector<String> & file_names)
void DiskMemory::listFiles(const String & path, std::vector<String> & file_names) const
{
std::lock_guard lock(mutex);

View File

@ -52,7 +52,7 @@ public:
void moveDirectory(const String & from_path, const String & to_path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) const override;
void createFile(const String & path) override;
@ -60,7 +60,7 @@ public:
void replaceFile(const String & from_path, const String & to_path) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
void listFiles(const String & path, std::vector<String> & file_names) const override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
@ -81,7 +81,9 @@ public:
void setLastModified(const String &, const Poco::Timestamp &) override {}
Poco::Timestamp getLastModified(const String &) override { return Poco::Timestamp(); }
Poco::Timestamp getLastModified(const String &) const override { return Poco::Timestamp(); }
time_t getLastChanged(const String &) const override { return {}; }
void setReadOnly(const String & path) override;

View File

@ -171,7 +171,7 @@ void DiskRestartProxy::moveDirectory(const String & from_path, const String & to
DiskDecorator::moveDirectory(from_path, to_path);
}
DirectoryIteratorPtr DiskRestartProxy::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskRestartProxy::iterateDirectory(const String & path) const
{
ReadLock lock (mutex);
return DiskDecorator::iterateDirectory(path);
@ -207,7 +207,7 @@ void DiskRestartProxy::copyDirectoryContent(const String & from_dir, const std::
DiskDecorator::copyDirectoryContent(from_dir, to_disk, to_dir);
}
void DiskRestartProxy::listFiles(const String & path, std::vector<String> & file_names)
void DiskRestartProxy::listFiles(const String & path, std::vector<String> & file_names) const
{
ReadLock lock (mutex);
DiskDecorator::listFiles(path, file_names);
@ -276,7 +276,7 @@ void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestam
DiskDecorator::setLastModified(path, timestamp);
}
Poco::Timestamp DiskRestartProxy::getLastModified(const String & path)
Poco::Timestamp DiskRestartProxy::getLastModified(const String & path) const
{
ReadLock lock (mutex);
return DiskDecorator::getLastModified(path);

View File

@ -37,13 +37,13 @@ public:
void createDirectories(const String & path) override;
void clearDirectory(const String & path) override;
void moveDirectory(const String & from_path, const String & to_path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) const override;
void createFile(const String & path) override;
void moveFile(const String & from_path, const String & to_path) override;
void replaceFile(const String & from_path, const String & to_path) override;
void copy(const String & from_path, const DiskPtr & to_disk, const String & to_path) override;
void copyDirectoryContent(const String & from_dir, const std::shared_ptr<IDisk> & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
void listFiles(const String & path, std::vector<String> & file_names) const override;
std::unique_ptr<ReadBufferFromFileBase> readFile(
const String & path,
const ReadSettings & settings,
@ -58,7 +58,7 @@ public:
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
Poco::Timestamp getLastModified(const String & path) const override;
void setReadOnly(const String & path) override;
void createHardLink(const String & src_path, const String & dst_path) override;
void truncateFile(const String & path, size_t size) override;

View File

@ -188,7 +188,7 @@ std::unique_ptr<ReadBufferFromFileBase> DiskWebServer::readFile(const String & p
}
DirectoryIteratorPtr DiskWebServer::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskWebServer::iterateDirectory(const String & path) const
{
std::vector<fs::path> dir_file_paths;
if (files.find(path) == files.end())

View File

@ -90,15 +90,17 @@ public:
size_t getFileSize(const String & path) const override;
void listFiles(const String & /* path */, std::vector<String> & /* file_names */) override { }
void listFiles(const String & /* path */, std::vector<String> & /* file_names */) const override { }
void setReadOnly(const String & /* path */) override {}
bool isDirectory(const String & path) const override;
DirectoryIteratorPtr iterateDirectory(const String & /* path */) override;
DirectoryIteratorPtr iterateDirectory(const String & /* path */) const override;
Poco::Timestamp getLastModified(const String &) override { return Poco::Timestamp{}; }
Poco::Timestamp getLastModified(const String &) const override { return Poco::Timestamp{}; }
time_t getLastChanged(const String &) const override { return {}; }
/// Write and modification part

View File

@ -16,7 +16,7 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
bool IDisk::isDirectoryEmpty(const String & path)
bool IDisk::isDirectoryEmpty(const String & path) const
{
return !iterateDirectory(path)->isValid();
}

View File

@ -138,10 +138,10 @@ public:
virtual void moveDirectory(const String & from_path, const String & to_path) = 0;
/// Return iterator to the contents of the specified directory.
virtual DirectoryIteratorPtr iterateDirectory(const String & path) = 0;
virtual DirectoryIteratorPtr iterateDirectory(const String & path) const = 0;
/// Return `true` if the specified directory is empty.
bool isDirectoryEmpty(const String & path);
bool isDirectoryEmpty(const String & path) const;
/// Create empty file at `path`.
virtual void createFile(const String & path) = 0;
@ -164,7 +164,7 @@ public:
virtual void copyFile(const String & from_file_path, IDisk & to_disk, const String & to_file_path);
/// List files at `path` and add their names to `file_names`
virtual void listFiles(const String & path, std::vector<String> & file_names) = 0;
virtual void listFiles(const String & path, std::vector<String> & file_names) const = 0;
/// Open the file for read and return ReadBufferFromFileBase object.
virtual std::unique_ptr<ReadBufferFromFileBase> readFile( /// NOLINT
@ -259,7 +259,11 @@ public:
virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0;
/// Get last modified time of file or directory at `path`.
virtual Poco::Timestamp getLastModified(const String & path) = 0;
virtual Poco::Timestamp getLastModified(const String & path) const = 0;
/// Get last changed time of file or directory at `path`.
/// Meaning is the same as stat.st_ctime (i.e. it can differ from getLastModified()).
virtual time_t getLastChanged(const String & path) const = 0;
/// Set file at `path` as read-only.
virtual void setReadOnly(const String & path) = 0;

View File

@ -55,6 +55,7 @@ CachedReadBufferFromRemoteFS::CachedReadBufferFromRemoteFS(
, query_id(query_id_)
, enable_logging(!query_id.empty() && settings_.enable_filesystem_cache_log)
, current_buffer_id(getRandomASCIIString(8))
, query_context_holder(cache_->getQueryContextHolder(query_id, settings_))
{
}

View File

@ -123,6 +123,8 @@ private:
CurrentMetrics::Increment metric_increment{CurrentMetrics::FilesystemCacheReadBuffers};
ProfileEvents::Counters current_file_segment_counters;
IFileCache::QueryContextHolder query_context_holder;
};
}

View File

@ -350,13 +350,13 @@ void DiskObjectStorage::removeDirectory(const String & path)
}
DirectoryIteratorPtr DiskObjectStorage::iterateDirectory(const String & path)
DirectoryIteratorPtr DiskObjectStorage::iterateDirectory(const String & path) const
{
return metadata_storage->iterateDirectory(path);
}
void DiskObjectStorage::listFiles(const String & path, std::vector<String> & file_names)
void DiskObjectStorage::listFiles(const String & path, std::vector<String> & file_names) const
{
for (auto it = iterateDirectory(path); it->isValid(); it->next())
file_names.push_back(it->name());
@ -371,11 +371,16 @@ void DiskObjectStorage::setLastModified(const String & path, const Poco::Timesta
}
Poco::Timestamp DiskObjectStorage::getLastModified(const String & path)
Poco::Timestamp DiskObjectStorage::getLastModified(const String & path) const
{
return metadata_storage->getLastModified(path);
}
time_t DiskObjectStorage::getLastChanged(const String & path) const
{
return metadata_storage->getLastChanged(path);
}
void DiskObjectStorage::removeMetadata(const String & path, std::vector<String> & paths_to_remove)
{
LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_storage->getPath() + path));

View File

@ -108,7 +108,7 @@ public:
void createHardLink(const String & src_path, const String & dst_path) override;
void createHardLink(const String & src_path, const String & dst_path, bool should_send_metadata);
void listFiles(const String & path, std::vector<String> & file_names) override;
void listFiles(const String & path, std::vector<String> & file_names) const override;
void setReadOnly(const String & path) override;
@ -124,11 +124,13 @@ public:
void removeDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) override;
DirectoryIteratorPtr iterateDirectory(const String & path) const override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
Poco::Timestamp getLastModified(const String & path) const override;
time_t getLastChanged(const String & path) const override;
bool isRemote() const override { return true; }

View File

@ -72,7 +72,7 @@ void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const
{
writeIntText(VERSION_RELATIVE_PATHS, buf);
writeIntText(VERSION_READ_ONLY_FLAG, buf);
writeChar('\n', buf);
writeIntText(remote_fs_objects.size(), buf);

View File

@ -15,16 +15,6 @@
namespace DB
{
struct IMetadataOperation
{
virtual void execute() = 0;
virtual void undo() = 0;
virtual void finalize() {}
virtual ~IMetadataOperation() = default;
};
using MetadataOperationPtr = std::unique_ptr<IMetadataOperation>;
class IMetadataStorage;
/// Tries to provide some "transactions" interface, which allow
@ -111,9 +101,11 @@ public:
virtual Poco::Timestamp getLastModified(const std::string & path) const = 0;
virtual time_t getLastChanged(const std::string & path) const = 0;
virtual std::vector<std::string> listDirectory(const std::string & path) const = 0;
virtual DirectoryIteratorPtr iterateDirectory(const std::string & path) = 0;
virtual DirectoryIteratorPtr iterateDirectory(const std::string & path) const = 0;
virtual uint32_t getHardlinkCount(const std::string & path) const = 0;

View File

@ -471,6 +471,11 @@ Poco::Timestamp MetadataStorageFromDisk::getLastModified(const std::string & pat
return disk->getLastModified(path);
}
time_t MetadataStorageFromDisk::getLastChanged(const std::string & path) const
{
return disk->getLastChanged(path);
}
uint64_t MetadataStorageFromDisk::getFileSize(const String & path) const
{
auto metadata = readMetadata(path);
@ -484,7 +489,7 @@ std::vector<std::string> MetadataStorageFromDisk::listDirectory(const std::strin
return result_files;
}
DirectoryIteratorPtr MetadataStorageFromDisk::iterateDirectory(const std::string & path)
DirectoryIteratorPtr MetadataStorageFromDisk::iterateDirectory(const std::string & path) const
{
return disk->iterateDirectory(path);
}

View File

@ -8,6 +8,17 @@
namespace DB
{
struct IMetadataOperation
{
virtual void execute() = 0;
virtual void undo() = 0;
virtual void finalize() {}
virtual ~IMetadataOperation() = default;
};
using MetadataOperationPtr = std::unique_ptr<IMetadataOperation>;
enum class MetadataFromDiskTransactionState
{
PREPARING,
@ -48,9 +59,11 @@ public:
Poco::Timestamp getLastModified(const std::string & path) const override;
time_t getLastChanged(const std::string & path) const override;
std::vector<std::string> listDirectory(const std::string & path) const override;
DirectoryIteratorPtr iterateDirectory(const std::string & path) override;
DirectoryIteratorPtr iterateDirectory(const std::string & path) const override;
std::string readFileToString(const std::string & path) const override;

View File

@ -0,0 +1,15 @@
#include <Disks/ObjectStorages/S3/S3Capabilities.h>
namespace DB
{
S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
{
return S3Capabilities
{
.support_batch_delete = config.getBool(config_prefix + ".support_batch_delete", true),
.support_proxy = config.getBool(config_prefix + ".support_proxy", config.has(config_prefix + ".proxy")),
};
}
}

View File

@ -0,0 +1,27 @@
#pragma once
#include <string>
#include <Poco/Util/AbstractConfiguration.h>
namespace DB
{
/// Supported/unsupported features by different S3 implementations
/// Only useful for implementations that are almost compatible with AWS S3.
struct S3Capabilities
{
/// Google S3 implementation doesn't support batch delete
/// TODO: possibly we have to use Google SDK https://github.com/googleapis/google-cloud-cpp/tree/main/google/cloud/storage
/// because it looks like it is missing a lot of features, such as:
/// 1) batch delete
/// 2) list_v2
/// 3) multipart upload works differently
bool support_batch_delete{true};
/// Y.Cloud S3 implementation supports proxy for connections
bool support_proxy{false};
};
S3Capabilities getCapabilitiesFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
}
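For illustration only (not part of this diff): a GCS-like endpoint would typically be configured with `support_batch_delete = false`, which makes the object storage fall back to single `DeleteObject` requests, as shown in the `S3ObjectStorage` changes below.

```cpp
// Hypothetical capabilities for a GCS-like endpoint; the field names come from the struct above.
S3Capabilities gcs_like
{
    .support_batch_delete = false,  // GCS has no DeleteObjects (batch delete)
    .support_proxy = false,
};
```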

View File

@ -17,6 +17,7 @@
#include <aws/s3/model/CopyObjectRequest.h>
#include <aws/s3/model/ListObjectsV2Request.h>
#include <aws/s3/model/HeadObjectRequest.h>
#include <aws/s3/model/DeleteObjectRequest.h>
#include <aws/s3/model/DeleteObjectsRequest.h>
#include <aws/s3/model/CreateMultipartUploadRequest.h>
#include <aws/s3/model/CompleteMultipartUploadRequest.h>
@ -213,18 +214,34 @@ void S3ObjectStorage::listPrefix(const std::string & path, BlobsPathToSize & chi
void S3ObjectStorage::removeObject(const std::string & path)
{
auto client_ptr = client.get();
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(path);
auto settings_ptr = s3_settings.get();
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects({obj});
// If batch delete is not supported, fall back to single DeleteObject requests.
// This allows us to work with GCS, which doesn't support DeleteObjects.
if (!s3_capabilities.support_batch_delete)
{
Aws::S3::Model::DeleteObjectRequest request;
request.SetBucket(bucket);
request.SetKey(path);
auto outcome = client_ptr->DeleteObject(request);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
else
{
/// TODO: For AWS we prefer to use multiobject operation even for single object
/// maybe we shouldn't?
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(path);
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects({obj});
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
throwIfError(outcome);
}
}
void S3ObjectStorage::removeObjects(const std::vector<std::string> & paths)
@ -235,31 +252,39 @@ void S3ObjectStorage::removeObjects(const std::vector<std::string> & paths)
auto client_ptr = client.get();
auto settings_ptr = s3_settings.get();
size_t chunk_size_limit = settings_ptr->objects_chunk_size_to_delete;
size_t current_position = 0;
while (current_position < paths.size())
if (!s3_capabilities.support_batch_delete)
{
std::vector<Aws::S3::Model::ObjectIdentifier> current_chunk;
String keys;
for (; current_position < paths.size() && current_chunk.size() < chunk_size_limit; ++current_position)
for (const auto & path : paths)
removeObject(path);
}
else
{
size_t chunk_size_limit = settings_ptr->objects_chunk_size_to_delete;
size_t current_position = 0;
while (current_position < paths.size())
{
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(paths[current_position]);
current_chunk.push_back(obj);
std::vector<Aws::S3::Model::ObjectIdentifier> current_chunk;
String keys;
for (; current_position < paths.size() && current_chunk.size() < chunk_size_limit; ++current_position)
{
Aws::S3::Model::ObjectIdentifier obj;
obj.SetKey(paths[current_position]);
current_chunk.push_back(obj);
if (!keys.empty())
keys += ", ";
keys += paths[current_position];
if (!keys.empty())
keys += ", ";
keys += paths[current_position];
}
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects(current_chunk);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
Aws::S3::Model::Delete delkeys;
delkeys.SetObjects(current_chunk);
Aws::S3::Model::DeleteObjectsRequest request;
request.SetBucket(bucket);
request.SetDelete(delkeys);
auto outcome = client_ptr->DeleteObjects(request);
throwIfError(outcome);
}
}
@ -493,7 +518,7 @@ std::unique_ptr<IObjectStorage> S3ObjectStorage::cloneObjectStorage(const std::s
return std::make_unique<S3ObjectStorage>(
nullptr, getClient(config, config_prefix, context),
getSettings(config, config_prefix, context),
version_id, new_namespace);
version_id, s3_capabilities, new_namespace);
}
}

View File

@ -5,6 +5,7 @@
#if USE_AWS_S3
#include <Disks/ObjectStorages/IObjectStorage.h>
#include <Disks/ObjectStorages/S3/S3Capabilities.h>
#include <memory>
#include <aws/s3/S3Client.h>
#include <aws/s3/model/HeadObjectResult.h>
@ -46,11 +47,13 @@ public:
std::unique_ptr<Aws::S3::S3Client> && client_,
std::unique_ptr<S3ObjectStorageSettings> && s3_settings_,
String version_id_,
const S3Capabilities & s3_capabilities_,
String bucket_)
: IObjectStorage(std::move(cache_))
, bucket(bucket_)
, client(std::move(client_))
, s3_settings(std::move(s3_settings_))
, s3_capabilities(s3_capabilities_)
, version_id(std::move(version_id_))
{}
@ -129,6 +132,7 @@ private:
MultiVersion<Aws::S3::S3Client> client;
MultiVersion<S3ObjectStorageSettings> s3_settings;
const S3Capabilities s3_capabilities;
const String version_id;
};

View File

@ -89,11 +89,12 @@ void registerDiskS3(DiskFactory & factory)
auto metadata_storage = std::make_shared<MetadataStorageFromDisk>(metadata_disk, uri.key);
FileCachePtr cache = getCachePtrForDisk(name, config, config_prefix, context);
S3Capabilities s3_capabilities = getCapabilitiesFromConfig(config, config_prefix);
ObjectStoragePtr s3_storage = std::make_unique<S3ObjectStorage>(
std::move(cache), getClient(config, config_prefix, context),
getSettings(config, config_prefix, context),
uri.version_id, uri.bucket);
uri.version_id, s3_capabilities, uri.bucket);
bool send_metadata = config.getBool(config_prefix + ".send_metadata", false);
uint64_t copy_thread_pool_size = config.getUInt(config_prefix + ".thread_pool_size", 16);

View File

@ -82,6 +82,9 @@ struct ReadSettings
bool read_from_filesystem_cache_if_exists_otherwise_bypass_cache = false;
bool enable_filesystem_cache_log = false;
size_t max_query_cache_size = (128UL * 1024 * 1024 * 1024);
bool skip_download_if_exceeds_query_cache = true;
size_t remote_read_min_bytes_for_seek = DBMS_DEFAULT_BUFFER_SIZE;
FileCachePtr remote_fs_cache;

View File

@ -730,19 +730,9 @@ namespace S3
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
/// Extract object version ID from query string.
{
version_id = "";
const String version_key = "versionId=";
const auto query_string = uri.getQuery();
auto start = query_string.rfind(version_key);
if (start != std::string::npos)
{
start += version_key.length();
auto end = query_string.find_first_of('&', start);
version_id = query_string.substr(start, end == std::string::npos ? std::string::npos : end - start);
}
}
for (const auto & [query_key, query_value] : uri.getQueryParameters())
if (query_key == "versionId")
version_id = query_value;
String name;
String endpoint_authority_from_uri;
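A small standalone sketch of the new version ID extraction (hypothetical URL, not from this diff), relying on `Poco::URI::getQueryParameters()` instead of manually searching the query string:

```cpp
#include <Poco/URI.h>
#include <string>

int main()
{
    Poco::URI uri("https://bucket.s3.amazonaws.com/key?versionId=abc123");
    std::string version_id;
    for (const auto & [query_key, query_value] : uri.getQueryParameters())
        if (query_key == "versionId")
            version_id = query_value;   // "abc123"
}
```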

View File

@ -3419,6 +3419,9 @@ ReadSettings Context::getReadSettings() const
res.read_from_filesystem_cache_if_exists_otherwise_bypass_cache = settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache;
res.enable_filesystem_cache_log = settings.enable_filesystem_cache_log;
res.max_query_cache_size = settings.max_query_cache_size;
res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache;
res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek;
res.local_fs_buffer_size = settings.max_read_buffer_size;

View File

@ -43,6 +43,7 @@
#include <Disks/DiskRestartProxy.h>
#include <Storages/StorageDistributed.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/Freeze.h>
#include <Storages/StorageFactory.h>
#include <Parsers/ASTSystemQuery.h>
#include <Parsers/ASTDropQuery.h>
@ -235,6 +236,8 @@ BlockIO InterpreterSystemQuery::execute()
}
BlockIO result;
volume_ptr = {};
if (!query.storage_policy.empty() && !query.volume.empty())
volume_ptr = getContext()->getStoragePolicy(query.storage_policy)->getVolumeByName(query.volume);
@ -493,11 +496,18 @@ BlockIO InterpreterSystemQuery::execute()
getContext()->checkAccess(AccessType::SYSTEM_THREAD_FUZZER);
ThreadFuzzer::start();
break;
case Type::UNFREEZE:
{
getContext()->checkAccess(AccessType::SYSTEM_UNFREEZE);
/// The result contains information about deleted parts as a table. It is for compatibility with ALTER TABLE UNFREEZE query.
result = Unfreezer().unfreeze(query.backup_name, getContext());
break;
}
default:
throw Exception("Unknown type of SYSTEM query", ErrorCodes::BAD_ARGUMENTS);
}
return BlockIO();
return result;
}
void InterpreterSystemQuery::restoreReplica()
@ -968,6 +978,11 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_RESTART_DISK);
break;
}
case Type::UNFREEZE:
{
required_access.emplace_back(AccessType::SYSTEM_UNFREEZE);
break;
}
case Type::STOP_LISTEN_QUERIES:
case Type::START_LISTEN_QUERIES:
case Type::STOP_THREAD_FUZZER:

View File

@ -201,7 +201,7 @@ void TransactionLog::loadLogFromZooKeeper()
/// 3. support 64-bit CSNs on top of Apache ZooKeeper (it uses Int32 for sequential numbers)
Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event);
chassert(!entries_list.empty());
std::sort(entries_list.begin(), entries_list.end());
::sort(entries_list.begin(), entries_list.end());
loadEntries(entries_list.begin(), entries_list.end());
chassert(!last_loaded_entry.empty());
chassert(latest_snapshot == deserializeCSN(last_loaded_entry));
@ -262,7 +262,7 @@ void TransactionLog::loadNewEntries()
{
Strings entries_list = zookeeper->getChildren(zookeeper_path_log, nullptr, log_updated_event);
chassert(!entries_list.empty());
std::sort(entries_list.begin(), entries_list.end());
::sort(entries_list.begin(), entries_list.end());
auto it = std::upper_bound(entries_list.begin(), entries_list.end(), last_loaded_entry);
loadEntries(it, entries_list.end());
chassert(last_loaded_entry == entries_list.back());
@ -602,7 +602,7 @@ void TransactionLog::sync() const
{
Strings entries_list = zookeeper->getChildren(zookeeper_path_log);
chassert(!entries_list.empty());
std::sort(entries_list.begin(), entries_list.end());
::sort(entries_list.begin(), entries_list.end());
CSN newest_csn = deserializeCSN(entries_list.back());
waitForCSNLoaded(newest_csn);
}

View File

@ -469,7 +469,7 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const
for (const auto & name : required_result_columns)
name_pos[name] = pos++;
}
std::sort(elements.begin(), elements.end(), [&](const auto & lhs, const auto & rhs)
::sort(elements.begin(), elements.end(), [&](const auto & lhs, const auto & rhs)
{
String lhs_name = lhs->getAliasOrColumnName();
String rhs_name = rhs->getAliasOrColumnName();

View File

@ -66,6 +66,7 @@ public:
START_DISTRIBUTED_SENDS,
START_THREAD_FUZZER,
STOP_THREAD_FUZZER,
UNFREEZE,
END
};
@ -93,6 +94,7 @@ public:
UInt64 seconds{};
String filesystem_cache_path;
String backup_name;
String getID(char) const override { return "SYSTEM query"; }

View File

@ -363,6 +363,20 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
break;
}
case Type::UNFREEZE:
{
ASTPtr ast;
if (ParserKeyword{"WITH NAME"}.ignore(pos, expected) && ParserStringLiteral{}.parse(pos, ast, expected))
{
res->backup_name = ast->as<ASTLiteral &>().value.get<const String &>();
}
else
{
return false;
}
break;
}
default:
{
parseQueryWithOnCluster(res, pos, expected);

View File

@ -38,7 +38,9 @@ MergingSortedAlgorithm::MergingSortedAlgorithm(
sort_description_types.emplace_back(header.getByName(column_description.column_name).type);
}
compileSortDescriptionIfNeeded(description, sort_description_types, true /*increase_compile_attemps*/);
queue_variants = SortQueueVariants(sort_description_types, description);
if (queue_variants.variantSupportJITCompilation())
compileSortDescriptionIfNeeded(description, sort_description_types, true /*increase_compile_attemps*/);
}
void MergingSortedAlgorithm::addInput()
@ -72,12 +74,11 @@ void MergingSortedAlgorithm::initialize(Inputs inputs)
cursors[source_num] = SortCursorImpl(header, chunk.getColumns(), description, source_num);
}
if (has_collation)
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
else if (description.size() == 1)
queue_simple = SortingHeap<SimpleSortCursor>(cursors);
else
queue_without_collation = SortingHeap<SortCursor>(cursors);
queue_variants.callOnVariant([&](auto & queue)
{
using QueueType = std::decay_t<decltype(queue)>;
queue = QueueType(cursors);
});
}
void MergingSortedAlgorithm::consume(Input & input, size_t source_num)
@ -86,22 +87,20 @@ void MergingSortedAlgorithm::consume(Input & input, size_t source_num)
current_inputs[source_num].swap(input);
cursors[source_num].reset(current_inputs[source_num].chunk.getColumns(), header);
if (has_collation)
queue_with_collation.push(cursors[source_num]);
else if (description.size() == 1)
queue_simple.push(cursors[source_num]);
else
queue_without_collation.push(cursors[source_num]);
queue_variants.callOnVariant([&](auto & queue)
{
queue.push(cursors[source_num]);
});
}
IMergingAlgorithm::Status MergingSortedAlgorithm::merge()
{
if (has_collation)
return mergeImpl(queue_with_collation);
else if (description.size() == 1)
return mergeImpl(queue_simple);
else
return mergeImpl(queue_without_collation);
IMergingAlgorithm::Status result = queue_variants.callOnVariant([&](auto & queue)
{
return mergeImpl(queue);
});
return result;
}
template <typename TSortingHeap>

View File

@ -49,9 +49,7 @@ private:
SortCursorImpls cursors;
SortingHeap<SimpleSortCursor> queue_simple;
SortingHeap<SortCursor> queue_without_collation;
SortingHeap<SortCursorWithCollation> queue_with_collation;
SortQueueVariants queue_variants;
Status insertFromChunk(size_t source_num);

View File

@ -1,5 +1,6 @@
#include <Interpreters/ActionsDAG.h>
#include <Processors/QueryPlan/ExpressionStep.h>
#include <Processors/QueryPlan/FillingStep.h>
#include <Processors/QueryPlan/Optimizations/Optimizations.h>
#include <Processors/QueryPlan/SortingStep.h>
#include <Common/Exception.h>
@ -42,6 +43,11 @@ size_t tryExecuteFunctionsAfterSorting(QueryPlan::Node * parent_node, QueryPlan:
if (!sorting_step || !expression_step)
return 0;
// Filling step position should be preserved
if (!child_node->children.empty())
if (typeid_cast<FillingStep *>(child_node->children.front()->step.get()))
return 0;
NameSet sort_columns;
for (const auto & col : sorting_step->getSortDescription())
sort_columns.insert(col.column_name);

View File

@ -1,9 +1,10 @@
#pragma once
#include <Processors/IAccumulatingTransform.h>
#include <Interpreters/Aggregator.h>
#include <IO/ReadBufferFromFile.h>
#include <Compression/CompressedReadBuffer.h>
#include <IO/ReadBufferFromFile.h>
#include <Interpreters/Aggregator.h>
#include <Processors/IAccumulatingTransform.h>
#include <Common/Stopwatch.h>
#include <Common/setThreadName.h>
namespace DB
{
@ -70,6 +71,46 @@ struct ManyAggregatedData
for (auto & mut : mutexes)
mut = std::make_unique<std::mutex>();
}
~ManyAggregatedData()
{
try
{
if (variants.size() <= 1)
return;
// Aggregation states destruction may be very time-consuming.
// In the case of a query with LIMIT, most states won't be destroyed during conversion to blocks.
// Without the following code, they would be destroyed in the destructor of AggregatedDataVariants in the current thread (i.e. sequentially).
const auto pool = std::make_unique<ThreadPool>(variants.size());
for (auto && variant : variants)
{
if (variant->size() < 100'000) // some seemingly reasonable constant
continue;
// It doesn't make sense to spawn a thread if the variant is not going to actually destroy anything.
if (variant->aggregator)
{
// variant is moved here and will be destroyed in the destructor of the lambda function.
pool->trySchedule(
[variant = std::move(variant), thread_group = CurrentThread::getGroup()]()
{
if (thread_group)
CurrentThread::attachToIfDetached(thread_group);
setThreadName("AggregDestruct");
});
}
}
pool->wait();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
};
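A minimal standalone sketch of the idiom used in the destructor above (hypothetical `HeavyState` type, plain std::thread instead of ClickHouse's ThreadPool): moving each state into a task's capture makes its destructor run on a worker thread rather than sequentially in the caller.

```cpp
#include <memory>
#include <thread>
#include <vector>

struct HeavyState { ~HeavyState() { /* expensive cleanup */ } };

int main()
{
    std::vector<std::shared_ptr<HeavyState>> variants(4);
    for (auto & v : variants)
        v = std::make_shared<HeavyState>();

    std::vector<std::thread> workers;
    for (auto & v : variants)
        workers.emplace_back([v = std::move(v)]() mutable { v.reset(); });  // destruction happens here, in parallel
    for (auto & w : workers)
        w.join();
}
```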
using AggregatingTransformParamsPtr = std::shared_ptr<AggregatingTransformParams>;

View File

@ -23,7 +23,7 @@ namespace ErrorCodes
}
MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription & description_, size_t max_merged_block_size_, UInt64 limit_)
: chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_)
: chunks(std::move(chunks_)), description(description_), max_merged_block_size(max_merged_block_size_), limit(limit_), queue_variants(header, description)
{
Chunks nonempty_chunks;
for (auto & chunk : chunks)
@ -44,12 +44,11 @@ MergeSorter::MergeSorter(const Block & header, Chunks chunks_, SortDescription &
chunks.swap(nonempty_chunks);
if (has_collation)
queue_with_collation = SortingHeap<SortCursorWithCollation>(cursors);
else if (description.size() > 1)
queue_without_collation = SortingHeap<SortCursor>(cursors);
else
queue_simple = SortingHeap<SimpleSortCursor>(cursors);
queue_variants.callOnVariant([&](auto & queue)
{
using QueueType = std::decay_t<decltype(queue)>;
queue = QueueType(cursors);
});
}
@ -65,12 +64,12 @@ Chunk MergeSorter::read()
return res;
}
if (has_collation)
return mergeImpl(queue_with_collation);
else if (description.size() > 1)
return mergeImpl(queue_without_collation);
else
return mergeImpl(queue_simple);
Chunk result = queue_variants.callOnVariant([&](auto & queue)
{
return mergeImpl(queue);
});
return result;
}
@ -175,7 +174,8 @@ SortingTransform::SortingTransform(
description.swap(description_without_constants);
compileSortDescriptionIfNeeded(description, sort_description_types, increase_sort_description_compile_attempts /*increase_compile_attemps*/);
if (SortQueueVariants(sort_description_types, description).variantSupportJITCompilation())
compileSortDescriptionIfNeeded(description, sort_description_types, increase_sort_description_compile_attempts /*increase_compile_attemps*/);
}
SortingTransform::~SortingTransform() = default;

View File

@ -24,16 +24,13 @@ private:
SortDescription description;
size_t max_merged_block_size;
UInt64 limit;
SortQueueVariants queue_variants;
size_t total_merged_rows = 0;
SortCursorImpls cursors;
bool has_collation = false;
SortingHeap<SortCursor> queue_without_collation;
SortingHeap<SimpleSortCursor> queue_simple;
SortingHeap<SortCursorWithCollation> queue_with_collation;
/** Two different cursors are supported - with and without Collation.
* Templates are used (instead of virtual functions in SortCursor) for zero-overhead.
*/

View File

@ -13,6 +13,7 @@
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/getLeastSupertype.h>
#include <DataTypes/DataTypeLowCardinality.h>
#include <DataTypes/DataTypeInterval.h>
#include <Interpreters/ExpressionActions.h>
#include <Interpreters/convertFieldToType.h>
@ -27,6 +28,7 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int NOT_IMPLEMENTED;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
// Interface for true window functions. It's not much of an interface, they just
@ -2200,6 +2202,109 @@ struct WindowFunctionNthValue final : public WindowFunction
}
};
struct NonNegativeDerivativeState
{
Float64 previous_metric = 0;
Float64 previous_timestamp = 0;
};
// nonNegativeDerivative(metric_column, timestamp_column[, INTERVAL 1 SECOND])
struct WindowFunctionNonNegativeDerivative final : public StatefulWindowFunction<NonNegativeDerivativeState>
{
static constexpr size_t ARGUMENT_METRIC = 0;
static constexpr size_t ARGUMENT_TIMESTAMP = 1;
static constexpr size_t ARGUMENT_INTERVAL = 2;
WindowFunctionNonNegativeDerivative(const std::string & name_,
const DataTypes & argument_types_, const Array & parameters_)
: StatefulWindowFunction(name_, argument_types_, parameters_)
{
if (!parameters.empty())
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} cannot be parameterized", name_);
}
if (argument_types.size() != 2 && argument_types.size() != 3)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Function {} takes 2 or 3 arguments", name_);
}
if (!isNumber(argument_types[ARGUMENT_METRIC]))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Argument {} must be a number, '{}' given",
ARGUMENT_METRIC,
argument_types[ARGUMENT_METRIC]->getName());
}
if (!isDateTime(argument_types[ARGUMENT_TIMESTAMP]) && !isDateTime64(argument_types[ARGUMENT_TIMESTAMP]))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Argument {} must be DateTime or DateTime64, '{}' given",
ARGUMENT_TIMESTAMP,
argument_types[ARGUMENT_TIMESTAMP]->getName());
}
if (argument_types.size() == 3)
{
const DataTypeInterval * interval_datatype = checkAndGetDataType<DataTypeInterval>(argument_types[ARGUMENT_INTERVAL].get());
if (!interval_datatype)
{
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Argument {} must be an INTERVAL, '{}' given",
ARGUMENT_INTERVAL,
argument_types[ARGUMENT_INTERVAL]->getName());
}
if (!interval_datatype->getKind().isFixedLength())
{
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"The INTERVAL must be a week or shorter, '{}' given",
argument_types[ARGUMENT_INTERVAL]->getName());
}
interval_length = interval_datatype->getKind().toAvgSeconds();
interval_specified = true;
}
}
DataTypePtr getReturnType() const override { return argument_types[0]; }
bool allocatesMemoryInArena() const override { return false; }
void windowInsertResultInto(const WindowTransform * transform,
size_t function_index) override
{
const auto & current_block = transform->blockAt(transform->current_row);
const auto & workspace = transform->workspaces[function_index];
auto & state = getState(workspace);
auto interval_duration = interval_specified ? interval_length *
(*current_block.input_columns[workspace.argument_column_indices[ARGUMENT_INTERVAL]]).getFloat64(0) : 1;
Float64 last_metric = state.previous_metric;
Float64 last_timestamp = state.previous_timestamp;
Float64 curr_metric = WindowFunctionHelpers::getValue<Float64>(transform, function_index, ARGUMENT_METRIC, transform->current_row);
Float64 curr_timestamp = WindowFunctionHelpers::getValue<Float64>(transform, function_index, ARGUMENT_TIMESTAMP, transform->current_row);
Float64 time_elapsed = curr_timestamp - last_timestamp;
Float64 metric_diff = curr_metric - last_metric;
Float64 result = (time_elapsed != 0) ? (metric_diff / time_elapsed * interval_duration) : 0;
state.previous_metric = curr_metric;
state.previous_timestamp = curr_timestamp;
WindowFunctionHelpers::setValueToOutputColumn<Float64>(transform, function_index, result >= 0 ? result : 0);
}
private:
Float64 interval_length = 1;
bool interval_specified = false;
};
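A worked example of the computation in `windowInsertResultInto` above (hypothetical numbers, not from the diff): the metric grows from 100 to 160 while the timestamp advances 20 seconds and the function was given `INTERVAL 1 SECOND`, so the derivative is 60 / 20 * 1 = 3 per second; a negative result would be clamped to 0.

```cpp
#include <iostream>

using Float64 = double;

int main()
{
    Float64 time_elapsed = 20.0;       // curr_timestamp - last_timestamp
    Float64 metric_diff = 60.0;        // curr_metric - last_metric
    Float64 interval_duration = 1.0;   // INTERVAL 1 SECOND

    Float64 result = (time_elapsed != 0) ? (metric_diff / time_elapsed * interval_duration) : 0;
    std::cout << (result >= 0 ? result : 0) << '\n';   // prints 3
}
```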
void registerWindowFunctions(AggregateFunctionFactory & factory)
{
@ -2299,6 +2404,13 @@ void registerWindowFunctions(AggregateFunctionFactory & factory)
return std::make_shared<WindowFunctionExponentialTimeDecayedAvg>(
name, argument_types, parameters);
}, properties});
factory.registerFunction("nonNegativeDerivative", {[](const std::string & name,
const DataTypes & argument_types, const Array & parameters, const Settings *)
{
return std::make_shared<WindowFunctionNonNegativeDerivative>(
name, argument_types, parameters);
}, properties});
}
}

View File

@ -7,6 +7,7 @@
#include <Storages/Cache/ExternalDataSourceCache.h>
#include <Storages/Cache/RemoteFileMetadataFactory.h>
#include <base/errnoToString.h>
#include <base/sort.h>
#include <Common/logger_useful.h>
#include <base/sleep.h>
#include <Poco/Logger.h>
@ -229,7 +230,7 @@ void ExternalDataSourceCache::initOnce(ContextPtr context, const String & root_d
LOG_INFO(
log, "Initializing local cache for remote data sources. Local cache root path: {}, cache size limit: {}", root_dir_, limit_size_);
splitInto<','>(root_dirs, root_dir_);
std::sort(root_dirs.begin(), root_dirs.end());
::sort(root_dirs.begin(), root_dirs.end());
local_cache_bytes_read_before_flush = bytes_read_before_flush_;
lru_caches = std::make_unique<RemoteFileCacheType>(limit_size_);

src/Storages/Freeze.cpp (new file, 201 lines)

@ -0,0 +1,201 @@
#include <Storages/Freeze.h>
#include <Disks/ObjectStorages/IMetadataStorage.h>
#include <Storages/PartitionCommands.h>
#include <Common/escapeForFileName.h>
#include <Common/logger_useful.h>
namespace DB
{
void FreezeMetaData::fill(const StorageReplicatedMergeTree & storage)
{
is_replicated = storage.supportsReplication();
is_remote = storage.isRemote();
replica_name = storage.getReplicaName();
zookeeper_name = storage.getZooKeeperName();
table_shared_id = storage.getTableSharedID();
}
void FreezeMetaData::save(DiskPtr data_disk, const String & path) const
{
auto metadata_storage = data_disk->getMetadataStorage();
auto file_path = getFileName(path);
auto tx = metadata_storage->createTransaction();
WriteBufferFromOwnString buffer;
writeIntText(version, buffer);
buffer.write("\n", 1);
writeBoolText(is_replicated, buffer);
buffer.write("\n", 1);
writeBoolText(is_remote, buffer);
buffer.write("\n", 1);
writeString(replica_name, buffer);
buffer.write("\n", 1);
writeString(zookeeper_name, buffer);
buffer.write("\n", 1);
writeString(table_shared_id, buffer);
buffer.write("\n", 1);
tx->writeStringToFile(file_path, buffer.str());
tx->commit();
}
bool FreezeMetaData::load(DiskPtr data_disk, const String & path)
{
auto metadata_storage = data_disk->getMetadataStorage();
auto file_path = getFileName(path);
if (!metadata_storage->exists(file_path))
return false;
auto metadata_str = metadata_storage->readFileToString(file_path);
ReadBufferFromString buffer(metadata_str);
readIntText(version, buffer);
if (version != 1)
{
LOG_ERROR(&Poco::Logger::get("FreezeMetaData"), "Unknown frozen metadata version: {}", version);
return false;
}
DB::assertChar('\n', buffer);
readBoolText(is_replicated, buffer);
DB::assertChar('\n', buffer);
readBoolText(is_remote, buffer);
DB::assertChar('\n', buffer);
readString(replica_name, buffer);
DB::assertChar('\n', buffer);
readString(zookeeper_name, buffer);
DB::assertChar('\n', buffer);
readString(table_shared_id, buffer);
DB::assertChar('\n', buffer);
return true;
}
void FreezeMetaData::clean(DiskPtr data_disk, const String & path)
{
auto metadata_storage = data_disk->getMetadataStorage();
auto fname = getFileName(path);
if (metadata_storage->exists(fname))
{
auto tx = metadata_storage->createTransaction();
tx->unlinkFile(fname);
tx->commit();
}
}
String FreezeMetaData::getFileName(const String & path)
{
return fs::path(path) / "frozen_metadata.txt";
}
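
The save/load pair above defines a simple line-oriented layout for frozen_metadata.txt: a version number followed by the two replication flags and three identifiers, one field per line. A rough standard-library-only sketch of the same round trip (struct name and field values are illustrative; the real code writes through the metadata storage transaction as shown):

```cpp
#include <iostream>
#include <sstream>
#include <string>

/// Illustrative mirror of the FreezeMetaData on-disk layout (version 1):
/// six newline-separated fields, in the order written by save().
struct FreezeMetaDataSketch
{
    int version = 1;
    bool is_replicated = false;
    bool is_remote = false;
    std::string replica_name;
    std::string zookeeper_name;
    std::string table_shared_id;

    std::string save() const
    {
        std::ostringstream out;
        out << version << '\n' << is_replicated << '\n' << is_remote << '\n'
            << replica_name << '\n' << zookeeper_name << '\n' << table_shared_id << '\n';
        return out.str();
    }

    bool load(const std::string & text)
    {
        std::istringstream in(text);
        in >> version;
        if (version != 1)
            return false;
        in >> is_replicated >> is_remote;
        in.ignore(); /// skip the newline before the free-form string fields
        std::getline(in, replica_name);
        std::getline(in, zookeeper_name);
        std::getline(in, table_shared_id);
        return static_cast<bool>(in);
    }
};

int main()
{
    FreezeMetaDataSketch original{1, true, true, "replica_1", "default", "table-shared-id"};
    FreezeMetaDataSketch restored;
    std::cout << restored.load(original.save()) << ' ' << restored.replica_name << '\n'; /// 1 replica_1
}
```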
BlockIO Unfreezer::unfreeze(const String & backup_name, ContextPtr local_context)
{
LOG_DEBUG(log, "Unfreezing backup {}", backup_name);
auto disks_map = local_context->getDisksMap();
Disks disks;
for (auto & [name, disk]: disks_map)
{
disks.push_back(disk);
}
auto backup_path = fs::path(backup_directory_prefix) / escapeForFileName(backup_name);
auto store_path = backup_path / "store";
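/// Expected on-disk layout (an assumption based on the iteration below): shadow/<backup_name>/store/<prefix>/<table directory>/<part directories>.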
PartitionCommandsResultInfo result_info;
for (const auto & disk: disks)
{
if (!disk->exists(store_path))
continue;
for (auto prefix_it = disk->iterateDirectory(store_path); prefix_it->isValid(); prefix_it->next())
{
auto prefix_directory = store_path / prefix_it->name();
for (auto table_it = disk->iterateDirectory(prefix_directory); table_it->isValid(); table_it->next())
{
auto table_directory = prefix_directory / table_it->name();
auto current_result_info = unfreezePartitionsFromTableDirectory([] (const String &) { return true; }, backup_name, {disk}, table_directory, local_context);
for (auto & command_result : current_result_info)
{
command_result.command_type = "SYSTEM UNFREEZE";
}
result_info.insert(
result_info.end(),
std::make_move_iterator(current_result_info.begin()),
std::make_move_iterator(current_result_info.end()));
}
}
if (disk->exists(backup_path))
{
disk->removeRecursive(backup_path);
}
}
BlockIO result;
if (!result_info.empty())
{
result.pipeline = QueryPipeline(convertCommandsResultToSource(result_info));
}
return result;
}
bool Unfreezer::removeFreezedPart(DiskPtr disk, const String & path, const String & part_name, ContextPtr local_context)
{
if (disk->supportZeroCopyReplication())
{
FreezeMetaData meta;
if (meta.load(disk, path))
{
if (meta.is_replicated)
{
FreezeMetaData::clean(disk, path);
return StorageReplicatedMergeTree::removeSharedDetachedPart(disk, path, part_name, meta.table_shared_id, meta.zookeeper_name, meta.replica_name, "", local_context);
}
}
}
disk->removeRecursive(path);
return false;
}
PartitionCommandsResultInfo Unfreezer::unfreezePartitionsFromTableDirectory(MergeTreeData::MatcherFn matcher, const String & backup_name, const Disks & disks, const fs::path & table_directory, ContextPtr local_context)
{
PartitionCommandsResultInfo result;
for (const auto & disk : disks)
{
if (!disk->exists(table_directory))
continue;
for (auto it = disk->iterateDirectory(table_directory); it->isValid(); it->next())
{
const auto & partition_directory = it->name();
/// Partition ID is prefix of part directory name: <partition id>_<rest of part directory name>
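/// e.g. a part directory named "202206_1_5_2" (illustrative) has partition ID "202206".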
auto found = partition_directory.find('_');
if (found == std::string::npos)
continue;
auto partition_id = partition_directory.substr(0, found);
if (!matcher(partition_id))
continue;
const auto & path = it->path();
bool keep_shared = removeFreezedPart(disk, path, partition_directory, local_context);
result.push_back(PartitionCommandResultInfo{
.partition_id = partition_id,
.part_name = partition_directory,
.backup_path = disk->getPath() + table_directory.generic_string(),
.part_backup_path = disk->getPath() + path,
.backup_name = backup_name,
});
LOG_DEBUG(log, "Unfreezed part by path {}, keep shared data: {}", disk->getPath() + path, keep_shared);
}
}
LOG_DEBUG(log, "Unfreezed {} parts", result.size());
return result;
}
}

src/Storages/Freeze.h (new file, 45 lines)

@ -0,0 +1,45 @@
#pragma once
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/MergeTree/MergeTreeData.h>
namespace DB
{
/// Special metadata used when freezing a table. Required for zero-copy
/// replication.
struct FreezeMetaData
{
public:
void fill(const StorageReplicatedMergeTree & storage);
void save(DiskPtr data_disk, const String & path) const;
bool load(DiskPtr data_disk, const String & path);
static void clean(DiskPtr data_disk, const String & path);
private:
static String getFileName(const String & path);
public:
int version = 1;
bool is_replicated{false};
bool is_remote{false};
String replica_name;
String zookeeper_name;
String table_shared_id;
};
class Unfreezer
{
public:
PartitionCommandsResultInfo unfreezePartitionsFromTableDirectory(MergeTreeData::MatcherFn matcher, const String & backup_name, const Disks & disks, const fs::path & table_directory, ContextPtr local_context);
BlockIO unfreeze(const String & backup_name, ContextPtr local_context);
private:
Poco::Logger * log = &Poco::Logger::get("Unfreezer");
static constexpr std::string_view backup_directory_prefix = "shadow";
static bool removeFreezedPart(DiskPtr disk, const String & path, const String & part_name, ContextPtr local_context);
};
}

View File

@ -1591,11 +1591,31 @@ void IMergeTreeDataPart::remove() const
* And a race condition can happen that will lead to "File not found" error here.
*/
/// NOTE We rename the part to delete_tmp_<relative_path> instead of delete_tmp_<name> to avoid a race condition
/// when we try to remove two parts with the same name, but different relative paths,
/// for example all_1_2_1 (in Deleting state) and tmp_merge_all_1_2_1 (in Temporary state).
fs::path from = fs::path(storage.relative_data_path) / relative_path;
fs::path to = fs::path(storage.relative_data_path) / ("delete_tmp_" + relative_path);
/// Cut the trailing "/" if it exists (it shouldn't). Otherwise fs::path behaves differently.
fs::path relative_path_without_slash = relative_path.ends_with("/") ? relative_path.substr(0, relative_path.size() - 1) : relative_path;
/// NOTE relative_path can contain not only part name itself, but also some prefix like
/// "moving/all_1_1_1" or "detached/all_2_3_5". We should handle this case more properly.
fs::path to = fs::path(storage.relative_data_path);
if (relative_path_without_slash.has_parent_path())
{
auto parent_path = relative_path_without_slash.parent_path();
if (parent_path == "detached")
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to remove detached part {} with path {} in remove function. It shouldn't happen", name, relative_path);
to /= parent_path / ("delete_tmp_" + std::string{relative_path_without_slash.filename()});
}
else
{
to /= ("delete_tmp_" + std::string{relative_path_without_slash});
}
// TODO directory delete_tmp_<name> is never removed if server crashes before returning from this function
auto disk = volume->getDisk();
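
The new branching above only changes where the delete_tmp_ prefix lands: a bare part name gets the prefix on the whole directory, while a prefixed relative path such as moving/all_1_1_1 keeps its parent directory and the prefix is applied to the final component only (detached/... paths are rejected outright). A small standalone sketch of that mapping with std::filesystem (the helper name is hypothetical and the detached check is omitted):

```cpp
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

/// Illustrative model of the renaming rule above: "delete_tmp_" is applied to
/// the last path component, so any parent directory in relative_path is kept.
fs::path deleteTmpPath(const fs::path & data_path, std::string relative_path)
{
    if (!relative_path.empty() && relative_path.back() == '/')
        relative_path.pop_back(); /// cut the trailing slash, as in the code above

    fs::path relative(relative_path);
    fs::path to = data_path;
    if (relative.has_parent_path())
        to /= relative.parent_path() / ("delete_tmp_" + relative.filename().string());
    else
        to /= "delete_tmp_" + relative_path;
    return to;
}

int main()
{
    std::cout << deleteTmpPath("store/abc", "all_1_2_1").string() << '\n';         /// store/abc/delete_tmp_all_1_2_1
    std::cout << deleteTmpPath("store/abc", "moving/all_1_1_1/").string() << '\n'; /// store/abc/moving/delete_tmp_all_1_1_1
}
```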
