Merge branch 'master' into clickhouse-disks-fixes

commit 2d25daf64d by mergify[bot], 2022-06-20 11:20:59 +00:00 (committed by GitHub)
226 changed files with 5607 additions and 2572 deletions


@@ -13,7 +13,7 @@ on: # yamllint disable-line rule:truthy
jobs:
CherryPick:
runs-on: [self-hosted, style-checker]
runs-on: [self-hosted, style-checker-aarch64]
steps:
- name: Set envs
# https://docs.github.com/en/actions/learn-github-actions/workflow-commands-for-github-actions#multiline-strings


@@ -1,3 +1,4 @@
# Security Policy
## Security Announcements
@@ -7,29 +8,30 @@ Security fixes will be announced by posting them in the [security changelog](htt
The following versions of ClickHouse server are currently being supported with security updates:
| Version | Supported |
| ------- | ------------------ |
| 1.x | :x: |
| 18.x | :x: |
| 19.x | :x: |
| 20.x | :x: |
| 21.1 | :x: |
| 21.2 | :x: |
| 21.3 | :x: |
| 21.4 | :x: |
| 21.5 | :x: |
| 21.6 | :x: |
| 21.7 | :x: |
| 21.8 | ✅ |
| 21.9 | :x: |
| 21.10 | :x: |
| 21.11 | :x: |
| 21.12 | :x: |
| 22.1 | :x: |
| 22.2 | :x: |
| 22.3 | ✅ |
| 22.4 | ✅ |
| 22.5 | ✅ |
| Version | Supported |
|:-|:-|
| 22.6 | ✔️ |
| 22.5 | ✔️ |
| 22.4 | ✔️ |
| 22.3 | ✔️ |
| 22.2 | ❌ |
| 22.1 | ❌ |
| 21.12 | ❌ |
| 21.11 | ❌ |
| 21.10 | ❌ |
| 21.9 | ❌ |
| 21.8 | ✔️ |
| 21.7 | ❌ |
| 21.6 | ❌ |
| 21.5 | ❌ |
| 21.4 | ❌ |
| 21.3 | ❌ |
| 21.2 | ❌ |
| 21.1 | ❌ |
| 20.* | ❌ |
| 19.* | ❌ |
| 18.* | ❌ |
| 1.* | ❌ |
## Reporting a Vulnerability
@@ -57,4 +59,3 @@ As the security issue moves from triage, to identified fix, to release planning
A public disclosure date is negotiated by the ClickHouse maintainers and the bug submitter. We prefer to fully disclose the bug as soon as possible once a user mitigation is available. It is reasonable to delay disclosure when the bug or the fix is not yet fully understood, the solution is not well-tested, or for vendor coordination. The timeframe for disclosure is from immediate (especially if it's already publicly known) to 90 days. For a vulnerability with a straightforward mitigation, we expect report date to disclosure date to be on the order of 7 days.


@@ -260,4 +260,35 @@ TRAP(mq_timedreceive)
TRAP(wordexp)
TRAP(wordfree)
/// C11 threading primitives are not supported by ThreadSanitizer.
/// We should also avoid using them, for compatibility with old libc.
TRAP(thrd_create)
TRAP(thrd_equal)
TRAP(thrd_current)
TRAP(thrd_sleep)
TRAP(thrd_yield)
TRAP(thrd_exit)
TRAP(thrd_detach)
TRAP(thrd_join)
TRAP(mtx_init)
TRAP(mtx_lock)
TRAP(mtx_timedlock)
TRAP(mtx_trylock)
TRAP(mtx_unlock)
TRAP(mtx_destroy)
TRAP(call_once)
TRAP(cnd_init)
TRAP(cnd_signal)
TRAP(cnd_broadcast)
TRAP(cnd_wait)
TRAP(cnd_timedwait)
TRAP(cnd_destroy)
TRAP(tss_create)
TRAP(tss_get)
TRAP(tss_set)
TRAP(tss_delete)
#endif
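Each TRAP(...) above replaces a libc symbol with a stub that aborts at runtime, so any accidental use of these C11 primitives fails loudly instead of misbehaving under ThreadSanitizer or an old libc. A hedged way to confirm that a symbol is overridden in the resulting binary (the invocation is illustrative, not part of this change):

    # A trapped symbol is defined in the binary itself (shown as T by nm)
    # rather than imported from libc:
    nm --defined-only clickhouse | grep -w thrd_create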


@@ -2,11 +2,11 @@
# NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
# only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
SET(VERSION_REVISION 54463)
SET(VERSION_REVISION 54464)
SET(VERSION_MAJOR 22)
SET(VERSION_MINOR 6)
SET(VERSION_MINOR 7)
SET(VERSION_PATCH 1)
SET(VERSION_GITHASH df0cb0620985eb5ec59760cc76f7736e5b6209bb)
SET(VERSION_DESCRIBE v22.6.1.1-testing)
SET(VERSION_STRING 22.6.1.1)
SET(VERSION_GITHASH 7000c4e0033bb9e69050ab8ef73e8e7465f78059)
SET(VERSION_DESCRIBE v22.7.1.1-testing)
SET(VERSION_STRING 22.7.1.1)
# end of autochange


@@ -112,7 +112,7 @@ endif()
# Archiver
if (COMPILER_GCC)
find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-13" "llvm-ar-12" "llvm-ar-11")
find_program (LLVM_AR_PATH NAMES "llvm-ar" "llvm-ar-14" "llvm-ar-13" "llvm-ar-12")
else ()
find_program (LLVM_AR_PATH NAMES "llvm-ar-${COMPILER_VERSION_MAJOR}" "llvm-ar")
endif ()
@@ -126,7 +126,7 @@ message(STATUS "Using archiver: ${CMAKE_AR}")
# Ranlib
if (COMPILER_GCC)
find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-13" "llvm-ranlib-12" "llvm-ranlib-11")
find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib" "llvm-ranlib-14" "llvm-ranlib-13" "llvm-ranlib-12")
else ()
find_program (LLVM_RANLIB_PATH NAMES "llvm-ranlib-${COMPILER_VERSION_MAJOR}" "llvm-ranlib")
endif ()
@@ -140,7 +140,7 @@ message(STATUS "Using ranlib: ${CMAKE_RANLIB}")
# Install Name Tool
if (COMPILER_GCC)
find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool" "llvm-install-name-tool-13" "llvm-install-name-tool-12" "llvm-install-name-tool-11")
find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool" "llvm-install-name-tool-14" "llvm-install-name-tool-13" "llvm-install-name-tool-12")
else ()
find_program (LLVM_INSTALL_NAME_TOOL_PATH NAMES "llvm-install-name-tool-${COMPILER_VERSION_MAJOR}" "llvm-install-name-tool")
endif ()
@@ -154,7 +154,7 @@ message(STATUS "Using install-name-tool: ${CMAKE_INSTALL_NAME_TOOL}")
# Objcopy
if (COMPILER_GCC)
find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-13" "llvm-objcopy-12" "llvm-objcopy-11" "objcopy")
find_program (OBJCOPY_PATH NAMES "llvm-objcopy" "llvm-objcopy-14" "llvm-objcopy-13" "llvm-objcopy-12" "objcopy")
else ()
find_program (OBJCOPY_PATH NAMES "llvm-objcopy-${COMPILER_VERSION_MAJOR}" "llvm-objcopy" "objcopy")
endif ()
@@ -168,7 +168,7 @@ endif ()
# Strip
if (COMPILER_GCC)
find_program (STRIP_PATH NAMES "llvm-strip" "llvm-strip-13" "llvm-strip-12" "llvm-strip-11" "strip")
find_program (STRIP_PATH NAMES "llvm-strip" "llvm-strip-14" "llvm-strip-13" "llvm-strip-12" "strip")
else ()
find_program (STRIP_PATH NAMES "llvm-strip-${COMPILER_VERSION_MAJOR}" "llvm-strip" "strip")
endif ()
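All five tool searches above follow the same pattern: prefer the unsuffixed binary, then fall back through versioned names, and this change simply shifts the accepted window from LLVM 11-13 to 12-14. A quick shell sketch of the resolution order that find_program applies (tool names as in the lists above):

    for t in llvm-ar llvm-ar-14 llvm-ar-13 llvm-ar-12; do
        command -v "$t" && break   # first hit wins, mirroring find_program's search order
    done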

contrib/NuRaft vendored

@@ -1 +1 @@
Subproject commit 24a13f15cf0838b93f3b1beb62ed010dffdb2117
Subproject commit 1334b9ae72576821a698d657d08838861cf33007

contrib/curl vendored

@@ -1 +1 @@
Subproject commit 801bd5138ce31aa0d906fa4e2eabfc599d74e793
Subproject commit 462196e6b4a47f924293a0e26b8e9c23d37ac26f


@@ -84,7 +84,6 @@ set (SRCS
"${LIBRARY_DIR}/lib/gopher.c"
"${LIBRARY_DIR}/lib/idn_win32.c"
"${LIBRARY_DIR}/lib/http_proxy.c"
"${LIBRARY_DIR}/lib/non-ascii.c"
"${LIBRARY_DIR}/lib/asyn-thread.c"
"${LIBRARY_DIR}/lib/curl_gssapi.c"
"${LIBRARY_DIR}/lib/http_ntlm.c"
@@ -93,10 +92,8 @@ set (SRCS
"${LIBRARY_DIR}/lib/curl_sasl.c"
"${LIBRARY_DIR}/lib/rand.c"
"${LIBRARY_DIR}/lib/curl_multibyte.c"
"${LIBRARY_DIR}/lib/hostcheck.c"
"${LIBRARY_DIR}/lib/conncache.c"
"${LIBRARY_DIR}/lib/dotdot.c"
"${LIBRARY_DIR}/lib/x509asn1.c"
"${LIBRARY_DIR}/lib/http2.c"
"${LIBRARY_DIR}/lib/smb.c"
"${LIBRARY_DIR}/lib/curl_endian.c"
@@ -120,6 +117,9 @@ set (SRCS
"${LIBRARY_DIR}/lib/http_aws_sigv4.c"
"${LIBRARY_DIR}/lib/mqtt.c"
"${LIBRARY_DIR}/lib/rename.c"
"${LIBRARY_DIR}/lib/h2h3.c"
"${LIBRARY_DIR}/lib/headers.c"
"${LIBRARY_DIR}/lib/timediff.c"
"${LIBRARY_DIR}/lib/vauth/vauth.c"
"${LIBRARY_DIR}/lib/vauth/cleartext.c"
"${LIBRARY_DIR}/lib/vauth/cram.c"
@@ -142,11 +142,13 @@ set (SRCS
"${LIBRARY_DIR}/lib/vtls/sectransp.c"
"${LIBRARY_DIR}/lib/vtls/gskit.c"
"${LIBRARY_DIR}/lib/vtls/mbedtls.c"
"${LIBRARY_DIR}/lib/vtls/mesalink.c"
"${LIBRARY_DIR}/lib/vtls/bearssl.c"
"${LIBRARY_DIR}/lib/vtls/keylog.c"
"${LIBRARY_DIR}/lib/vtls/x509asn1.c"
"${LIBRARY_DIR}/lib/vtls/hostcheck.c"
"${LIBRARY_DIR}/lib/vquic/ngtcp2.c"
"${LIBRARY_DIR}/lib/vquic/quiche.c"
"${LIBRARY_DIR}/lib/vquic/msh3.c"
"${LIBRARY_DIR}/lib/vssh/libssh2.c"
"${LIBRARY_DIR}/lib/vssh/libssh.c"
)

contrib/librdkafka vendored

@@ -1 +1 @@
Subproject commit b8554f1682062c85ba519eb54ef2f90e02b812cb
Subproject commit 81b413cc1c2a33ad4e96df856b89184efbd6221c


@@ -21,7 +21,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="deb https://packages.clickhouse.com/deb ${REPO_CHANNEL} main"
ARG VERSION=22.5.1.*
ARG VERSION=22.6.1.*
ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
# set non-empty deb_location_url url to create a docker image


@@ -37,38 +37,13 @@ export FASTTEST_DATA
export FASTTEST_OUT
export PATH
server_pid=none
function stop_server
{
if ! kill -0 -- "$server_pid"
then
echo "ClickHouse server pid '$server_pid' is not running"
return 0
fi
for _ in {1..60}
do
if ! pkill -f "clickhouse-server" && ! kill -- "$server_pid" ; then break ; fi
sleep 1
done
if kill -0 -- "$server_pid"
then
pstree -apgT
jobs
echo "Failed to kill the ClickHouse server pid '$server_pid'"
return 1
fi
server_pid=none
}
function start_server
{
set -m # Spawn server in its own process groups
local opts=(
--config-file "$FASTTEST_DATA/config.xml"
--pid-file "$FASTTEST_DATA/clickhouse-server.pid"
--
--path "$FASTTEST_DATA"
--user_files_path "$FASTTEST_DATA/user_files"
@@ -76,40 +51,22 @@ function start_server
--keeper_server.storage_path "$FASTTEST_DATA/coordination"
)
clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" &
server_pid=$!
set +m
if [ "$server_pid" == "0" ]
then
echo "Failed to start ClickHouse server"
# Avoid zero PID because `kill` treats it as our process group PID.
server_pid="none"
return 1
fi
for _ in {1..60}
do
if clickhouse-client --query "select 1" || ! kill -0 -- "$server_pid"
then
for _ in {1..60}; do
if clickhouse-client --query "select 1"; then
break
fi
sleep 1
done
if ! clickhouse-client --query "select 1"
then
if ! clickhouse-client --query "select 1"; then
echo "Failed to wait until ClickHouse server starts."
server_pid="none"
return 1
fi
if ! kill -0 -- "$server_pid"
then
echo "Wrong clickhouse server started: PID '$server_pid' we started is not running, but '$(pgrep -f clickhouse-server)' is running"
server_pid="none"
return 1
fi
local server_pid
server_pid="$(cat "$FASTTEST_DATA/clickhouse-server.pid")"
echo "ClickHouse server pid '$server_pid' started and responded"
}
@@ -254,9 +211,6 @@ function run_tests
clickhouse-server --version
clickhouse-test --help
# Kill the server in case we are running locally and not in docker
stop_server ||:
start_server
set +e
@@ -284,6 +238,8 @@ function run_tests
| ts '%Y-%m-%d %H:%M:%S' \
| tee "$FASTTEST_OUTPUT/test_result.txt"
set -e
clickhouse stop --pid-path "$FASTTEST_DATA"
}
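The hand-rolled stop_server/polling logic above is replaced by `clickhouse stop --pid-path`, which finds the server through the pid file written at startup via --pid-file. A rough bash sketch of what such a pid-file based stop amounts to (timeout and behavior are illustrative, not the actual implementation):

    pid=$(cat "$FASTTEST_DATA/clickhouse-server.pid")
    kill "$pid"                          # request graceful shutdown
    for _ in {1..60}; do                 # wait up to ~60s for the process to exit
        kill -0 "$pid" 2>/dev/null || exit 0
        sleep 1
    done
    kill -9 "$pid"                       # last resort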
case "$stage" in


@@ -125,16 +125,7 @@ function filter_exists_and_template
function stop_server
{
clickhouse-client --query "select elapsed, query from system.processes" ||:
killall clickhouse-server ||:
for _ in {1..10}
do
if ! pgrep -f clickhouse-server
then
break
fi
sleep 1
done
killall -9 clickhouse-server ||:
clickhouse stop
# Debug.
date
@@ -159,10 +150,12 @@ function fuzz
NEW_TESTS_OPT="${NEW_TESTS_OPT:-}"
fi
mkdir -p /var/run/clickhouse-server
# interferes with gdb
export CLICKHOUSE_WATCHDOG_ENABLE=0
# NOTE: we use process substitution here to keep $! as the pid of clickhouse-server
clickhouse-server --config-file db/config.xml -- --path db > >(tail -100000 > server.log) 2>&1 &
clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > >(tail -100000 > server.log) 2>&1 &
server_pid=$!
kill -0 $server_pid
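The NOTE above is the point of this change: in a plain pipeline, $! is the pid of the last stage, not of the server, while process substitution keeps the server as the backgrounded job itself. A minimal illustration (arguments taken from the line above):

    # With a pipe, $! is the pid of the last command in the pipeline (tail):
    clickhouse-server --config-file db/config.xml -- --path db 2>&1 | tail -100000 > server.log &
    echo $!   # pid of tail, not of the server
    # With process substitution, the server is the backgrounded job itself:
    clickhouse-server --config-file db/config.xml -- --path db > >(tail -100000 > server.log) 2>&1 &
    echo $!   # pid of clickhouse-server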


@@ -21,7 +21,7 @@ export NUM_QUERIES=1000
( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPDistinct | tee /test_output/TLPDistinct.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPDistinct.err
( java -jar target/sqlancer-*.jar --num-threads 10 --timeout-seconds $TIMEOUT --num-queries $NUM_QUERIES --username default --password "" clickhouse --oracle TLPAggregate | tee /test_output/TLPAggregate.out ) 3>&1 1>&2 2>&3 | tee /test_output/TLPAggregate.err
service clickhouse-server stop && sleep 10
service clickhouse stop
ls /var/log/clickhouse-server/
tar czf /test_output/logs.tar.gz -C /var/log/clickhouse-server/ .


@@ -22,17 +22,23 @@ ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
function start()
{
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
mkdir -p /var/run/clickhouse-server1
sudo chown clickhouse:clickhouse /var/run/clickhouse-server1
# NOTE We run "clickhouse server" instead of "clickhouse-server"
# to make "pidof clickhouse-server" return a single pid of the main instance.
# We will run the main instance using "service clickhouse-server start"
sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \
--pid-file /var/run/clickhouse-server1/clickhouse-server.pid \
-- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \
--logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \
--tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \
--mysql_port 19004 --postgresql_port 19005 \
--keeper_server.tcp_port 19181 --keeper_server.server_id 2
mkdir -p /var/run/clickhouse-server2
sudo chown clickhouse:clickhouse /var/run/clickhouse-server2
sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \
--pid-file /var/run/clickhouse-server2/clickhouse-server.pid \
-- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \
--logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \
--tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \
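The NOTE above explains why the extra replicas are started as "clickhouse server" (with a space): pidof matches on the process name, so only the main instance, launched via the clickhouse-server symlink, answers to it. A small sketch of the distinction (assuming both kinds of instance are running):

    pidof clickhouse-server   # -> single pid of the main instance only
    pidof clickhouse          # -> pids of the replicas started as "clickhouse server"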
@@ -135,6 +141,12 @@ ls -la /
/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv
sudo clickhouse stop ||:
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||:
sudo clickhouse stop --pid-path /var/run/clickhouse-server2 ||:
fi
grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||:
pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz ||:


@@ -41,15 +41,18 @@ if [ "$NUM_TRIES" -gt "1" ]; then
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
mkdir -p /var/run/clickhouse-server
# simplest way to forward env variables to the server
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon --pid-file /var/run/clickhouse-server/clickhouse-server.pid
else
sudo clickhouse start
fi
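The `sudo -E` above is what the "forward env variables" comment refers to: sudo normally resets the environment, and -E preserves it (subject to the sudoers policy), so the THREAD_FUZZER_* variables exported earlier reach the server process. A toy check, reusing a variable name from above for illustration:

    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
    sudo    -u clickhouse env | grep THREAD_FUZZER   # empty: environment was reset
    sudo -E -u clickhouse env | grep THREAD_FUZZER   # variable survives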
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
mkdir -p /var/run/clickhouse-server1
sudo chown clickhouse:clickhouse /var/run/clickhouse-server1
sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server1/config.xml --daemon \
--pid-file /var/run/clickhouse-server1/clickhouse-server.pid \
-- --path /var/lib/clickhouse1/ --logger.stderr /var/log/clickhouse-server/stderr1.log \
--logger.log /var/log/clickhouse-server/clickhouse-server1.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server1.err.log \
--tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \
@@ -57,7 +60,10 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
--keeper_server.tcp_port 19181 --keeper_server.server_id 2 \
--macros.replica r2 # It doesn't work :(
mkdir -p /var/run/clickhouse-server2
sudo chown clickhouse:clickhouse /var/run/clickhouse-server2
sudo -E -u clickhouse /usr/bin/clickhouse server --config /etc/clickhouse-server2/config.xml --daemon \
--pid-file /var/run/clickhouse-server2/clickhouse-server.pid \
-- --path /var/lib/clickhouse2/ --logger.stderr /var/log/clickhouse-server/stderr2.log \
--logger.log /var/log/clickhouse-server/clickhouse-server2.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server2.err.log \
--tcp_port 29000 --tcp_port_secure 29440 --http_port 28123 --https_port 28443 --interserver_http_port 29009 --tcp_with_proxy_port 29010 \
@@ -133,18 +139,10 @@ clickhouse-client -q "system flush logs" ||:
# Stop server so we can safely read data with clickhouse-local.
# Why do we read data with clickhouse-local?
# Because it's the simplest way to read it when server has crashed.
if [ "$NUM_TRIES" -gt "1" ]; then
clickhouse-client -q "system shutdown" ||:
sleep 10
else
sudo clickhouse stop ||:
fi
sudo clickhouse stop ||:
if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
clickhouse-client --port 19000 -q "system shutdown" ||:
clickhouse-client --port 29000 -q "system shutdown" ||:
sleep 10
sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||:
sudo clickhouse stop --pid-path /var/run/clickhouse-server2 ||:
fi
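The comment above is why the servers are stopped before anything is read: the system log tables are ordinary on-disk tables, so once a server is down (or has crashed) they can still be inspected offline with clickhouse-local pointed at the data directory. A hedged sketch, with path and query purely illustrative:

    clickhouse-local --path /var/lib/clickhouse/ \
        --query "SELECT count() FROM system.query_log"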
grep -Fa "Fatal" /var/log/clickhouse-server/clickhouse-server.log ||:


@@ -7,26 +7,29 @@ set -x
# Thread Fuzzer allows checking more permutations of possible thread scheduling
# and finding more potential issues.
#
# But under thread fuzzer, the TSan build is too slow and produces some flaky
# tests, so for now, as a temporary solution, it has been disabled.
if ! test -f package_folder/*tsan*.deb; then
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
fi
function install_packages()
{


@@ -18,8 +18,10 @@ def get_options(i, backward_compatibility_check):
options.append("--db-engine=Ordinary")
if i % 3 == 2 and not backward_compatibility_check:
options.append('''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i))
client_options.append('allow_experimental_database_replicated=1')
options.append(
'''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i)
)
client_options.append("allow_experimental_database_replicated=1")
# If database name is not specified, new database is created for each functional test.
# Run some threads with one database for all tests.
@@ -37,38 +39,58 @@ def get_options(i, backward_compatibility_check):
if i % 15 == 11:
client_options.append("join_algorithm='auto'")
client_options.append('max_rows_in_join=1000')
client_options.append("max_rows_in_join=1000")
if i == 13:
client_options.append('memory_tracker_fault_probability=0.001')
client_options.append("memory_tracker_fault_probability=0.001")
if client_options:
options.append(" --client-option " + ' '.join(client_options))
options.append(" --client-option " + " ".join(client_options))
return ' '.join(options)
return " ".join(options)
def run_func_test(cmd, output_prefix, num_processes, skip_tests_option, global_time_limit, backward_compatibility_check):
backward_compatibility_check_option = '--backward-compatibility-check' if backward_compatibility_check else ''
global_time_limit_option = ''
def run_func_test(
cmd,
output_prefix,
num_processes,
skip_tests_option,
global_time_limit,
backward_compatibility_check,
):
backward_compatibility_check_option = (
"--backward-compatibility-check" if backward_compatibility_check else ""
)
global_time_limit_option = ""
if global_time_limit:
global_time_limit_option = "--global_time_limit={}".format(global_time_limit)
output_paths = [os.path.join(output_prefix, "stress_test_run_{}.txt".format(i)) for i in range(num_processes)]
output_paths = [
os.path.join(output_prefix, "stress_test_run_{}.txt".format(i))
for i in range(num_processes)
]
pipes = []
for i in range(0, len(output_paths)):
f = open(output_paths[i], 'w')
full_command = "{} {} {} {} {}".format(cmd, get_options(i, backward_compatibility_check), global_time_limit_option, skip_tests_option, backward_compatibility_check_option)
f = open(output_paths[i], "w")
full_command = "{} {} {} {} {}".format(
cmd,
get_options(i, backward_compatibility_check),
global_time_limit_option,
skip_tests_option,
backward_compatibility_check_option,
)
logging.info("Run func tests '%s'", full_command)
p = Popen(full_command, shell=True, stdout=f, stderr=f)
pipes.append(p)
time.sleep(0.5)
return pipes
def compress_stress_logs(output_path, files_prefix):
cmd = f"cd {output_path} && tar -zcf stress_run_logs.tar.gz {files_prefix}* && rm {files_prefix}*"
check_output(cmd, shell=True)
def call_with_retry(query, timeout=30, retry_count=5):
for i in range(retry_count):
code = call(query, shell=True, stderr=STDOUT, timeout=timeout)
@@ -77,6 +99,7 @@ def call_with_retry(query, timeout=30, retry_count=5):
else:
break
def make_query_command(query):
return f"""clickhouse client -q "{query}" --max_untracked_memory=1Gi --memory_profiler_step=1Gi --max_memory_usage_for_user=0"""
@@ -93,28 +116,34 @@ def prepare_for_hung_check(drop_databases):
# ThreadFuzzer significantly slows down server and causes false-positive hung check failures
call_with_retry("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'")
call_with_retry(make_query_command('SELECT 1 FORMAT Null'))
call_with_retry(make_query_command("SELECT 1 FORMAT Null"))
# Some tests execute SYSTEM STOP MERGES or similar queries.
# It may cause some ALTERs to hang.
# Possibly we should fix the tests and forbid using such queries without specifying a table.
call_with_retry(make_query_command('SYSTEM START MERGES'))
call_with_retry(make_query_command('SYSTEM START DISTRIBUTED SENDS'))
call_with_retry(make_query_command('SYSTEM START TTL MERGES'))
call_with_retry(make_query_command('SYSTEM START MOVES'))
call_with_retry(make_query_command('SYSTEM START FETCHES'))
call_with_retry(make_query_command('SYSTEM START REPLICATED SENDS'))
call_with_retry(make_query_command('SYSTEM START REPLICATION QUEUES'))
call_with_retry(make_query_command('SYSTEM DROP MARK CACHE'))
call_with_retry(make_query_command("SYSTEM START MERGES"))
call_with_retry(make_query_command("SYSTEM START DISTRIBUTED SENDS"))
call_with_retry(make_query_command("SYSTEM START TTL MERGES"))
call_with_retry(make_query_command("SYSTEM START MOVES"))
call_with_retry(make_query_command("SYSTEM START FETCHES"))
call_with_retry(make_query_command("SYSTEM START REPLICATED SENDS"))
call_with_retry(make_query_command("SYSTEM START REPLICATION QUEUES"))
call_with_retry(make_query_command("SYSTEM DROP MARK CACHE"))
# Issue #21004, live views are experimental, so let's just suppress it
call_with_retry(make_query_command("KILL QUERY WHERE upper(query) LIKE 'WATCH %'"))
# Kill other queries which are known to be slow
# It's a query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds
call_with_retry(make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'"))
call_with_retry(
make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'")
)
# Long query from 00084_external_agregation
call_with_retry(make_query_command("KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'"))
call_with_retry(
make_query_command(
"KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'"
)
)
if drop_databases:
for i in range(5):
@@ -123,23 +152,35 @@ def prepare_for_hung_check(drop_databases):
# Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds).
#
# Also specify max_untracked_memory to allow 1GiB of memory to overcommit.
databases = check_output(make_query_command('SHOW DATABASES'), shell=True, timeout=30).decode('utf-8').strip().split()
databases = (
check_output(
make_query_command("SHOW DATABASES"), shell=True, timeout=30
)
.decode("utf-8")
.strip()
.split()
)
for db in databases:
if db == "system":
continue
command = make_query_command(f'DROP DATABASE {db}')
command = make_query_command(f"DROP DATABASE {db}")
# we don't wait for drop
Popen(command, shell=True)
break
except Exception as ex:
logging.error("Failed to SHOW or DROP databasese, will retry %s", str(ex))
logging.error(
"Failed to SHOW or DROP databasese, will retry %s", str(ex)
)
time.sleep(i)
else:
raise Exception("Cannot drop databases after stress tests. Probably server consumed too much memory and cannot execute simple queries")
raise Exception(
"Cannot drop databases after stress tests. Probably server consumed too much memory and cannot execute simple queries"
)
# Wait for last queries to finish if any, not longer than 300 seconds
call(make_query_command("""
call(
make_query_command(
"""
select sleepEachRow((
select maxOrDefault(300 - elapsed) + 1
from system.processes
@@ -147,39 +188,58 @@ def prepare_for_hung_check(drop_databases):
) / 300)
from numbers(300)
format Null
"""), shell=True, stderr=STDOUT, timeout=330)
"""
),
shell=True,
stderr=STDOUT,
timeout=330,
)
# Even if all clickhouse-test processes are finished, there are probably some sh scripts,
# which still run some new queries. Let's ignore them.
try:
query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """
output = check_output(query, shell=True, stderr=STDOUT, timeout=30).decode('utf-8').strip()
output = (
check_output(query, shell=True, stderr=STDOUT, timeout=30)
.decode("utf-8")
.strip()
)
if int(output) == 0:
return False
except:
pass
return True
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
parser = argparse.ArgumentParser(description="ClickHouse script for running stresstest")
parser.add_argument("--test-cmd", default='/usr/bin/clickhouse-test')
parser.add_argument("--skip-func-tests", default='')
parser.add_argument("--client-cmd", default='clickhouse-client')
parser.add_argument("--server-log-folder", default='/var/log/clickhouse-server')
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
parser = argparse.ArgumentParser(
description="ClickHouse script for running stresstest"
)
parser.add_argument("--test-cmd", default="/usr/bin/clickhouse-test")
parser.add_argument("--skip-func-tests", default="")
parser.add_argument("--client-cmd", default="clickhouse-client")
parser.add_argument("--server-log-folder", default="/var/log/clickhouse-server")
parser.add_argument("--output-folder")
parser.add_argument("--global-time-limit", type=int, default=1800)
parser.add_argument("--num-parallel", type=int, default=cpu_count())
parser.add_argument('--backward-compatibility-check', action='store_true')
parser.add_argument('--hung-check', action='store_true', default=False)
parser.add_argument("--backward-compatibility-check", action="store_true")
parser.add_argument("--hung-check", action="store_true", default=False)
# makes sense only for hung check
parser.add_argument('--drop-databases', action='store_true', default=False)
parser.add_argument("--drop-databases", action="store_true", default=False)
args = parser.parse_args()
if args.drop_databases and not args.hung_check:
raise Exception("--drop-databases only used in hung check (--hung-check)")
func_pipes = []
func_pipes = run_func_test(args.test_cmd, args.output_folder, args.num_parallel, args.skip_func_tests, args.global_time_limit, args.backward_compatibility_check)
func_pipes = run_func_test(
args.test_cmd,
args.output_folder,
args.num_parallel,
args.skip_func_tests,
args.global_time_limit,
args.backward_compatibility_check,
)
logging.info("Will wait functests to finish")
while True:
@@ -205,32 +265,41 @@ if __name__ == "__main__":
have_long_running_queries = True
logging.error("Failed to prepare for hung check %s", str(ex))
logging.info("Checking if some queries hung")
cmd = ' '.join([args.test_cmd,
# Do not track memory allocations up to 1Gi,
# this will allow to ignore server memory limit (max_server_memory_usage) for this query.
#
# NOTE: memory_profiler_step should be also adjusted, because:
#
# untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step)
#
# NOTE: that if there will be queries with GROUP BY, this trick
# will not work due to CurrentMemoryTracker::check() from
# Aggregator code.
# But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY.
"--client-option", "max_untracked_memory=1Gi",
"--client-option", "max_memory_usage_for_user=0",
"--client-option", "memory_profiler_step=1Gi",
# Use system database to avoid CREATE/DROP DATABASE queries
"--database=system",
"--hung-check",
"00001_select_1"
])
cmd = " ".join(
[
args.test_cmd,
# Do not track memory allocations up to 1Gi,
# this will allow to ignore server memory limit (max_server_memory_usage) for this query.
#
# NOTE: memory_profiler_step should be also adjusted, because:
#
# untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step)
#
# NOTE: that if there will be queries with GROUP BY, this trick
# will not work due to CurrentMemoryTracker::check() from
# Aggregator code.
# But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY.
"--client-option",
"max_untracked_memory=1Gi",
"--client-option",
"max_memory_usage_for_user=0",
"--client-option",
"memory_profiler_step=1Gi",
# Use system database to avoid CREATE/DROP DATABASE queries
"--database=system",
"--hung-check",
"--stress",
"00001_select_1",
]
)
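The long comment above boils down to: the effective untracked-memory threshold is min(max_untracked_memory, memory_profiler_step), so both settings must be raised together, and max_memory_usage_for_user=0 lifts the per-user cap. As a standalone client invocation, the same flags look like this (mirroring make_query_command earlier):

    clickhouse client -q "SELECT 1" \
        --max_untracked_memory=1Gi \
        --memory_profiler_step=1Gi \
        --max_memory_usage_for_user=0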
res = call(cmd, shell=True, stderr=STDOUT)
hung_check_status = "No queries hung\tOK\n"
if res != 0 and have_long_running_queries:
logging.info("Hung check failed with exit code {}".format(res))
hung_check_status = "Hung check failed\tFAIL\n"
with open(os.path.join(args.output_folder, "test_results.tsv"), 'w+') as results:
with open(
os.path.join(args.output_folder, "test_results.tsv"), "w+"
) as results:
results.write(hung_check_status)
logging.info("Stress test finished")


@@ -14,7 +14,7 @@
* Selects with final are executed in parallel. Added setting `max_final_threads` to limit the number of threads used. [#10463](https://github.com/ClickHouse/ClickHouse/pull/10463) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
* Function that extracts from haystack all matching non-overlapping groups with regular expressions, and put those into `Array(Array(String))` column. [#10534](https://github.com/ClickHouse/ClickHouse/pull/10534) ([Vasily Nemkov](https://github.com/Enmk)).
* Added ability to delete a subset of expired rows, which satisfies the condition in WHERE clause. Added ability to replace expired rows with aggregates of them specified in GROUP BY clause. [#10537](https://github.com/ClickHouse/ClickHouse/pull/10537) ([expl0si0nn](https://github.com/expl0si0nn)).
* (Only Linux) Clickhouse server now tries to fallback to ProcfsMetricsProvider when clickhouse binary is not attributed with CAP_NET_ADMIN capability to collect per-query system metrics (for CPU and I/O). [#10544](https://github.com/ClickHouse/ClickHouse/pull/10544) ([Alexander Kazakov](https://github.com/Akazz)).
* (Only Linux) ClickHouse server now tries to fallback to ProcfsMetricsProvider when clickhouse binary is not attributed with CAP_NET_ADMIN capability to collect per-query system metrics (for CPU and I/O). [#10544](https://github.com/ClickHouse/ClickHouse/pull/10544) ([Alexander Kazakov](https://github.com/Akazz)).
* - Add Arrow IPC File format (Input and Output) - Fix incorrect work of resetParser() for Parquet Input Format - Add zero-copy optimization for ORC for RandomAccessFiles - Add missing halffloat type for input parquet and ORC formats ... [#10580](https://github.com/ClickHouse/ClickHouse/pull/10580) ([Zhanna](https://github.com/FawnD2)).
* Allowed to profile memory with finer granularity steps than 4 MiB. Added sampling memory profiler to capture random allocations/deallocations. [#10598](https://github.com/ClickHouse/ClickHouse/pull/10598) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add new input format `JSONAsString` that accepts a sequence of JSON objects separated by newlines, spaces and/or commas. [#10607](https://github.com/ClickHouse/ClickHouse/pull/10607) ([Kruglov Pavel](https://github.com/Avogar)).


@@ -13,7 +13,7 @@
* Users now can set comments to database in `CREATE DATABASE` statement ... [#29429](https://github.com/ClickHouse/ClickHouse/pull/29429) ([Vasily Nemkov](https://github.com/Enmk)).
* New function` mapContainsKeyLike` to get the map that key matches a simple regular expression. [#29471](https://github.com/ClickHouse/ClickHouse/pull/29471) ([凌涛](https://github.com/lingtaolf)).
* Huawei OBS Storage support. Closes [#24294](https://github.com/ClickHouse/ClickHouse/issues/24294). [#29511](https://github.com/ClickHouse/ClickHouse/pull/29511) ([kevin wan](https://github.com/MaxWk)).
* Clickhouse HTTP Server can enable HSTS by set `hsts_max_age` in config.xml with a positive number. [#29516](https://github.com/ClickHouse/ClickHouse/pull/29516) ([凌涛](https://github.com/lingtaolf)).
* ClickHouse HTTP Server can enable HSTS by set `hsts_max_age` in config.xml with a positive number. [#29516](https://github.com/ClickHouse/ClickHouse/pull/29516) ([凌涛](https://github.com/lingtaolf)).
* - Added MD4 and SHA384 functions. [#29602](https://github.com/ClickHouse/ClickHouse/pull/29602) ([Nikita Tikhomirov](https://github.com/NSTikhomirov)).
* Support EXISTS(subquery). Closes [#6852](https://github.com/ClickHouse/ClickHouse/issues/6852). [#29731](https://github.com/ClickHouse/ClickHouse/pull/29731) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Added function `ngram`. Closes [#29699](https://github.com/ClickHouse/ClickHouse/issues/29699). [#29738](https://github.com/ClickHouse/ClickHouse/pull/29738) ([Maksim Kita](https://github.com/kitaisreal)).


@@ -54,7 +54,7 @@
* Add settings `merge_tree_min_rows_for_concurrent_read_for_remote_filesystem` and `merge_tree_min_bytes_for_concurrent_read_for_remote_filesystem`. [#30970](https://github.com/ClickHouse/ClickHouse/pull/30970) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Do not allow to drop a table or dictionary if some tables or dictionaries depend on it. [#30977](https://github.com/ClickHouse/ClickHouse/pull/30977) ([Alexander Tokmakov](https://github.com/tavplubix)).
* Only grab AlterLock when we do alter command. Let's see if the assumption is correct. [#31010](https://github.com/ClickHouse/ClickHouse/pull/31010) ([Amos Bird](https://github.com/amosbird)).
* The local session inside a Clickhouse dictionary source won't send its events to the session log anymore. This fixes a possible deadlock (tsan alert) on shutdown. Also this PR fixes flaky `test_dictionaries_dependency_xml/`. [#31013](https://github.com/ClickHouse/ClickHouse/pull/31013) ([Vitaly Baranov](https://github.com/vitlibar)).
* The local session inside a ClickHouse dictionary source won't send its events to the session log anymore. This fixes a possible deadlock (tsan alert) on shutdown. Also this PR fixes flaky `test_dictionaries_dependency_xml/`. [#31013](https://github.com/ClickHouse/ClickHouse/pull/31013) ([Vitaly Baranov](https://github.com/vitlibar)).
* Cancel vertical merges when partition is dropped. This is a follow-up of https://github.com/ClickHouse/ClickHouse/pull/25684 and https://github.com/ClickHouse/ClickHouse/pull/30996. [#31057](https://github.com/ClickHouse/ClickHouse/pull/31057) ([Amos Bird](https://github.com/amosbird)).
* Support `IF EXISTS` modifier for `RENAME DATABASE`/`TABLE`/`DICTIONARY` query, If this directive is used, one will not get an error if the DATABASE/TABLE/DICTIONARY to be renamed doesn't exist. [#31081](https://github.com/ClickHouse/ClickHouse/pull/31081) ([victorgao](https://github.com/kafka1991)).
* Function name normalization for ALTER queries. This helps avoid metadata mismatch between creating table with indices/projections and adding indices/projections via alter commands. This is a follow-up PR of https://github.com/ClickHouse/ClickHouse/pull/20174. Mark as improvements as there are no bug reports and the scenario is somehow rare. [#31095](https://github.com/ClickHouse/ClickHouse/pull/31095) ([Amos Bird](https://github.com/amosbird)).


@@ -1,7 +1,7 @@
### ClickHouse release v21.12.3.32-stable FIXME as compared to v21.12.2.17-stable
#### Bug Fix
* Backported in [#33018](https://github.com/ClickHouse/ClickHouse/issues/33018): - Clickhouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)).
* Backported in [#33018](https://github.com/ClickHouse/ClickHouse/issues/33018): - ClickHouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)).
#### Bug Fix (user-visible misbehaviour in official stable or prestable release)


@@ -68,7 +68,7 @@
* Add separate pool for message brokers (RabbitMQ and Kafka). [#19722](https://github.com/ClickHouse/ClickHouse/pull/19722) ([Azat Khuzhin](https://github.com/azat)).
* In distributed queries if the setting `async_socket_for_remote` is enabled, it was possible to get stack overflow at least in debug build configuration if very deeply nested data type is used in table (e.g. `Array(Array(Array(...more...)))`). This fixes [#19108](https://github.com/ClickHouse/ClickHouse/issues/19108). This change introduces minor backward incompatibility: excessive parenthesis in type definitions no longer supported, example: `Array((UInt8))`. [#19736](https://github.com/ClickHouse/ClickHouse/pull/19736) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Table function `S3` will use global region if the region can't be determined exactly. This closes [#10998](https://github.com/ClickHouse/ClickHouse/issues/10998). [#19750](https://github.com/ClickHouse/ClickHouse/pull/19750) ([Vladimir Chebotarev](https://github.com/excitoon)).
* Clickhouse client query param CTE added test. [#19762](https://github.com/ClickHouse/ClickHouse/pull/19762) ([Maksim Kita](https://github.com/kitaisreal)).
* ClickHouse client query param CTE added test. [#19762](https://github.com/ClickHouse/ClickHouse/pull/19762) ([Maksim Kita](https://github.com/kitaisreal)).
* Correctly output infinite arguments for `formatReadableTimeDelta` function. In previous versions, there was implicit conversion to implementation specific integer value. [#19791](https://github.com/ClickHouse/ClickHouse/pull/19791) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* `S3` table function now supports `auto` compression mode (autodetect). This closes [#18754](https://github.com/ClickHouse/ClickHouse/issues/18754). [#19793](https://github.com/ClickHouse/ClickHouse/pull/19793) ([Vladimir Chebotarev](https://github.com/excitoon)).
* Set charset to utf8mb4 when interacting with remote MySQL servers. Fixes [#19795](https://github.com/ClickHouse/ClickHouse/issues/19795). [#19800](https://github.com/ClickHouse/ClickHouse/pull/19800) ([Alexey Milovidov](https://github.com/alexey-milovidov)).


@@ -34,7 +34,7 @@
* Allow to use CTE in VIEW definition. This closes [#22491](https://github.com/ClickHouse/ClickHouse/issues/22491). [#22657](https://github.com/ClickHouse/ClickHouse/pull/22657) ([Amos Bird](https://github.com/amosbird)).
* Add metric to track how much time is spend during waiting for Buffer layer lock. [#22725](https://github.com/ClickHouse/ClickHouse/pull/22725) ([Azat Khuzhin](https://github.com/azat)).
* Allow RBAC row policy via postgresql protocol. Closes [#22658](https://github.com/ClickHouse/ClickHouse/issues/22658). PostgreSQL protocol is enabled in configuration by default. [#22755](https://github.com/ClickHouse/ClickHouse/pull/22755) ([Kseniia Sumarokova](https://github.com/kssenii)).
* MaterializeMySQL (experimental feature). Make Clickhouse to be able to replicate MySQL databases containing views without failing. This is accomplished by ignoring the views. ... [#22760](https://github.com/ClickHouse/ClickHouse/pull/22760) ([Christian Frøystad](https://github.com/cfroystad)).
* MaterializeMySQL (experimental feature). Make ClickHouse to be able to replicate MySQL databases containing views without failing. This is accomplished by ignoring the views. ... [#22760](https://github.com/ClickHouse/ClickHouse/pull/22760) ([Christian Frøystad](https://github.com/cfroystad)).
* `dateDiff` now works with `DateTime64` arguments (even for values outside of `DateTime` range) ... [#22931](https://github.com/ClickHouse/ClickHouse/pull/22931) ([Vasily Nemkov](https://github.com/Enmk)).
* Set `background_fetches_pool_size` to 8 that is better for production usage with frequent small insertions or slow ZooKeeper cluster. [#22945](https://github.com/ClickHouse/ClickHouse/pull/22945) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Fix inactive_parts_to_throw_insert=0 with inactive_parts_to_delay_insert>0. [#22947](https://github.com/ClickHouse/ClickHouse/pull/22947) ([Azat Khuzhin](https://github.com/azat)).


@@ -84,7 +84,7 @@
#### Bug Fix
* Quota limit was not reached, but the limit was exceeded. This PR fixes [#31174](https://github.com/ClickHouse/ClickHouse/issues/31174). [#31656](https://github.com/ClickHouse/ClickHouse/pull/31656) ([sunny](https://github.com/sunny19930321)).
* - Clickhouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)).
* - ClickHouse Keeper handler should remove operation when response sent. [#32988](https://github.com/ClickHouse/ClickHouse/pull/32988) ([JackyWoo](https://github.com/JackyWoo)).
* Fix null pointer dereference in low cardinality data when deserializing LowCardinality data in the Native format. [#33021](https://github.com/ClickHouse/ClickHouse/pull/33021) ([Harry Lee](https://github.com/HarryLeeIBM)).
* Specifically crafted input data for `Native` format may lead to reading uninitialized memory or crash. This is relevant if `clickhouse-server` is open for write access to adversary. [#33050](https://github.com/ClickHouse/ClickHouse/pull/33050) ([Heena Bansal](https://github.com/HeenaBansal2009)).
@@ -196,7 +196,7 @@
* NO CL ENTRY: 'Update CHANGELOG.md'. [#32472](https://github.com/ClickHouse/ClickHouse/pull/32472) ([Rich Raposa](https://github.com/rfraposa)).
* NO CL ENTRY: 'Revert "Split long tests into multiple checks"'. [#32514](https://github.com/ClickHouse/ClickHouse/pull/32514) ([alesapin](https://github.com/alesapin)).
* NO CL ENTRY: 'Revert "Revert "Split long tests into multiple checks""'. [#32515](https://github.com/ClickHouse/ClickHouse/pull/32515) ([alesapin](https://github.com/alesapin)).
* NO CL ENTRY: 'blog post how to enable predictive capabilities in Clickhouse'. [#32768](https://github.com/ClickHouse/ClickHouse/pull/32768) ([Tom Risse](https://github.com/flickerbox-tom)).
* NO CL ENTRY: 'blog post how to enable predictive capabilities in ClickHouse'. [#32768](https://github.com/ClickHouse/ClickHouse/pull/32768) ([Tom Risse](https://github.com/flickerbox-tom)).
* NO CL ENTRY: 'Revert "Fix build issue related to azure blob storage"'. [#32845](https://github.com/ClickHouse/ClickHouse/pull/32845) ([alesapin](https://github.com/alesapin)).
* NO CL ENTRY: 'Revert "Dictionaries added Date32 type support"'. [#33053](https://github.com/ClickHouse/ClickHouse/pull/33053) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Updated Lawrence Berkeley National Lab stats'. [#33066](https://github.com/ClickHouse/ClickHouse/pull/33066) ([Michael Smitasin](https://github.com/michaelsmitasin)).


@@ -65,7 +65,7 @@
* Add setting to lower column case when reading parquet/ORC file. [#35145](https://github.com/ClickHouse/ClickHouse/pull/35145) ([shuchaome](https://github.com/shuchaome)).
* Do not retry non-retriable errors. Closes [#35161](https://github.com/ClickHouse/ClickHouse/issues/35161). [#35172](https://github.com/ClickHouse/ClickHouse/pull/35172) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Added disk_name to system.part_log. [#35178](https://github.com/ClickHouse/ClickHouse/pull/35178) ([Artyom Yurkov](https://github.com/Varinara)).
* Currently,Clickhouse validates hosts defined under <remote_url_allow_hosts> for URL and Remote Table functions. This PR extends the RemoteHostFilter to Mysql and PostgreSQL table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)).
* Currently, ClickHouse validates hosts defined under <remote_url_allow_hosts> for URL and Remote Table functions. This PR extends the RemoteHostFilter to Mysql and PostgreSQL table functions. [#35191](https://github.com/ClickHouse/ClickHouse/pull/35191) ([Heena Bansal](https://github.com/HeenaBansal2009)).
* Sometimes it is not enough for us to distinguish the query hierarchy only by is_initial_query in system.query_log and system.processes. So distributed_depth is needed. [#35207](https://github.com/ClickHouse/ClickHouse/pull/35207) ([李扬](https://github.com/taiyang-li)).
* Support test mode for clickhouse-local. [#35264](https://github.com/ClickHouse/ClickHouse/pull/35264) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Return const for function getMacro if not in distributed query. Close [#34727](https://github.com/ClickHouse/ClickHouse/issues/34727). [#35289](https://github.com/ClickHouse/ClickHouse/pull/35289) ([李扬](https://github.com/taiyang-li)).


@@ -0,0 +1,192 @@
### ClickHouse release v22.6.1.1985-stable FIXME as compared to v22.5.1.2079-stable
#### Backward Incompatible Change
* Changes how settings using `seconds` as type are parsed to support floating point values (for example: `max_execution_time=0.5`). Infinity or NaN values will throw an exception. [#37187](https://github.com/ClickHouse/ClickHouse/pull/37187) ([Raúl Marín](https://github.com/Algunenano)).
* Changed format of binary serialization of columns of experimental type `Object`. New format is more convenient to implement by third-party clients. [#37482](https://github.com/ClickHouse/ClickHouse/pull/37482) ([Anton Popov](https://github.com/CurtizJ)).
* Turn on setting `output_format_json_named_tuples_as_objects` by default. It allows to serialize named tuples as JSON objects in JSON formats. [#37756](https://github.com/ClickHouse/ClickHouse/pull/37756) ([Anton Popov](https://github.com/CurtizJ)).
#### New Feature
* Added `SYSTEM UNFREEZE` query that deletes the whole backup regardless if the corresponding table is deleted or not. [#36424](https://github.com/ClickHouse/ClickHouse/pull/36424) ([Vadim Volodin](https://github.com/PolyProgrammist)).
* Adds H3 unidirectional edge functions. [#36843](https://github.com/ClickHouse/ClickHouse/pull/36843) ([Bharat Nallan](https://github.com/bharatnc)).
* Add merge_reason column to system.part_log table. [#36912](https://github.com/ClickHouse/ClickHouse/pull/36912) ([Sema Checherinda](https://github.com/CheSema)).
* This PR enables `POPULATE` for WindowView. [#36945](https://github.com/ClickHouse/ClickHouse/pull/36945) ([vxider](https://github.com/Vxider)).
* Add new columnar JSON formats: JSONColumns, JSONCompactColumns, JSONColumnsWithMetadata. Closes [#36338](https://github.com/ClickHouse/ClickHouse/issues/36338) Closes [#34509](https://github.com/ClickHouse/ClickHouse/issues/34509). [#36975](https://github.com/ClickHouse/ClickHouse/pull/36975) ([Kruglov Pavel](https://github.com/Avogar)).
* Add support for calculating [hashids](https://hashids.org/) from unsigned integers. [#37013](https://github.com/ClickHouse/ClickHouse/pull/37013) ([Michael Nutt](https://github.com/mnutt)).
* Add GROUPING function. Closes [#19426](https://github.com/ClickHouse/ClickHouse/issues/19426). [#37163](https://github.com/ClickHouse/ClickHouse/pull/37163) ([Dmitry Novik](https://github.com/novikd)).
* `ALTER TABLE ... MODIFY QUERY` support for WindowView. [#37188](https://github.com/ClickHouse/ClickHouse/pull/37188) ([vxider](https://github.com/Vxider)).
* This PR changes the behavior of the `ENGINE` syntax in WindowView, to make it like in MaterializedView. [#37214](https://github.com/ClickHouse/ClickHouse/pull/37214) ([vxider](https://github.com/Vxider)).
* SALT is allowed for CREATE USER <user> IDENTIFIED WITH sha256_hash. [#37377](https://github.com/ClickHouse/ClickHouse/pull/37377) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Implemented changing comment to a ReplicatedMergeTree table. [#37416](https://github.com/ClickHouse/ClickHouse/pull/37416) ([Vasily Nemkov](https://github.com/Enmk)).
* Add support for Maps and Records in Avro format. Add new setting `input_format_avro_null_as_default ` that allow to insert null as default in Avro format. Closes [#18925](https://github.com/ClickHouse/ClickHouse/issues/18925) Closes [#37378](https://github.com/ClickHouse/ClickHouse/issues/37378) Closes [#32899](https://github.com/ClickHouse/ClickHouse/issues/32899). [#37525](https://github.com/ClickHouse/ClickHouse/pull/37525) ([Kruglov Pavel](https://github.com/Avogar)).
* Add two new settings `input_format_csv_skip_first_lines/input_format_tsv_skip_first_lines` to allow skipping specified number of lines in the beginning of the file in CSV/TSV formats. [#37537](https://github.com/ClickHouse/ClickHouse/pull/37537) ([Kruglov Pavel](https://github.com/Avogar)).
* showCertificate() function shows current server's SSL certificate. [#37540](https://github.com/ClickHouse/ClickHouse/pull/37540) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Implementation of [FPC](https://userweb.cs.txstate.edu/~burtscher/papers/dcc07a.pdf) algorithm for floating point data compression. [#37553](https://github.com/ClickHouse/ClickHouse/pull/37553) ([Mikhail Guzov](https://github.com/koloshmet)).
* HTTP source for Data Dictionaries in Named Collections is supported. [#37581](https://github.com/ClickHouse/ClickHouse/pull/37581) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* This PR aims to resolve [#22130](https://github.com/ClickHouse/ClickHouse/issues/22130) which allows inserting into `system.zookeeper`. To simplify, this PR is designed as:. [#37596](https://github.com/ClickHouse/ClickHouse/pull/37596) ([Han Fei](https://github.com/hanfei1991)).
* Added a new window function `nonNegativeDerivative(metric_column, timestamp_column[, INTERVAL x SECOND])`. [#37628](https://github.com/ClickHouse/ClickHouse/pull/37628) ([Andrey Zvonov](https://github.com/zvonand)).
* Executable user defined functions support parameters. Example: `SELECT test_function(parameters)(arguments)`. Closes [#37578](https://github.com/ClickHouse/ClickHouse/issues/37578). [#37720](https://github.com/ClickHouse/ClickHouse/pull/37720) ([Maksim Kita](https://github.com/kitaisreal)).
* Added open telemetry traces visualizing tool based on d3js. [#37810](https://github.com/ClickHouse/ClickHouse/pull/37810) ([Sergei Trifonov](https://github.com/serxa)).
#### Performance Improvement
* Improve performance of insert into MergeTree if there are multiple columns in ORDER BY. [#35762](https://github.com/ClickHouse/ClickHouse/pull/35762) ([Maksim Kita](https://github.com/kitaisreal)).
* Apply read method 'threadpool' for StorageHive. [#36328](https://github.com/ClickHouse/ClickHouse/pull/36328) ([李扬](https://github.com/taiyang-li)).
* Now we split data parts into layers and distribute them among threads instead of whole parts to make the execution of queries with `FINAL` more data-parallel. [#36396](https://github.com/ClickHouse/ClickHouse/pull/36396) ([Nikita Taranov](https://github.com/nickitat)).
* Load marks for only necessary columns when reading wide parts. [#36879](https://github.com/ClickHouse/ClickHouse/pull/36879) ([Anton Kozlov](https://github.com/tonickkozlov)).
* When all the columns to read are partition keys, construct columns by the file's row number without real reading the hive file. [#37103](https://github.com/ClickHouse/ClickHouse/pull/37103) ([lgbo](https://github.com/lgbo-ustc)).
* Fix performance of `dictGetDescendants`, `dictGetChildren` functions, create temporary parent to children hierarchical index per query, not per function call during query. Allow to specify `BIDIRECTIONAL` for `HIERARHICAL` attributes, dictionary will maintain parent to children index in memory, that way functions `dictGetDescendants`, `dictGetChildren` will not create temporary index per query. Closes [#32481](https://github.com/ClickHouse/ClickHouse/issues/32481). [#37148](https://github.com/ClickHouse/ClickHouse/pull/37148) ([Maksim Kita](https://github.com/kitaisreal)).
* Improve performance and memory usage for select of subset of columns for formats Native, Protobuf, CapnProto, JSONEachRow, TSKV, all formats with suffixes WithNames/WithNamesAndTypes. Previously while selecting only subset of columns from files in these formats all columns were read and stored in memory. Now only required columns are read. This PR enables setting `input_format_skip_unknown_fields` by default, because otherwise in case of select of subset of columns exception will be thrown. [#37192](https://github.com/ClickHouse/ClickHouse/pull/37192) ([Kruglov Pavel](https://github.com/Avogar)).
* Improve sort performance by single column. [#37195](https://github.com/ClickHouse/ClickHouse/pull/37195) ([Maksim Kita](https://github.com/kitaisreal)).
* Support multi disks for caching hive files. [#37279](https://github.com/ClickHouse/ClickHouse/pull/37279) ([lgbo](https://github.com/lgbo-ustc)).
* Improved performance on array norm and distance functions 2x-4x times. [#37394](https://github.com/ClickHouse/ClickHouse/pull/37394) ([Alexander Gololobov](https://github.com/davenger)).
* Improve performance of number comparison functions using dynamic dispatch. [#37399](https://github.com/ClickHouse/ClickHouse/pull/37399) ([Maksim Kita](https://github.com/kitaisreal)).
* Improve performance of ORDER BY with LIMIT. [#37481](https://github.com/ClickHouse/ClickHouse/pull/37481) ([Maksim Kita](https://github.com/kitaisreal)).
* Improve performance of `hasAll` function using dynamic dispatch infrastructure. [#37484](https://github.com/ClickHouse/ClickHouse/pull/37484) ([Maksim Kita](https://github.com/kitaisreal)).
* Improve performance of `greatCircleAngle`, `greatCircleDistance`, `geoDistance` functions. [#37524](https://github.com/ClickHouse/ClickHouse/pull/37524) ([Maksim Kita](https://github.com/kitaisreal)).
* Optimized the internal caching of re2 patterns which occur e.g. in LIKE and MATCH functions. [#37544](https://github.com/ClickHouse/ClickHouse/pull/37544) ([Robert Schulze](https://github.com/rschu1ze)).
* Improve filter bitmask generator function all in one with avx512 instructions. [#37588](https://github.com/ClickHouse/ClickHouse/pull/37588) ([yaqi-zhao](https://github.com/yaqi-zhao)).
* Improved performance of aggregation in case, when sparse columns (can be enabled by experimental setting `ratio_of_defaults_for_sparse_serialization` in `MergeTree` tables) are used as arguments in aggregate functions. [#37617](https://github.com/ClickHouse/ClickHouse/pull/37617) ([Anton Popov](https://github.com/CurtizJ)).
* Optimize function `COALESCE` with two arguments. [#37666](https://github.com/ClickHouse/ClickHouse/pull/37666) ([Anton Popov](https://github.com/CurtizJ)).
* Replace `multiIf` with `if` when `multiIf` has only one condition, because the function `if` is more performant (see the sketch after this list). [#37695](https://github.com/ClickHouse/ClickHouse/pull/37695) ([Anton Popov](https://github.com/CurtizJ)).
* Aggregate state destruction may now be posted to a thread pool. For queries with LIMIT and big state it provides a significant speedup, e.g. `select uniq(number) from numbers_mt(1e7) group by number limit 100` became around 2.5x faster. [#37855](https://github.com/ClickHouse/ClickHouse/pull/37855) ([Nikita Taranov](https://github.com/nickitat)).
* Improve performance of single column sorting using sorting queue specializations. [#37990](https://github.com/ClickHouse/ClickHouse/pull/37990) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix excessive CPU usage in background when there are a lot of tables. [#38028](https://github.com/ClickHouse/ClickHouse/pull/38028) ([Maksim Kita](https://github.com/kitaisreal)).
* Improve performance of `not` function using dynamic dispatch. [#38058](https://github.com/ClickHouse/ClickHouse/pull/38058) ([Maksim Kita](https://github.com/kitaisreal)).
* Added numerous NEON accelerated paths for main libraries. [#38093](https://github.com/ClickHouse/ClickHouse/pull/38093) ([Daniel Kutenin](https://github.com/danlark1)).
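A quick sketch of the array distance speedup mentioned above, assuming the `L2Distance` function from that family (values chosen so the result is exact):

``` sql
-- Euclidean distance between two vectors: sqrt((4-1)^2 + (6-2)^2) = 5.
SELECT L2Distance([1.0, 2.0], [4.0, 6.0]) AS d;
```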
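And a minimal sketch of the `multiIf`-to-`if` rewrite: with a single condition the two forms are equivalent, which is what lets the optimizer substitute the faster `if`:

``` sql
-- Both expressions compute the same result; the first is now rewritten into the second.
SELECT
    multiIf(number % 2 = 0, 'even', 'odd') AS with_multiIf,
    if(number % 2 = 0, 'even', 'odd') AS with_if
FROM numbers(4);
```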
#### Improvement
* Add a separate CLUSTER grant (and the `access_control_improvements.on_cluster_queries_require_cluster_grant` configuration directive, which defaults to `false` for backward compatibility). [#35767](https://github.com/ClickHouse/ClickHouse/pull/35767) ([Azat Khuzhin](https://github.com/azat)).
* Add a self-extracting executable [#34755](https://github.com/ClickHouse/ClickHouse/issues/34755). [#35775](https://github.com/ClickHouse/ClickHouse/pull/35775) ([Filatenkov Artur](https://github.com/FArthur-cmd)).
* Added support for schema inference for `hdfsCluster`. [#35812](https://github.com/ClickHouse/ClickHouse/pull/35812) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add the `clickhouse-disks` tool with the following commands: `ls` (list files on disk), `C` (set config file), `list-disks` (list disk names), `disk` (select the disk to work with by name), `help` (produce a help message), `copy` (copy data on disk from `from_path` to `to_path`), `link` (create a hardlink on disk from `from_path` to `to_path`), `list` (list files on disk), `move` (move a file or directory on disk from `from_path` to `to_path`), `read` (read a file on disk from `from_path` to `to_path` or to stdout), `remove` (remove a file or directory on disk with all children), `write` (write a file on disk from `from_path` or stdin to `to_path`). [#36060](https://github.com/ClickHouse/ClickHouse/pull/36060) ([Artyom Yurkov](https://github.com/Varinara)).
* Implement `least_used` load balancing algorithm for disks inside volume (multi disk configuration). [#36686](https://github.com/ClickHouse/ClickHouse/pull/36686) ([Azat Khuzhin](https://github.com/azat)).
* HTTP endpoint changes: return the full stats under the `X-ClickHouse-Summary` header when `send_progress_in_http_headers=0` (before, it would return all zeros); return the `X-ClickHouse-Exception-Code` header when progress has been sent before (`send_progress_in_http_headers=1`); return `HTTP_REQUEST_TIMEOUT` (408) instead of `HTTP_INTERNAL_SERVER_ERROR` (500) on `TIMEOUT_EXCEEDED` errors. [#36884](https://github.com/ClickHouse/ClickHouse/pull/36884) ([Raúl Marín](https://github.com/Algunenano)).
* Allow a user to inspect grants from granted roles. [#36941](https://github.com/ClickHouse/ClickHouse/pull/36941) ([nvartolomei](https://github.com/nvartolomei)).
* Do not calculate an integral numerically but use CDF functions instead. This will speed up execution and will increase the precision. This fixes [#36714](https://github.com/ClickHouse/ClickHouse/issues/36714). [#36953](https://github.com/ClickHouse/ClickHouse/pull/36953) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Add a default implementation for `Nothing` in functions. Now most functions return a column of type `Nothing` when one of their arguments is `Nothing`. This also solves a problem with functions like `arrayMap`/`arrayFilter` and similar when they receive an empty array as an argument. Previously, queries like `select arrayMap(x -> 2 * x, []);` failed because the function inside the lambda could not work with type `Nothing`; now such queries return an empty array with type `Array(Nothing)`. Also add support for arrays of nullable types in functions like `arrayFilter`/`arrayFill`. Previously, queries like `select arrayFilter(x -> x % 2, [1, NULL])` failed; now they work (if the result of the lambda is NULL, the value is not included in the result). See the sketch after this list. Closes [#37000](https://github.com/ClickHouse/ClickHouse/issues/37000). [#37048](https://github.com/ClickHouse/ClickHouse/pull/37048) ([Kruglov Pavel](https://github.com/Avogar)).
* Now if a shard has a local replica, we create a local plan and a plan to read from all remote replicas. They share an initiator which coordinates the reading. [#37204](https://github.com/ClickHouse/ClickHouse/pull/37204) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)).
* Remove an unnecessary write-copy step in `CompressedWriteBuffer::nextImpl()` that happened frequently while inserting data. Before: 1. compress `working_buffer` into `compressed_buffer`, 2. write-copy into `out`. After: compress `working_buffer` directly into `out`. [#37242](https://github.com/ClickHouse/ClickHouse/pull/37242) ([jasperzhu](https://github.com/jinjunzh)).
* Support non-constant SQL functions (NOT) (I)LIKE and MATCH. [#37251](https://github.com/ClickHouse/ClickHouse/pull/37251) ([Robert Schulze](https://github.com/rschu1ze)).
* Client will try every IP address returned by DNS resolution until successful connection. [#37273](https://github.com/ClickHouse/ClickHouse/pull/37273) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* No longer abort server startup if the configuration option "mark_cache_size" is not explicitly set. [#37326](https://github.com/ClickHouse/ClickHouse/pull/37326) ([Robert Schulze](https://github.com/rschu1ze)).
* Allow using the String type instead of Binary in Arrow/Parquet/ORC formats. This PR introduces 3 new settings for it: `output_format_arrow_string_as_string`, `output_format_parquet_string_as_string`, `output_format_orc_string_as_string`. The default value for all settings is `false` (see the sketch after this list). [#37327](https://github.com/ClickHouse/ClickHouse/pull/37327) ([Kruglov Pavel](https://github.com/Avogar)).
* Apply the setting `input_format_max_rows_to_read_for_schema_inference` to the total number of rows read from all files in globs. Previously the setting was applied to each file in a glob separately, so in case of a huge number of NULLs we could read the first `input_format_max_rows_to_read_for_schema_inference` rows from each file and get nothing. Also increase the default value for this setting to 25000. [#37332](https://github.com/ClickHouse/ClickHouse/pull/37332) ([Kruglov Pavel](https://github.com/Avogar)).
* Allow providing `NULL`/`NOT NULL` right after the type in a column declaration (see the sketch after this list). [#37337](https://github.com/ClickHouse/ClickHouse/pull/37337) ([Igor Nikonov](https://github.com/devcrafter)).
* Optimize getting a read buffer for file segments in the `PARTIALLY_DOWNLOADED` state. [#37338](https://github.com/ClickHouse/ClickHouse/pull/37338) ([xiedeyantu](https://github.com/xiedeyantu)).
* Allow to prune the list of files via virtual columns such as `_file` and `_path` when reading from S3. This is for [#37174](https://github.com/ClickHouse/ClickHouse/issues/37174) , [#23494](https://github.com/ClickHouse/ClickHouse/issues/23494). [#37356](https://github.com/ClickHouse/ClickHouse/pull/37356) ([Amos Bird](https://github.com/amosbird)).
* Try to improve short circuit functions processing to fix problems with stress tests. [#37384](https://github.com/ClickHouse/ClickHouse/pull/37384) ([Kruglov Pavel](https://github.com/Avogar)).
* Closes [#37395](https://github.com/ClickHouse/ClickHouse/issues/37395). [#37415](https://github.com/ClickHouse/ClickHouse/pull/37415) ([Memo](https://github.com/Joeywzr)).
* Fix extremely rare deadlock during part fetch in zero-copy replication. Fixes [#37423](https://github.com/ClickHouse/ClickHouse/issues/37423). [#37424](https://github.com/ClickHouse/ClickHouse/pull/37424) ([metahys](https://github.com/metahys)).
* Don't allow to create storage with unknown data format. [#37450](https://github.com/ClickHouse/ClickHouse/pull/37450) ([Kruglov Pavel](https://github.com/Avogar)).
* Set `global_memory_usage_overcommit_max_wait_microseconds` default value to 5 seconds. Add info about `OvercommitTracker` to OOM exception message. Add `MemoryOvercommitWaitTimeMicroseconds` profile event. [#37460](https://github.com/ClickHouse/ClickHouse/pull/37460) ([Dmitry Novik](https://github.com/novikd)).
* Play UI: Keep controls in place when the page is scrolled horizontally. This makes edits comfortable even if the table is wide and it was scrolled far to the right. The feature proposed by Maksym Tereshchenko from CaspianDB. [#37470](https://github.com/ClickHouse/ClickHouse/pull/37470) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Now more filters can be pushed down for join. [#37472](https://github.com/ClickHouse/ClickHouse/pull/37472) ([Amos Bird](https://github.com/amosbird)).
* Make the query div in play.html extendable beyond 20% height. For very long queries it is helpful to extend the textarea element; previously, since the div had a fixed height, the extended textarea hid the data div underneath. With this fix, extending the textarea pushes the data div down/up so the extended textarea does not hide it. Also keep the query box at 100% width when the user adjusts the size of the query textarea. [#37488](https://github.com/ClickHouse/ClickHouse/pull/37488) ([guyco87](https://github.com/guyco87)).
* The same improvement to the query div in play.html as above. [#37504](https://github.com/ClickHouse/ClickHouse/pull/37504) ([guyco87](https://github.com/guyco87)).
* Previously, ClickHouse directly downloaded all remote files to the local cache (even if they are read only once), which frequently caused IO on the local hard disk. In some scenarios these IOs are unnecessary and can easily cause a performance regression, as observed when running SSB Q1-Q4 with the cache enabled. [#37516](https://github.com/ClickHouse/ClickHouse/pull/37516) ([Han Shukai](https://github.com/KinderRiven)).
* Added `ProfileEvents` for introspection of the type of written (inserted or merged) parts (`Inserted{Wide/Compact/InMemory}Parts`, `MergedInto{Wide/Compact/InMemory}Parts`). Added column `part_type` to `system.part_log`. Resolves [#37495](https://github.com/ClickHouse/ClickHouse/issues/37495). [#37536](https://github.com/ClickHouse/ClickHouse/pull/37536) ([Anton Popov](https://github.com/CurtizJ)).
* clickhouse-keeper improvement: move broken logs to a timestamped folder. [#37565](https://github.com/ClickHouse/ClickHouse/pull/37565) ([Antonio Andelic](https://github.com/antonio2368)).
* Do not write columns expired by TTL after subsequent merges (previously, only the first merge/OPTIMIZE of a part would skip writing columns expired by TTL; all subsequent merges would write them). [#37570](https://github.com/ClickHouse/ClickHouse/pull/37570) ([Azat Khuzhin](https://github.com/azat)).
* More precise result of the `dumpColumnStructure` miscellaneous function in the presence of LowCardinality or Sparse columns. In previous versions, this function converted the argument to a full column before returning the result. This is needed to provide an answer in [#6935](https://github.com/ClickHouse/ClickHouse/issues/6935). [#37633](https://github.com/ClickHouse/ClickHouse/pull/37633) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* keeper: store only unique session IDs for watches. [#37641](https://github.com/ClickHouse/ClickHouse/pull/37641) ([Azat Khuzhin](https://github.com/azat)).
* Fix possible "Cannot write to finalized buffer". [#37645](https://github.com/ClickHouse/ClickHouse/pull/37645) ([Azat Khuzhin](https://github.com/azat)).
* Add setting `support_batch_delete` for `DiskS3` to disable multiobject delete calls, which Google Cloud Storage doesn't support. [#37659](https://github.com/ClickHouse/ClickHouse/pull/37659) ([Fred Wulff](https://github.com/frew)).
* Support types with non-standard defaults in ROLLUP, CUBE, GROUPING SETS. Closes [#37360](https://github.com/ClickHouse/ClickHouse/issues/37360). [#37667](https://github.com/ClickHouse/ClickHouse/pull/37667) ([Dmitry Novik](https://github.com/novikd)).
* Add an option to disable connection pooling in ODBC bridge. [#37705](https://github.com/ClickHouse/ClickHouse/pull/37705) ([Anton Kozlov](https://github.com/tonickkozlov)).
* LIKE patterns with a trailing escape symbol ('\\') are now disallowed (as mandated by the SQL standard). [#37764](https://github.com/ClickHouse/ClickHouse/pull/37764) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix stacktraces collection on ARM. Closes [#37044](https://github.com/ClickHouse/ClickHouse/issues/37044). Closes [#15638](https://github.com/ClickHouse/ClickHouse/issues/15638). [#37797](https://github.com/ClickHouse/ClickHouse/pull/37797) ([Maksim Kita](https://github.com/kitaisreal)).
* The functions `dictGetHierarchy`, `dictIsIn`, `dictGetChildren`, and `dictGetDescendants` now support nullable `HIERARCHICAL` attributes in dictionaries. Closes [#35521](https://github.com/ClickHouse/ClickHouse/issues/35521). [#37805](https://github.com/ClickHouse/ClickHouse/pull/37805) ([Maksim Kita](https://github.com/kitaisreal)).
* Expose BoringSSL version related info in the `system.build_options` table. [#37850](https://github.com/ClickHouse/ClickHouse/pull/37850) ([Bharat Nallan](https://github.com/bharatnc)).
* Limit the maximum cache usage per query, which effectively prevents cache pool contamination. [Related issues](https://github.com/ClickHouse/ClickHouse/issues/28961). [#37859](https://github.com/ClickHouse/ClickHouse/pull/37859) ([Han Shukai](https://github.com/KinderRiven)).
* Now clickhouse-server removes `delete_tmp` directories on server start. Fixes [#26503](https://github.com/ClickHouse/ClickHouse/issues/26503). [#37906](https://github.com/ClickHouse/ClickHouse/pull/37906) ([alesapin](https://github.com/alesapin)).
* Clean up broken detached parts after timeout. Closes [#25195](https://github.com/ClickHouse/ClickHouse/issues/25195). [#37975](https://github.com/ClickHouse/ClickHouse/pull/37975) ([Kseniia Sumarokova](https://github.com/kssenii)).
* In the MergeTree table engine family, failed-to-move parts are now removed instantly. [#37994](https://github.com/ClickHouse/ClickHouse/pull/37994) ([alesapin](https://github.com/alesapin)).
* If the setting `always_fetch_merged_part` is enabled for ReplicatedMergeTree, merges now look for parts on other replicas less frequently, reducing the load on [Zoo]Keeper. [#37995](https://github.com/ClickHouse/ClickHouse/pull/37995) ([alesapin](https://github.com/alesapin)).
* Add implicit grants with grant option too. For example `GRANT CREATE TABLE ON test.* TO A WITH GRANT OPTION` now allows `A` to execute `GRANT CREATE VIEW ON test.* TO B`. [#38017](https://github.com/ClickHouse/ClickHouse/pull/38017) ([Vitaly Baranov](https://github.com/vitlibar)).
* Do not display `-0.0` CPU time in clickhouse-client. It can appear due to rounding errors. This closes [#38003](https://github.com/ClickHouse/ClickHouse/issues/38003). This closes [#38038](https://github.com/ClickHouse/ClickHouse/issues/38038). [#38064](https://github.com/ClickHouse/ClickHouse/pull/38064) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
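The examples from the `Nothing`-type entry above, runnable as-is:

``` sql
SELECT arrayMap(x -> 2 * x, []);           -- returns [] with type Array(Nothing)
SELECT arrayFilter(x -> x % 2, [1, NULL]); -- returns [1]; NULL lambda results are dropped
```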
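A minimal sketch of the new string-as-string output settings (the file name is illustrative; `INTO OUTFILE` assumes clickhouse-client):

``` sql
SET output_format_parquet_string_as_string = 1;
-- String columns are now written with the Parquet String type instead of Binary.
SELECT 'hello' AS s INTO OUTFILE 'strings.parquet' FORMAT Parquet;
```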
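And a sketch of the `NULL`/`NOT NULL` column syntax (table and column names are illustrative):

``` sql
CREATE TABLE t
(
    id Int32 NOT NULL,   -- stays plain Int32
    comment String NULL  -- becomes Nullable(String)
)
ENGINE = MergeTree
ORDER BY id;
```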
#### Build/Testing/Packaging Improvement
* Use clang-14 and LLVM infrastructure version 14 for builds. This closes [#34681](https://github.com/ClickHouse/ClickHouse/issues/34681). [#34754](https://github.com/ClickHouse/ClickHouse/pull/34754) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Allow to drop privileges at startup. This simplifies Docker images. Closes [#36293](https://github.com/ClickHouse/ClickHouse/issues/36293). [#36341](https://github.com/ClickHouse/ClickHouse/pull/36341) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Remove recursive submodules, because we don't need them and they can be confusing. Add style check to prevent recursive submodules. This closes [#32821](https://github.com/ClickHouse/ClickHouse/issues/32821). [#37616](https://github.com/ClickHouse/ClickHouse/pull/37616) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* Add docs spellcheck to CI. [#37790](https://github.com/ClickHouse/ClickHouse/pull/37790) ([Vladimir C](https://github.com/vdimir)).
* Fix overly aggressive stripping which removed the embedded hash required for checking the consistency of the executable. [#37993](https://github.com/ClickHouse/ClickHouse/pull/37993) ([Robert Schulze](https://github.com/rschu1ze)).
* Fix a compressor failure when building on macOS. [#38007](https://github.com/ClickHouse/ClickHouse/pull/38007) ([xiedeyantu](https://github.com/xiedeyantu)).
#### Bug Fix (user-visible misbehavior in official stable or prestable release)
* Fix `GROUP BY` `AggregateFunction` (i.e. you `GROUP BY` by the column that has `AggregateFunction` type). [#37093](https://github.com/ClickHouse/ClickHouse/pull/37093) ([Azat Khuzhin](https://github.com/azat)).
* Fix a possible heap-use-after-free error when reading `system.projection_parts` and `system.projection_parts_columns`. This fixes [#37184](https://github.com/ClickHouse/ClickHouse/issues/37184). [#37185](https://github.com/ClickHouse/ClickHouse/pull/37185) ([Amos Bird](https://github.com/amosbird)).
* Fix `addDependency` in WindowView. This bug can be reproduced as in [#37237](https://github.com/ClickHouse/ClickHouse/issues/37237). [#37224](https://github.com/ClickHouse/ClickHouse/pull/37224) ([vxider](https://github.com/Vxider)).
* Move `addDependency` from the constructor to `startup()` to avoid adding a dependency on a **dropped** table; fixes [#37237](https://github.com/ClickHouse/ClickHouse/issues/37237). [#37243](https://github.com/ClickHouse/ClickHouse/pull/37243) ([vxider](https://github.com/Vxider)).
* Fix inserting defaults for missing values in columnar formats. Previously missing columns were filled with defaults for types, not for columns. [#37253](https://github.com/ClickHouse/ClickHouse/pull/37253) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix some cases of insertion nested arrays to columns of type `Object`. [#37305](https://github.com/ClickHouse/ClickHouse/pull/37305) ([Anton Popov](https://github.com/CurtizJ)).
* Fix unexpected errors with a clash of constant strings in aggregate function, prewhere and join. Close [#36891](https://github.com/ClickHouse/ClickHouse/issues/36891). [#37336](https://github.com/ClickHouse/ClickHouse/pull/37336) ([Vladimir C](https://github.com/vdimir)).
* Fix projections with GROUP/ORDER BY in query and optimize_aggregation_in_order (before the result was incorrect since only finish sorting was performed). [#37342](https://github.com/ClickHouse/ClickHouse/pull/37342) ([Azat Khuzhin](https://github.com/azat)).
* Fixed error with symbols in key name in S3. Fixes [#33009](https://github.com/ClickHouse/ClickHouse/issues/33009). [#37344](https://github.com/ClickHouse/ClickHouse/pull/37344) ([Vladimir Chebotarev](https://github.com/excitoon)).
* Throw an exception when GROUPING SETS used with ROLLUP or CUBE. [#37367](https://github.com/ClickHouse/ClickHouse/pull/37367) ([Dmitry Novik](https://github.com/novikd)).
* Fix a LOGICAL_ERROR in getMaxSourcePartsSizeForMerge during merges (in case non-standard, greater values of `background_pool_size`/`background_merges_mutations_concurrency_ratio` have been specified in `config.xml` (the new way) rather than in `users.xml` (the deprecated way)). [#37413](https://github.com/ClickHouse/ClickHouse/pull/37413) ([Azat Khuzhin](https://github.com/azat)).
* Stop removing the UTF-8 BOM in RowBinary format. [#37428](https://github.com/ClickHouse/ClickHouse/pull/37428) ([Paul Loyd](https://github.com/loyd)).
* clickhouse-keeper bugfix: fix force recovery for single node cluster. [#37440](https://github.com/ClickHouse/ClickHouse/pull/37440) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix logical error in normalizeUTF8 functions. Closes [#37298](https://github.com/ClickHouse/ClickHouse/issues/37298). [#37443](https://github.com/ClickHouse/ClickHouse/pull/37443) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix casting LowCardinality of Nullable in JoinSwitcher, close [#37385](https://github.com/ClickHouse/ClickHouse/issues/37385). [#37453](https://github.com/ClickHouse/ClickHouse/pull/37453) ([Vladimir C](https://github.com/vdimir)).
* Fix named tuples output in ORC/Arrow/Parquet formats. [#37458](https://github.com/ClickHouse/ClickHouse/pull/37458) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix optimization of monotonous functions in ORDER BY clause in presence of GROUPING SETS. Fixes [#37401](https://github.com/ClickHouse/ClickHouse/issues/37401). [#37493](https://github.com/ClickHouse/ClickHouse/pull/37493) ([Dmitry Novik](https://github.com/novikd)).
* Fix error on joining with dictionary on some conditions. Close [#37386](https://github.com/ClickHouse/ClickHouse/issues/37386). [#37530](https://github.com/ClickHouse/ClickHouse/pull/37530) ([Vladimir C](https://github.com/vdimir)).
* Prohibit `optimize_aggregation_in_order` with `GROUPING SETS` (fixes `LOGICAL_ERROR`). [#37542](https://github.com/ClickHouse/ClickHouse/pull/37542) ([Azat Khuzhin](https://github.com/azat)).
* Fix wrong dump information of ActionsDAG. [#37587](https://github.com/ClickHouse/ClickHouse/pull/37587) ([zhanglistar](https://github.com/zhanglistar)).
* Fix converting types for UNION queries (may produce LOGICAL_ERROR). [#37593](https://github.com/ClickHouse/ClickHouse/pull/37593) ([Azat Khuzhin](https://github.com/azat)).
* Fix the `WITH FILL` modifier with negative intervals in the `STEP` clause (see the sketch after this list). Fixes [#37514](https://github.com/ClickHouse/ClickHouse/issues/37514). [#37600](https://github.com/ClickHouse/ClickHouse/pull/37600) ([Anton Popov](https://github.com/CurtizJ)).
* Fix illegal joinGet array usage when `join_use_nulls = 1`. This fixes [#37562](https://github.com/ClickHouse/ClickHouse/issues/37562). [#37650](https://github.com/ClickHouse/ClickHouse/pull/37650) ([Amos Bird](https://github.com/amosbird)).
* Fix columns number mismatch in cross join, close [#37561](https://github.com/ClickHouse/ClickHouse/issues/37561). [#37653](https://github.com/ClickHouse/ClickHouse/pull/37653) ([Vladimir C](https://github.com/vdimir)).
* Fix a segmentation fault in `show create table` from a MySQL database when it is configured with named collections. Closes [#37683](https://github.com/ClickHouse/ClickHouse/issues/37683). [#37690](https://github.com/ClickHouse/ClickHouse/pull/37690) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fix RabbitMQ Storage not being able to start up on server restart if the storage was created without a SETTINGS clause. Closes [#37463](https://github.com/ClickHouse/ClickHouse/issues/37463). [#37691](https://github.com/ClickHouse/ClickHouse/pull/37691) ([Kseniia Sumarokova](https://github.com/kssenii)).
* Fixed DateTime64 fractional seconds behavior prior to Unix epoch. [#37697](https://github.com/ClickHouse/ClickHouse/pull/37697) ([Andrey Zvonov](https://github.com/zvonand)).
* Disable CREATE/DROP of SQL user-defined functions in readonly mode. Closes [#37280](https://github.com/ClickHouse/ClickHouse/issues/37280). [#37699](https://github.com/ClickHouse/ClickHouse/pull/37699) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix formatting of Nullable arguments for executable user defined functions. Closes [#35897](https://github.com/ClickHouse/ClickHouse/issues/35897). [#37711](https://github.com/ClickHouse/ClickHouse/pull/37711) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix optimization enabled by setting `optimize_monotonous_functions_in_order_by` in distributed queries. Fixes [#36037](https://github.com/ClickHouse/ClickHouse/issues/36037). [#37724](https://github.com/ClickHouse/ClickHouse/pull/37724) ([Anton Popov](https://github.com/CurtizJ)).
* Fix `SELECT ... INTERSECT` and `EXCEPT SELECT` statements with constant string types (see the sketch after this list). [#37738](https://github.com/ClickHouse/ClickHouse/pull/37738) ([Antonio Andelic](https://github.com/antonio2368)).
* Fix crash of FunctionHashID, closes [#37735](https://github.com/ClickHouse/ClickHouse/issues/37735). [#37742](https://github.com/ClickHouse/ClickHouse/pull/37742) ([flynn](https://github.com/ucasfl)).
* Fix possible logical error: `Invalid Field get from type UInt64 to type Float64` in `values` table function. Closes [#37602](https://github.com/ClickHouse/ClickHouse/issues/37602). [#37754](https://github.com/ClickHouse/ClickHouse/pull/37754) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix possible segfault in schema inference in case of exception in SchemaReader constructor. Closes [#37680](https://github.com/ClickHouse/ClickHouse/issues/37680). [#37760](https://github.com/ClickHouse/ClickHouse/pull/37760) ([Kruglov Pavel](https://github.com/Avogar)).
* Fix setting cast_ipv4_ipv6_default_on_conversion_error for internal cast function. Closes [#35156](https://github.com/ClickHouse/ClickHouse/issues/35156). [#37761](https://github.com/ClickHouse/ClickHouse/pull/37761) ([Maksim Kita](https://github.com/kitaisreal)).
* Octal literals are not supported. [#37765](https://github.com/ClickHouse/ClickHouse/pull/37765) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Fix a `toString` error for the `Date32` data type. [#37775](https://github.com/ClickHouse/ClickHouse/pull/37775) ([LiuNeng](https://github.com/liuneng1994)).
* The clickhouse-keeper setting `dead_session_check_period_ms` was mistakenly interpreted as microseconds (multiplied by 1000), which led to dead sessions only being cleaned up after several minutes (instead of 500 ms). [#37824](https://github.com/ClickHouse/ClickHouse/pull/37824) ([Michael Lex](https://github.com/mlex)).
* Fix possible "No more packets are available" for distributed queries (in case of `async_socket_for_remote`/`use_hedged_requests` is disabled). [#37826](https://github.com/ClickHouse/ClickHouse/pull/37826) ([Azat Khuzhin](https://github.com/azat)).
* Do not drop the inner target table when executing `ALTER TABLE … MODIFY QUERY` in WindowView. [#37879](https://github.com/ClickHouse/ClickHouse/pull/37879) ([vxider](https://github.com/Vxider)).
* Fix directory ownership of coordination dir in clickhouse-keeper Docker image. Fixes [#37914](https://github.com/ClickHouse/ClickHouse/issues/37914). [#37915](https://github.com/ClickHouse/ClickHouse/pull/37915) ([James Maidment](https://github.com/jamesmaidment)).
* Dictionaries fix custom query with update field and `{condition}`. Closes [#33746](https://github.com/ClickHouse/ClickHouse/issues/33746). [#37947](https://github.com/ClickHouse/ClickHouse/pull/37947) ([Maksim Kita](https://github.com/kitaisreal)).
* Fix possible incorrect result of `SELECT ... WITH FILL` in the case when `ORDER BY` should be applied after `WITH FILL` result (e.g. for outer query). Incorrect result was caused by optimization for `ORDER BY` expressions ([#35623](https://github.com/ClickHouse/ClickHouse/issues/35623)). Closes [#37904](https://github.com/ClickHouse/ClickHouse/issues/37904). [#37959](https://github.com/ClickHouse/ClickHouse/pull/37959) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
* Add missing default columns when pushing to the target table in WindowView, fix [#37815](https://github.com/ClickHouse/ClickHouse/issues/37815). [#37965](https://github.com/ClickHouse/ClickHouse/pull/37965) ([vxider](https://github.com/Vxider)).
* Fixed a stack overflow issue that would cause compilation to fail. [#37996](https://github.com/ClickHouse/ClickHouse/pull/37996) ([Han Shukai](https://github.com/KinderRiven)).
* When `enable_filesystem_query_cache_limit` is enabled, throw an error if the reserved cache size exceeds the remaining cache size. [#38004](https://github.com/ClickHouse/ClickHouse/pull/38004) ([xiedeyantu](https://github.com/xiedeyantu)).
* Fix a bug where a query containing `ORDER BY ... WITH FILL` could generate extra rows when multiple `WITH FILL` columns are present (see the sketch after this list). [#38074](https://github.com/ClickHouse/ClickHouse/pull/38074) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)).
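A minimal reproduction-style sketch for the negative-`STEP` fix above (no result shown, as it depends on the fix):

``` sql
SELECT number
FROM numbers(3)
ORDER BY number DESC
WITH FILL FROM 10 TO 0 STEP -3;
```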
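And the constant-string `INTERSECT`/`EXCEPT` case in its simplest form:

``` sql
SELECT 'a' INTERSECT SELECT 'a'; -- previously could misbehave with constant strings
SELECT 'a' EXCEPT SELECT 'b';
```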
#### Bug Fix (user-visible misbehaviour in official stable or prestable release)
* Fix converting types for UNION queries (may produce LOGICAL_ERROR). [#34775](https://github.com/ClickHouse/ClickHouse/pull/34775) ([Azat Khuzhin](https://github.com/azat)).
* Fix a bug where a TTL merge may not be scheduled again if the BackgroundExecutor is busy: `merges_with_ttl_counter` is increased in `selectPartsToMerge()`, but when the merge task is ignored because the BackgroundExecutor is busy, the counter was not decreased. [#36387](https://github.com/ClickHouse/ClickHouse/pull/36387) ([lthaooo](https://github.com/lthaooo)).
* Fix an overridden setting value of `normalize_function_names`. [#36937](https://github.com/ClickHouse/ClickHouse/pull/36937) ([李扬](https://github.com/taiyang-li)).
* Fix for exponential time decaying window functions. Now respecting boundaries of the window. [#36944](https://github.com/ClickHouse/ClickHouse/pull/36944) ([Vladimir Chebotarev](https://github.com/excitoon)).
* Fix a bug parsing `DateTime64` from the string '1969-12-31 23:59:59.123' (see the sketch after this list). Close [#36994](https://github.com/ClickHouse/ClickHouse/issues/36994). [#37039](https://github.com/ClickHouse/ClickHouse/pull/37039) ([李扬](https://github.com/taiyang-li)).
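A one-line check of the pre-epoch `DateTime64` parsing fix above:

``` sql
SELECT toDateTime64('1969-12-31 23:59:59.123', 3, 'UTC');
```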
#### NO CL ENTRY
* NO CL ENTRY: 'Revert "Fix mutations in tables with columns of type `Object`"'. [#37355](https://github.com/ClickHouse/ClickHouse/pull/37355) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Remove height restrictions from the query div in play web tool, and m…"'. [#37501](https://github.com/ClickHouse/ClickHouse/pull/37501) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* NO CL ENTRY: 'Revert "Add support for preprocessing ZooKeeper operations in `clickhouse-keeper`"'. [#37534](https://github.com/ClickHouse/ClickHouse/pull/37534) ([Antonio Andelic](https://github.com/antonio2368)).
* NO CL ENTRY: 'Revert "(only with zero-copy replication, non-production experimental feature not recommended to use) fix possible deadlock during fetching part"'. [#37545](https://github.com/ClickHouse/ClickHouse/pull/37545) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "RFC: Fix converting types for UNION queries (may produce LOGICAL_ERROR)"'. [#37582](https://github.com/ClickHouse/ClickHouse/pull/37582) ([Dmitry Novik](https://github.com/novikd)).
* NO CL ENTRY: 'Revert "Revert "(only with zero-copy replication, non-production experimental feature not recommended to use) fix possible deadlock during fetching part""'. [#37598](https://github.com/ClickHouse/ClickHouse/pull/37598) ([alesapin](https://github.com/alesapin)).
* NO CL ENTRY: 'Revert "Implemented changing comment to a ReplicatedMergeTree table"'. [#37627](https://github.com/ClickHouse/ClickHouse/pull/37627) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Remove resursive submodules"'. [#37774](https://github.com/ClickHouse/ClickHouse/pull/37774) ([Alexey Milovidov](https://github.com/alexey-milovidov)).
* NO CL ENTRY: 'Revert "Fix possible segfault in schema inference"'. [#37785](https://github.com/ClickHouse/ClickHouse/pull/37785) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Revert "Fix possible segfault in schema inference""'. [#37787](https://github.com/ClickHouse/ClickHouse/pull/37787) ([Kruglov Pavel](https://github.com/Avogar)).
* NO CL ENTRY: 'Add more Rust client libraries to documentation'. [#37880](https://github.com/ClickHouse/ClickHouse/pull/37880) ([Paul Loyd](https://github.com/loyd)).
* NO CL ENTRY: 'Revert "Fix errors of CheckTriviallyCopyableMove type"'. [#37902](https://github.com/ClickHouse/ClickHouse/pull/37902) ([Anton Popov](https://github.com/CurtizJ)).
* NO CL ENTRY: 'Revert "Don't try to kill empty list of containers in `integration/runner`"'. [#38001](https://github.com/ClickHouse/ClickHouse/pull/38001) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "add d3js based trace visualizer as gantt chart"'. [#38043](https://github.com/ClickHouse/ClickHouse/pull/38043) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Add backoff to merges in replicated queue if `always_fetch_merged_part` is enabled"'. [#38082](https://github.com/ClickHouse/ClickHouse/pull/38082) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "More parallel execution for queries with `FINAL`"'. [#38094](https://github.com/ClickHouse/ClickHouse/pull/38094) ([Alexander Tokmakov](https://github.com/tavplubix)).
* NO CL ENTRY: 'Revert "Revert "add d3js based trace visualizer as gantt chart""'. [#38129](https://github.com/ClickHouse/ClickHouse/pull/38129) ([Alexey Milovidov](https://github.com/alexey-milovidov)).


@ -97,7 +97,7 @@ SELECT library_name, license_type, license_path FROM system.licenses ORDER BY li
## Adding new third-party libraries and maintaining patches in third-party libraries {#adding-third-party-libraries}
1. Each third-party library must reside in a dedicated directory under the `contrib/` directory of the ClickHouse repository. Avoid dumps/copies of external code, instead use Git submodule feature to pull third-party code from an external upstream repository.
2. Submodules are listed in `.gitmodule`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. the external library requires patching/customization, create a fork of the official repository in the [Clickhouse organization in GitHub](https://github.com/ClickHouse).
2. Submodules are listed in `.gitmodules`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. if the external library requires patching/customization, create a fork of the official repository in the [ClickHouse organization in GitHub](https://github.com/ClickHouse).
3. In the latter case, create a branch with `clickhouse/` prefix from the branch you want to integrate, e.g. `clickhouse/master` (for `master`) or `clickhouse/release/vX.Y.Z` (for a `release/vX.Y.Z` tag). The purpose of this branch is to isolate customization of the library from upstream work. For example, pulls from the upstream repository into the fork will leave all `clickhouse/` branches unaffected. Submodules in `contrib/` must only track `clickhouse/` branches of forked third-party repositories.
4. To patch a fork of a third-party library, create a dedicated branch with the `clickhouse/` prefix in the fork, e.g. `clickhouse/fix-some-disaster`. Finally, merge the patch branch into the custom tracking branch (e.g. `clickhouse/master` or `clickhouse/release/vX.Y.Z`) using a PR.
5. Always create patches of third-party libraries with the official repository in mind. Once a PR of a patch branch to the `clickhouse/` branch in the fork repository is done and the submodule version in the ClickHouse official repository is bumped, consider opening another PR from the patch branch to the upstream library repository. This ensures that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, and 3) the change will not remain a maintenance burden solely on ClickHouse developers.


@ -27,7 +27,7 @@ Compressed data for `INSERT` and `ALTER` queries is replicated (for more informa
- The `DROP TABLE` query deletes the replica located on the server where the query is run.
- The `RENAME` query renames the table on one of the replicas. In other words, replicated tables can have different names on different replicas.
ClickHouse uses [Apache ZooKeeper](https://zookeeper.apache.org) for storing replicas meta information. Use ZooKeeper version 3.4.5 or newer.
ClickHouse uses [ClickHouse Keeper](../../../guides/sre/keeper/clickhouse-keeper.md) for storing replicas meta information. It is possible to use ZooKeeper version 3.4.5 or newer, but ClickHouse Keeper is recommended.
To use replication, set parameters in the [zookeeper](../../../operations/server-configuration-parameters/settings.md#server-settings_zookeeper) server configuration section.
@ -35,7 +35,7 @@ To use replication, set parameters in the [zookeeper](../../../operations/server
Don't neglect the security setting. ClickHouse supports the `digest` [ACL scheme](https://zookeeper.apache.org/doc/current/zookeeperProgrammers.html#sc_ZooKeeperAccessControl) of the ZooKeeper security subsystem.
:::
Example of setting the addresses of the ZooKeeper cluster:
Example of setting the addresses of the ClickHouse Keeper cluster:
``` xml
<zookeeper>
@ -54,8 +54,8 @@ Example of setting the addresses of the ZooKeeper cluster:
</zookeeper>
```
ClickHouse also supports to store replicas meta information in the auxiliary ZooKeeper cluster by providing ZooKeeper cluster name and path as engine arguments.
In other word, it supports to store the metadata of differnt tables in different ZooKeeper clusters.
ClickHouse also supports storing replicas meta information in an auxiliary ZooKeeper cluster. Do this by providing the ZooKeeper cluster name and path as engine arguments.
In other words, it supports storing the metadata of different tables in different ZooKeeper clusters.
Example of setting the addresses of the auxiliary ZooKeeper cluster:
@ -122,8 +122,8 @@ The `Replicated` prefix is added to the table engine name. For example:`Replicat
**Replicated\*MergeTree parameters**
- `zoo_path` — The path to the table in ZooKeeper.
- `replica_name` — The replica name in ZooKeeper.
- `zoo_path` — The path to the table in ClickHouse Keeper.
- `replica_name` — The replica name in ClickHouse Keeper.
- `other_parameters` — Parameters of an engine which is used for creating the replicated version, for example, version in `ReplacingMergeTree`.
Example:
@ -168,18 +168,18 @@ Example:
</macros>
```
The path to the table in ZooKeeper should be unique for each replicated table. Tables on different shards should have different paths.
The path to the table in ClickHouse Keeper should be unique for each replicated table. Tables on different shards should have different paths.
In this case, the path consists of the following parts:
`/clickhouse/tables/` is the common prefix. We recommend using exactly this one.
`{layer}-{shard}` is the shard identifier. In this example it consists of two parts, since the example cluster uses bi-level sharding. For most tasks, you can leave just the {shard} substitution, which will be expanded to the shard identifier.
`table_name` is the name of the node for the table in ZooKeeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query.
`table_name` is the name of the node for the table in ClickHouse Keeper. It is a good idea to make it the same as the table name. It is defined explicitly, because in contrast to the table name, it does not change after a RENAME query.
*HINT*: you could add a database name in front of `table_name` as well. E.g. `db_name.table_name`
The two built-in substitutions `{database}` and `{table}` can be used, they expand into the table name and the database name respectively (unless these macros are defined in the `macros` section). So the zookeeper path can be specified as `'/clickhouse/tables/{layer}-{shard}/{database}/{table}'`.
Be careful with table renames when using these built-in substitutions. The path in Zookeeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in Zookeeper, and will go into read-only mode.
Be careful with table renames when using these built-in substitutions. The path in ClickHouse Keeper cannot be changed, and when the table is renamed, the macros will expand into a different path, the table will refer to a path that does not exist in ClickHouse Keeper, and will go into read-only mode.
The replica name identifies different replicas of the same table. You can use the server name for this, as in the example. The name only needs to be unique within each shard.
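Putting the path and replica name together, a table definition using these substitutions might look like this minimal sketch (the column set is illustrative):

``` sql
CREATE TABLE db_name.table_name
(
    EventDate Date,
    UserID UInt64
)
ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/{database}/{table}', '{replica}')
PARTITION BY toYYYYMM(EventDate)
ORDER BY UserID;
```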
@ -220,21 +220,21 @@ To delete a replica, run `DROP TABLE`. However, only one replica is deleted
## Recovery After Failures {#recovery-after-failures}
If ZooKeeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ZooKeeper.
If ClickHouse Keeper is unavailable when a server starts, replicated tables switch to read-only mode. The system periodically attempts to connect to ClickHouse Keeper.
If ZooKeeper is unavailable during an `INSERT`, or an error occurs when interacting with ZooKeeper, an exception is thrown.
If ClickHouse Keeper is unavailable during an `INSERT`, or an error occurs when interacting with ClickHouse Keeper, an exception is thrown.
After connecting to ZooKeeper, the system checks whether the set of data in the local file system matches the expected set of data (ZooKeeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas.
After connecting to ClickHouse Keeper, the system checks whether the set of data in the local file system matches the expected set of data (ClickHouse Keeper stores this information). If there are minor inconsistencies, the system resolves them by syncing data with the replicas.
If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ZooKeeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas.
If the system detects broken data parts (with the wrong size of files) or unrecognized parts (parts written to the file system but not recorded in ClickHouse Keeper), it moves them to the `detached` subdirectory (they are not deleted). Any missing parts are copied from the replicas.
Note that ClickHouse does not perform any destructive actions such as automatically deleting a large amount of data.
When the server starts (or establishes a new session with ZooKeeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary.
When the server starts (or establishes a new session with ClickHouse Keeper), it only checks the quantity and sizes of all files. If the file sizes match but bytes have been changed somewhere in the middle, this is not detected immediately, but only when attempting to read the data for a `SELECT` query. The query throws an exception about a non-matching checksum or size of a compressed block. In this case, data parts are added to the verification queue and copied from the replicas if necessary.
If the local set of data differs too much from the expected one, a safety mechanism is triggered. The server records this in the log and refuses to launch. The reason is that this case may indicate a configuration error, such as a replica on one shard being accidentally configured like a replica on a different shard. However, the thresholds for this mechanism are set fairly low, and this situation might occur during normal failure recovery. In this case, data is restored semi-automatically - by “pushing a button”.
To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ZooKeeper with any content, or run the command to restore all replicated tables:
To start recovery, create the node `/path_to_table/replica_name/flags/force_restore_data` in ClickHouse Keeper with any content, or run the command to restore all replicated tables:
``` bash
sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data
@ -249,11 +249,11 @@ If all data and metadata disappeared from one of the servers, follow these steps
1. Install ClickHouse on the server. Define substitutions correctly in the config file that contains the shard identifier and replicas, if you use them.
2. If you had unreplicated tables that must be manually duplicated on the servers, copy their data from a replica (in the directory `/var/lib/clickhouse/data/db_name/table_name/`).
3. Copy table definitions located in `/var/lib/clickhouse/metadata/` from a replica. If a shard or replica identifier is defined explicitly in the table definitions, correct it so that it corresponds to this replica. (Alternatively, start the server and make all the `ATTACH TABLE` queries that should have been in the .sql files in `/var/lib/clickhouse/metadata/`.)
4. To start recovery, create the ZooKeeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data`
4. To start recovery, create the ClickHouse Keeper node `/path_to_table/replica_name/flags/force_restore_data` with any content, or run the command to restore all replicated tables: `sudo -u clickhouse touch /var/lib/clickhouse/flags/force_restore_data`
Then start the server (restart, if it is already running). Data will be downloaded from replicas.
An alternative recovery option is to delete information about the lost replica from ZooKeeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”.
An alternative recovery option is to delete information about the lost replica from ClickHouse Keeper (`/path_to_table/replica_name`), then create the replica again as described in “[Creating replicated tables](#creating-replicated-tables)”.
There is no restriction on network bandwidth during recovery. Keep this in mind if you are restoring many replicas at once.
@ -276,13 +276,13 @@ Create a MergeTree table with a different name. Move all the data from the direc
If you want to get rid of a `ReplicatedMergeTree` table without launching the server:
- Delete the corresponding `.sql` file in the metadata directory (`/var/lib/clickhouse/metadata/`).
- Delete the corresponding path in ZooKeeper (`/path_to_table/replica_name`).
- Delete the corresponding path in ClickHouse Keeper (`/path_to_table/replica_name`).
After this, you can launch the server, create a `MergeTree` table, move the data to its directory, and then restart the server.
## Recovery When Metadata in the Zookeeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged}
## Recovery When Metadata in the ClickHouse Keeper Cluster Is Lost or Damaged {#recovery-when-metadata-in-the-zookeeper-cluster-is-lost-or-damaged}
If the data in ZooKeeper was lost or damaged, you can save data by moving it to an unreplicated table as described above.
If the data in ClickHouse Keeper was lost or damaged, you can save data by moving it to an unreplicated table as described above.
**See Also**


@ -7,4 +7,6 @@ sidebar_label: C++ Client Library
See README at [clickhouse-cpp](https://github.com/ClickHouse/clickhouse-cpp) repository.
[Original article](https://clickhouse.com/docs/en/interfaces/cpp/) <!--hide-->
# userver Asynchronous Framework
[userver (beta)](https://github.com/userver-framework/userver) has builtin support for ClickHouse.


@ -28,6 +28,9 @@ ClickHouse, Inc. does **not** maintain the tools and libraries listed below and
- [Kafka](https://kafka.apache.org)
- [clickhouse_sinker](https://github.com/housepower/clickhouse_sinker) (uses [Go client](https://github.com/ClickHouse/clickhouse-go/))
- [stream-loader-clickhouse](https://github.com/adform/stream-loader)
- Batch processing
- [Spark](https://spark.apache.org)
- [spark-clickhouse-connector](https://github.com/housepower/spark-clickhouse-connector)
- Stream processing
- [Flink](https://flink.apache.org)
- [flink-clickhouse-sink](https://github.com/ivi-ru/flink-clickhouse-sink)


@ -325,14 +325,14 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --
## Recovering after losing quorum
Because Clickhouse Keeper uses Raft it can tolerate certain amount of node crashes depending on the cluster size. \
Because ClickHouse Keeper uses Raft it can tolerate certain amount of node crashes depending on the cluster size. \
E.g. for a 3-node cluster, it will continue working correctly if only 1 node crashes.
Cluster configuration can be changed dynamically, but there are some limitations. Reconfiguration also relies on Raft, so to add or remove a node from the cluster you need to have a quorum. If you lose too many nodes in your cluster at the same time without any chance
of starting them again, Raft will stop working and will not allow you to reconfigure your cluster in the conventional way.
Nevertheless, Clickhouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node.
Nevertheless, ClickHouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node.
This should be done only as your last resort if you cannot start your nodes again, or start a new instance on the same endpoint.
Important things to note before continuing:


@ -1,6 +1,6 @@
# replication_queue
Contains information about tasks from replication queues stored in Clickhouse Keeper, or ZooKeeper, for tables in the `ReplicatedMergeTree` family.
Contains information about tasks from replication queues stored in ClickHouse Keeper, or ZooKeeper, for tables in the `ReplicatedMergeTree` family.
Columns:


@ -274,6 +274,6 @@ end script
## Antivirus software {#antivirus-software}
If you use antivirus software configure it to skip folders with Clickhouse datafiles (`/var/lib/clickhouse`) otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges.
If you use antivirus software, configure it to skip folders with ClickHouse data files (`/var/lib/clickhouse`), otherwise performance may be reduced and you may experience unexpected errors during data ingestion and background merges.
[Original article](https://clickhouse.com/docs/en/operations/tips/)


@ -19,11 +19,10 @@ This function encrypts data using these modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
- aes-128-gcm, aes-192-gcm, aes-256-gcm
- aes-128-ctr, aes-192-ctr, aes-256-ctr
**Syntax**
@ -63,9 +62,9 @@ Insert some data (please avoid storing the keys/ivs in the database as this unde
Query:
``` sql
INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\
('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\
('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212'));
```
@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test;
Result:
``` text
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
┌─comment──────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-ofb no IV │ B4972BDC4459 │
│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │
│ aes-256-ofb with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└──────────────────────────────────┴──────────────────────────────────┘
```
Example with `-gcm`:
@ -116,9 +115,7 @@ Supported encryption modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
**Syntax**
@ -145,7 +142,7 @@ Given equal input `encrypt` and `aes_encrypt_mysql` produce the same ciphertext:
Query:
``` sql
SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
```
Result:
@ -161,14 +158,14 @@ But `encrypt` fails when `key` or `iv` is longer than expected:
Query:
``` sql
SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
```
Result:
``` text
Received exception from server (version 21.1.2):
Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123').
Received exception from server (version 22.6.1):
Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123').
```
While `aes_encrypt_mysql` produces MySQL-compatible output:
@ -176,7 +173,7 @@ While `aes_encrypt_mysql` produces MySQL-compatible output:
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
```
Result:
@ -192,7 +189,7 @@ Notice how supplying even longer `IV` produces the same result
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
```
Result:
@ -206,7 +203,7 @@ Result:
Which is binary-equal to what MySQL produces on the same inputs:
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
mysql> SET block_encryption_mode='aes-256-ofb';
Query OK, 0 rows affected (0.00 sec)
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
@ -224,11 +221,10 @@ This function decrypts ciphertext into a plaintext using these modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
- aes-128-gcm, aes-192-gcm, aes-256-gcm
- aes-128-ctr, aes-192-ctr, aes-256-ctr
**Syntax**
@ -265,12 +261,12 @@ Result:
│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
└──────────────────────┴──────────────────────────────────────────────┘
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
┌─comment──────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-ofb no IV │ B4972BDC4459 │
│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │
│ aes-256-ofb with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└──────────────────────────────────┴──────────────────────────────────┘
```
Now let's try to decrypt all that data.
@ -284,13 +280,19 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920
Result:
``` text
┌─comment─────────────────────────────┬─plaintext─┐
│ aes-256-cfb128 no IV │ Secret │
│ aes-256-cfb128 no IV, different key │ <20>4<EFBFBD>
<20>
│ aes-256-cfb128 with IV │ <20><><EFBFBD>6<EFBFBD>~ │
│aes-256-cbc no IV │ <20>2*4<>h3c<33>4w<34><77>@
└─────────────────────────────────────┴───────────┘
┌─comment──────────────┬─plaintext──┐
│ aes-256-gcm │ OQ<4F>E
<20>t<EFBFBD>7T<37>\<5C><><EFBFBD>\<5C>
│ aes-256-gcm with AAD │ OQ<4F>E
<20>\<5C><>si<73><69><EFBFBD><EFBFBD>;<3B>o<EFBFBD><6F>
└──────────────────────┴────────────┘
┌─comment──────────────────────────┬─plaintext─┐
│ aes-256-ofb no IV │ Secret │
│ aes-256-ofb no IV, different key │ <20>4<EFBFBD>
<20>
│ aes-256-ofb with IV │ <20><><EFBFBD>6<EFBFBD>~ │
│aes-256-cbc no IV │ <20>2*4<>h3c<33>4w<34><77>@
└──────────────────────────────────┴───────────┘
```
Notice how only a portion of the data was properly decrypted, and the rest is gibberish since either `mode`, `key`, or `iv` were different upon encryption.
@ -305,9 +307,7 @@ Supported decryption modes:
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
**Syntax**
@ -332,7 +332,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
Let's decrypt data we've previously encrypted with MySQL:
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
mysql> SET block_encryption_mode='aes-256-ofb';
Query OK, 0 rows affected (0.00 sec)
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
@ -347,7 +347,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv
Query:
``` sql
SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext
SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext
```
Result:

View File

@ -273,16 +273,16 @@ Converts ASCII Latin symbols in a string to uppercase.
## lowerUTF8
Converts a string to lowercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text.
It does not detect the language. So for Turkish the result might not be exactly correct.
It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
If the string contains a set of bytes that is not UTF-8, then the behavior is undefined.
If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined.
## upperUTF8
Converts a string to uppercase, assuming the string contains a set of bytes that make up a UTF-8 encoded text.
It does not detect the language. So for Turkish the result might not be exactly correct.
It does not detect the language. E.g. for Turkish the result might not be exactly correct (i/İ vs. i/I).
If the length of the UTF-8 byte sequence is different for upper and lower case of a code point, the result may be incorrect for this code point.
If the string contains a set of bytes that is not UTF-8, then the behavior is undefined.
If the string contains a sequence of bytes that are not valid UTF-8, then the behavior is undefined.
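A minimal sketch of both functions (shown on a Cyrillic string, where the UTF-8 byte lengths of upper and lower case match, so the caveat above does not apply):
``` sql
SELECT lowerUTF8('ГРУППА') AS l, upperUTF8('группа') AS u;
-- l = 'группа', u = 'ГРУППА'
```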
## isValidUTF8

View File

@ -7,7 +7,7 @@ sidebar_label: For Searching in Strings
The search is case-sensitive by default in all these functions. There are separate variants for case insensitive search.
:::note
:::note
Functions for [replacing](../../sql-reference/functions/string-replace-functions.md) and [other manipulations with strings](../../sql-reference/functions/string-functions.md) are described separately.
:::
@ -31,7 +31,7 @@ position(needle IN haystack)
Alias: `locate(haystack, needle[, start_pos])`.
:::note
:::note
The `position(needle IN haystack)` syntax provides SQL compatibility; the function works the same way as `position(haystack, needle)`.
:::
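For example, both spellings below are equivalent and return `1`, the 1-based position of the match (a minimal sketch):
``` sql
SELECT position('Hello, world!', 'Hello') AS function_form,
       position('Hello' IN 'Hello, world!') AS operator_form;
-- function_form = 1, operator_form = 1
```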
@ -344,24 +344,27 @@ Returns 1, if at least one string needle<sub>i</sub> matches the string `haystac
For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
:::note
:::note
In all `multiSearch*` functions the number of needles must be less than 2<sup>8</sup> due to implementation details.
:::
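A minimal sketch of `multiSearchAny`:
``` sql
SELECT multiSearchAny('ClickHouse', ['House', 'DBMS']) AS found;
-- found = 1, because 'House' is a substring of 'ClickHouse'
```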
## match(haystack, pattern)
Checks whether the string matches the `pattern` regular expression. A `re2` regular expression. The [syntax](https://github.com/google/re2/wiki/Syntax) of the `re2` regular expressions is more limited than the syntax of the Perl regular expressions.
Checks whether the string matches the regular expression `pattern` in `re2` syntax. `Re2` has a more limited [syntax](https://github.com/google/re2/wiki/Syntax) than Perl regular expressions.
Returns 0 if it does not match, or 1 if it matches.
The regular expression works with the string as if it is a set of bytes. The regular expression can't contain null bytes.
Matching is based on UTF-8, e.g. `.` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes. The regular expression must not contain null bytes.
If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined.
No automatic Unicode normalization is performed; if you need it, you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that.
For patterns to search for substrings in a string, it is better to use LIKE or position, since they work much faster.
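For example (a minimal sketch; `re2` anchors behave as usual):
``` sql
SELECT match('Hello, world!', '^Hello') AS starts_with_hello,
       match('Hello, world!', 'planet$') AS ends_with_planet;
-- starts_with_hello = 1, ends_with_planet = 0
```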
## multiMatchAny(haystack, \[pattern<sub>1</sub>, pattern<sub>2</sub>, …, pattern<sub>n</sub>\])
The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster.
:::note
:::note
The length of any `haystack` string must be less than 2<sup>32</sup> bytes, otherwise an exception is thrown. This restriction exists because of the hyperscan API.
:::
@ -385,11 +388,11 @@ The same as `multiFuzzyMatchAny`, but returns any index that matches the haystac
The same as `multiFuzzyMatchAny`, but returns the array of all indices in any order that match the haystack within a constant edit distance.
:::note
:::note
`multiFuzzyMatch*` functions do not support UTF-8 regular expressions, and such expressions are treated as bytes because of hyperscan restriction.
:::
:::note
:::note
To turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.
:::
@ -405,7 +408,7 @@ Extracts all the fragments of a string using a regular expression. If haystac
Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc.
:::note
:::note
`extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical).
:::
@ -498,6 +501,10 @@ The regular expression can contain the metasymbols `%` and `_`.
Use the backslash (`\`) for escaping metasymbols. See the note on escaping in the description of the match function.
Matching is based on UTF-8, e.g. `_` matches the Unicode code point `¥` which is represented in UTF-8 using two bytes.
If the haystack or pattern contain a sequence of bytes that are not valid UTF-8, then the behavior is undefined.
No automatic Unicode normalization is performed; if you need it, you can use the [normalizeUTF8*()](https://clickhouse.com/docs/en/sql-reference/functions/string-functions/) functions for that.
For regular expressions like `%needle%`, the code is more optimal and works as fast as the `position` function.
For other regular expressions, the code is the same as for the match function.
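A minimal sketch of both metasymbols (`%` matches any number of characters, `_` exactly one):
``` sql
SELECT 'Hello, world!' LIKE '%world%' AS has_world,
       'Hello, world!' LIKE 'Hello_ world!' AS one_char_wildcard;
-- has_world = 1, one_char_wildcard = 1 (the `_` matches the comma)
```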
@ -509,6 +516,8 @@ The same thing as like, but negative.
Case insensitive variant of [like](https://clickhouse.com/docs/en/sql-reference/functions/string-search-functions/#function-like) function. You can use `ILIKE` operator instead of the `ilike` function.
The function ignores the language, e.g. for Turkish (i/İ), the result might be incorrect.
**Syntax**
``` sql
@ -577,7 +586,7 @@ Same as `ngramDistance` but calculates the non-symmetric difference between `nee
For case-insensitive search or/and in UTF-8 format use functions `ngramSearchCaseInsensitive, ngramSearchUTF8, ngramSearchCaseInsensitiveUTF8`.
:::note
:::note
For the UTF-8 case we use 3-gram distance. These are not perfectly fair n-gram distances: we use 2-byte hashes to hash n-grams and then calculate the (non-)symmetric difference between these hash tables, so collisions may occur. In the UTF-8 case-insensitive format we do not use a fair `tolower` function; instead we zero the 5-th bit (starting from zero) of each codepoint byte, and the first bit of the zeroth byte if there is more than one byte. This works for Latin and mostly for all Cyrillic letters.
:::
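A minimal sketch of `ngramSearch` (the exact value is implementation-defined; it is a number from 0 to 1, larger meaning the needle looks more like a part of the haystack):
``` sql
SELECT ngramSearch('ClickHouse', 'House') AS needle_likeness;
```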

View File

@ -43,28 +43,38 @@ For tuple subtraction: [tupleMinus](../../sql-reference/functions/tuple-function
## Comparison Operators
### equals function
`a = b` – The `equals(a, b)` function.
`a == b` – The `equals(a, b)` function.
### notEquals function
`a != b` – The `notEquals(a, b)` function.
`a <> b` – The `notEquals(a, b)` function.
### lessOrEquals function
`a <= b` – The `lessOrEquals(a, b)` function.
### greaterOrEquals function
`a >= b` – The `greaterOrEquals(a, b)` function.
### less function
`a < b` – The `less(a, b)` function.
### greater function
`a > b` – The `greater(a, b)` function.
### like function
`a LIKE s` – The `like(a, b)` function.
### notLike function
`a NOT LIKE s` – The `notLike(a, b)` function.
### ilike function
`a ILIKE s` – The `ilike(a, b)` function.
### BETWEEN function
`a BETWEEN b AND c` – The same as `a >= b AND a <= c`.
`a NOT BETWEEN b AND c` – The same as `a < b OR a > c`.
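For example (a minimal sketch of the rewrite described above):
``` sql
SELECT 5 BETWEEN 1 AND 10 AS in_range,         -- same as 5 >= 1 AND 5 <= 10, returns 1
       5 NOT BETWEEN 1 AND 10 AS out_of_range; -- same as 5 < 1 OR 5 > 10, returns 0
```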
@ -73,20 +83,28 @@ For tuple subtraction: [tupleMinus](../../sql-reference/functions/tuple-function
See [IN operators](../../sql-reference/operators/in.md) and [EXISTS](../../sql-reference/operators/exists.md) operator.
### in function
`a IN ...` – The `in(a, b)` function.
### notIn function
`a NOT IN ...` – The `notIn(a, b)` function.
### globalIn function
`a GLOBAL IN ...` – The `globalIn(a, b)` function.
### globalNotIn function
`a GLOBAL NOT IN ...` – The `globalNotIn(a, b)` function.
### in subquery function
`a = ANY (subquery)` – The `in(a, subquery)` function.
### notIn subquery function
`a != ANY (subquery)` – The same as `a NOT IN (SELECT singleValueOrNull(*) FROM subquery)`.
### in subquery function
`a = ALL (subquery)` – The same as `a IN (SELECT singleValueOrNull(*) FROM subquery)`.
### notIn subquery function
`a != ALL (subquery)` – The `notIn(a, subquery)` function.
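A minimal sketch of the `ANY`/`ALL` rewrites above, using the `numbers` table function (which yields 0, 1, 2 here):
``` sql
SELECT 1 = ANY (SELECT number FROM numbers(3)) AS eq_any,  -- in(1, subquery): returns 1
       1 != ALL (SELECT number FROM numbers(3)) AS ne_all; -- notIn(1, subquery): returns 0
```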

View File

@ -48,9 +48,9 @@ You can see that `GROUP BY` for `y = NULL` summed up `x`, as if `NULL` is this v
If you pass several keys to `GROUP BY`, the result will give you all the combinations of the selection, as if `NULL` were a specific value.
## WITH ROLLUP Modifier
## ROLLUP Modifier
The `WITH ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotal rows are added after the result table.
The `ROLLUP` modifier is used to calculate subtotals for the key expressions, based on their order in the `GROUP BY` list. The subtotal rows are added after the result table.
The subtotals are calculated in reverse order: first, subtotals are calculated for the last key expression in the list, then for the previous one, and so on up to the first key expression.
@ -78,7 +78,7 @@ Consider the table t:
Query:
```sql
SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP;
SELECT year, month, day, count(*) FROM t GROUP BY ROLLUP(year, month, day);
```
As the `GROUP BY` section has three key expressions, the result contains four tables with subtotals "rolled up" from right to left:
@ -109,10 +109,14 @@ As `GROUP BY` section has three key expressions, the result contains four tables
│ 0 │ 0 │ 0 │ 6 │
└──────┴───────┴─────┴─────────┘
```
The same query can also be written using the `WITH` keyword.
```sql
SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP;
```
## WITH CUBE Modifier
## CUBE Modifier
The `WITH CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotal rows are added after the result table.
The `CUBE` modifier is used to calculate subtotals for every combination of the key expressions in the `GROUP BY` list. The subtotal rows are added after the result table.
In the subtotal rows, the values of all "grouped" key expressions are set to `0` or an empty line.
@ -138,7 +142,7 @@ Consider the table t:
Query:
```sql
SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE;
SELECT year, month, day, count(*) FROM t GROUP BY CUBE(year, month, day);
```
As the `GROUP BY` section has three key expressions, the result contains eight tables with subtotals for all key expression combinations:
@ -196,6 +200,10 @@ Columns, excluded from `GROUP BY`, are filled with zeros.
│ 0 │ 0 │ 0 │ 6 │
└──────┴───────┴─────┴─────────┘
```
The same query can also be written using the `WITH` keyword.
```sql
SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH CUBE;
```
## WITH TOTALS Modifier
@ -260,6 +268,39 @@ GROUP BY domain
For every different key value encountered, `GROUP BY` calculates a set of aggregate function values.
## GROUPING SETS modifier
This is the most general modifier.
This modifier allows you to manually specify several aggregation key sets (grouping sets).
Aggregation is performed separately for each grouping set, after which all results are combined.
If a column is not present in a grouping set, it's filled with a default value.
In other words, the modifiers described above can all be expressed via `GROUPING SETS`.
Although queries with the `ROLLUP`, `CUBE`, and `GROUPING SETS` modifiers can be syntactically equivalent, they may perform differently:
while `GROUPING SETS` tries to execute everything in parallel, `ROLLUP` and `CUBE` execute the final merging of the aggregates in a single thread.
When source columns contain default values, it might be hard to distinguish whether a row is part of an aggregation that uses those columns as keys.
To solve this problem, the `GROUPING` function must be used (see the sketch after the example below).
**Example**
The following two queries are equivalent.
```sql
-- Query 1
SELECT year, month, day, count(*) FROM t GROUP BY year, month, day WITH ROLLUP;
-- Query 2
SELECT year, month, day, count(*) FROM t GROUP BY
GROUPING SETS
(
(year, month, day),
(year, month),
(year),
()
);
```
## Implementation Details
Aggregation is one of the most important features of a column-oriented DBMS, and thus its implementation is one of the most heavily optimized parts of ClickHouse. By default, aggregation is done in memory using a hash-table. It has 40+ specializations that are chosen automatically depending on “grouping key” data types.

View File

@ -55,3 +55,372 @@ https://dev.mysql.com/doc/refman/8.0/en/window-function-descriptions.html
https://dev.mysql.com/doc/refman/8.0/en/window-functions-usage.html
https://dev.mysql.com/doc/refman/8.0/en/window-functions-frames.html
## Syntax
```text
aggregate_function (column_name)
OVER ([PARTITION BY grouping_column] [ORDER BY sorting_column]
[ROWS or RANGE expression_to_bounds_of_frame])
```
- `PARTITION BY` - defines how to break the result set into groups.
- `ORDER BY` - defines how to order rows inside the group during the calculation of aggregate_function.
- `ROWS or RANGE` - defines the bounds of a frame; aggregate_function is calculated within a frame.
```text
PARTITION
┌─────────────────┐ <-- UNBOUNDED PRECEDING (BEGINNING of the PARTITION)
│ │
│ │
│=================│ <-- N PRECEDING <
│ N ROWS │ │ F
│ Before CURRENT │ │ R
│~~~~~~~~~~~~~~~~~│ <-- CURRENT ROW A
│ M ROWS │ │ M
│ After CURRENT │ │ E
│=================│ <-- M FOLLOWING <
│ │
│ │
└─────────────────┘ <--- UNBOUNDED FOLLOWING (END of the PARTITION)
```
## Examples
```sql
CREATE TABLE wf_partition
(
`part_key` UInt64,
`value` UInt64,
`order` UInt64
)
ENGINE = Memory;
INSERT INTO wf_partition FORMAT Values
(1,1,1), (1,2,2), (1,3,3), (2,0,0), (3,0,0);
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key) AS frame_values
FROM wf_partition
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3] │ <
│ 1 │ 2 │ 2 │ [1,2,3] │ │ 1-st group
│ 1 │ 3 │ 3 │ [1,2,3] │ <
│ 2 │ 0 │ 0 │ [0] │ <- 2-nd group
│ 3 │ 0 │ 0 │ [0] │ <- 3-rd group
└──────────┴───────┴───────┴──────────────┘
```
```sql
CREATE TABLE wf_frame
(
`part_key` UInt64,
`value` UInt64,
`order` UInt64
)
ENGINE = Memory;
INSERT INTO wf_frame FORMAT Values
(1,1,1), (1,2,2), (1,3,3), (1,4,4), (1,5,5);
-- frame is bounded by the bounds of a partition (BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [1,2,3,4,5] │
│ 1 │ 4 │ 4 │ [1,2,3,4,5] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- short form - no bound expression, no order by
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [1,2,3,4,5] │
│ 1 │ 4 │ 4 │ [1,2,3,4,5] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- frame is bounded by the beginning of a partition and the current row
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [1,2,3] │
│ 1 │ 4 │ 4 │ [1,2,3,4] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- short form (frame is bounded by the beginning of a partition and the current row)
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [1,2,3] │
│ 1 │ 4 │ 4 │ [1,2,3,4] │
│ 1 │ 5 │ 5 │ [1,2,3,4,5] │
└──────────┴───────┴───────┴──────────────┘
-- frame is bounded by the beginning of a partition and the current row, but order is backward
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order DESC) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [5,4,3,2,1] │
│ 1 │ 2 │ 2 │ [5,4,3,2] │
│ 1 │ 3 │ 3 │ [5,4,3] │
│ 1 │ 4 │ 4 │ [5,4] │
│ 1 │ 5 │ 5 │ [5] │
└──────────┴───────┴───────┴──────────────┘
-- sliding frame - 1 PRECEDING ROW AND CURRENT ROW
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN 1 PRECEDING AND CURRENT ROW) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1] │
│ 1 │ 2 │ 2 │ [1,2] │
│ 1 │ 3 │ 3 │ [2,3] │
│ 1 │ 4 │ 4 │ [3,4] │
│ 1 │ 5 │ 5 │ [4,5] │
└──────────┴───────┴───────┴──────────────┘
-- sliding frame - Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING
SELECT
part_key,
value,
order,
groupArray(value) OVER (PARTITION BY part_key ORDER BY order ASC
Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS frame_values
FROM wf_frame
ORDER BY
part_key ASC,
value ASC;
┌─part_key─┬─value─┬─order─┬─frame_values─┐
│ 1 │ 1 │ 1 │ [1,2,3,4,5] │
│ 1 │ 2 │ 2 │ [1,2,3,4,5] │
│ 1 │ 3 │ 3 │ [2,3,4,5] │
│ 1 │ 4 │ 4 │ [3,4,5] │
│ 1 │ 5 │ 5 │ [4,5] │
└──────────┴───────┴───────┴──────────────┘
```
## Real world examples
### Maximum/total salary per department.
```sql
CREATE TABLE employees
(
`department` String,
`employee_name` String,
`salary` Float
)
ENGINE = Memory;
INSERT INTO employees FORMAT Values
('Finance', 'John', 200),
('Finance', 'Joan', 210),
('Finance', 'Jean', 505),
('IT', 'Tim', 200),
('IT', 'Anna', 300),
('IT', 'Elen', 500);
SELECT
department,
employee_name AS emp,
salary,
max_salary_per_dep,
total_salary_per_dep,
round((salary / total_salary_per_dep) * 100, 2) AS `share_per_dep(%)`
FROM
(
SELECT
department,
employee_name,
salary,
max(salary) OVER wndw AS max_salary_per_dep,
sum(salary) OVER wndw AS total_salary_per_dep
FROM employees
WINDOW wndw AS (PARTITION BY department
rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
ORDER BY
department ASC,
employee_name ASC
);
┌─department─┬─emp──┬─salary─┬─max_salary_per_dep─┬─total_salary_per_dep─┬─share_per_dep(%)─┐
│ Finance │ Jean │ 505 │ 505 │ 915 │ 55.19 │
│ Finance │ Joan │ 210 │ 505 │ 915 │ 22.95 │
│ Finance │ John │ 200 │ 505 │ 915 │ 21.86 │
│ IT │ Anna │ 300 │ 500 │ 1000 │ 30 │
│ IT │ Elen │ 500 │ 500 │ 1000 │ 50 │
│ IT │ Tim │ 200 │ 500 │ 1000 │ 20 │
└────────────┴──────┴────────┴────────────────────┴──────────────────────┴──────────────────┘
```
### Cumulative sum.
```sql
CREATE TABLE warehouse
(
`item` String,
`ts` DateTime,
`value` Float
)
ENGINE = Memory;
INSERT INTO warehouse VALUES
('sku38', '2020-01-01', 9),
('sku38', '2020-02-01', 1),
('sku38', '2020-03-01', -4),
('sku1', '2020-01-01', 1),
('sku1', '2020-02-01', 1),
('sku1', '2020-03-01', 1);
SELECT
item,
ts,
value,
sum(value) OVER (PARTITION BY item ORDER BY ts ASC) AS stock_balance
FROM warehouse
ORDER BY
item ASC,
ts ASC;
┌─item──┬──────────────────ts─┬─value─┬─stock_balance─┐
│ sku1 │ 2020-01-01 00:00:00 │ 1 │ 1 │
│ sku1 │ 2020-02-01 00:00:00 │ 1 │ 2 │
│ sku1 │ 2020-03-01 00:00:00 │ 1 │ 3 │
│ sku38 │ 2020-01-01 00:00:00 │ 9 │ 9 │
│ sku38 │ 2020-02-01 00:00:00 │ 1 │ 10 │
│ sku38 │ 2020-03-01 00:00:00 │ -4 │ 6 │
└───────┴─────────────────────┴───────┴───────────────┘
```
### Moving / Sliding Average (per 3 rows)
```sql
CREATE TABLE sensors
(
`metric` String,
`ts` DateTime,
`value` Float
)
ENGINE = Memory;
INSERT INTO sensors VALUES ('cpu_temp', '2020-01-01 00:00:00', 87),
('cpu_temp', '2020-01-01 00:00:01', 77),
('cpu_temp', '2020-01-01 00:00:02', 93),
('cpu_temp', '2020-01-01 00:00:03', 87),
('cpu_temp', '2020-01-01 00:00:04', 87),
('cpu_temp', '2020-01-01 00:00:05', 87),
('cpu_temp', '2020-01-01 00:00:06', 87),
('cpu_temp', '2020-01-01 00:00:07', 87);
SELECT
metric,
ts,
value,
avg(value) OVER
(PARTITION BY metric ORDER BY ts ASC Rows BETWEEN 2 PRECEDING AND CURRENT ROW)
AS moving_avg_temp
FROM sensors
ORDER BY
metric ASC,
ts ASC;
┌─metric───┬──────────────────ts─┬─value─┬───moving_avg_temp─┐
│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:00:01 │ 77 │ 82 │
│ cpu_temp │ 2020-01-01 00:00:02 │ 93 │ 85.66666666666667 │
│ cpu_temp │ 2020-01-01 00:00:03 │ 87 │ 85.66666666666667 │
│ cpu_temp │ 2020-01-01 00:00:04 │ 87 │ 89 │
│ cpu_temp │ 2020-01-01 00:00:05 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:00:06 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:00:07 │ 87 │ 87 │
└──────────┴─────────────────────┴───────┴───────────────────┘
```
### Moving / Sliding Average (per 10 seconds)
```sql
SELECT
metric,
ts,
value,
avg(value) OVER (PARTITION BY metric ORDER BY ts
Range BETWEEN 10 PRECEDING AND CURRENT ROW) AS moving_avg_10_seconds_temp
FROM sensors
ORDER BY
metric ASC,
ts ASC;
┌─metric───┬──────────────────ts─┬─value─┬─moving_avg_10_seconds_temp─┐
│ cpu_temp │ 2020-01-01 00:00:00 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:01:10 │ 77 │ 77 │
│ cpu_temp │ 2020-01-01 00:02:20 │ 93 │ 93 │
│ cpu_temp │ 2020-01-01 00:03:30 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:04:40 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:05:50 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:06:00 │ 87 │ 87 │
│ cpu_temp │ 2020-01-01 00:07:10 │ 87 │ 87 │
└──────────┴─────────────────────┴───────┴────────────────────────────┘
```

View File

@ -39,6 +39,6 @@ Question candidates:
- How to kill a process (query) in ClickHouse?
- How to implement pivot (like in pandas)?
- How to remove the default ClickHouse user through users.d?
- Importing MySQL dump to Clickhouse
- Importing MySQL dump to ClickHouse
- Window function workarounds (row\_number, lag/lead, running diff/sum/average)
##}

View File

@ -55,5 +55,5 @@ ORDER BY id
## See also
- [Reducing Clickhouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/).
- [Reducing ClickHouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/).
- [String Optimization (video presentation in Russian)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [Slides in English](https://github.com/ClickHouse/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf).

View File

@ -11,7 +11,7 @@ sidebar_label: "Функции для шифрования"
The initialization vector is always 16 bytes long (extra bytes are ignored).
Note that prior to Clickhouse version 21.1 these functions were slow.
Note that prior to ClickHouse version 21.1 these functions were slow.
## encrypt {#encrypt}
@ -19,11 +19,10 @@ sidebar_label: "Функции для шифрования"
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
- aes-128-gcm, aes-192-gcm, aes-256-gcm
- aes-128-ctr, aes-192-ctr, aes-256-ctr
**Syntax**
@ -63,9 +62,9 @@ ENGINE = Memory;
Query:
``` sql
INSERT INTO encryption_test VALUES('aes-256-cfb128 no IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212')),\
('aes-256-cfb128 no IV, different key', encrypt('aes-256-cfb128', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
('aes-256-cfb128 with IV', encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
INSERT INTO encryption_test VALUES('aes-256-ofb no IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212')),\
('aes-256-ofb no IV, different key', encrypt('aes-256-ofb', 'Secret', 'keykeykeykeykeykeykeykeykeykeyke')),\
('aes-256-ofb with IV', encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv')),\
('aes-256-cbc no IV', encrypt('aes-256-cbc', 'Secret', '12345678910121314151617181920212'));
```
@ -78,12 +77,12 @@ SELECT comment, hex(secret) FROM encryption_test;
Result:
``` text
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
┌─comment──────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-ofb no IV │ B4972BDC4459 │
│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │
│ aes-256-ofb with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└──────────────────────────────────┴──────────────────────────────────┘
```
Example with `-gcm` mode:
@ -116,9 +115,7 @@ SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%';
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
**Syntax**
@ -145,7 +142,7 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv])
Query:
``` sql
SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
SELECT encrypt('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') = aes_encrypt_mysql('aes-256-ofb', 'Secret', '12345678910121314151617181920212', 'iviviviviviviviv') AS ciphertexts_equal;
```
Result:
@ -161,14 +158,14 @@ SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', '
Query:
``` sql
SELECT encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
SELECT encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123');
```
Result:
``` text
Received exception from server (version 21.1.2):
Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123').
Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123').
```
However, in a similar case the `aes_encrypt_mysql` function returns a result that MySQL can process:
@ -176,7 +173,7 @@ Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid ke
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123')) AS ciphertext;
```
Result:
@ -192,7 +189,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161
Query:
``` sql
SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
SELECT hex(aes_encrypt_mysql('aes-256-ofb', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456')) AS ciphertext
```
Result:
@ -206,7 +203,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161
This matches the result MySQL returns for the same input values:
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
mysql> SET block_encryption_mode='aes-256-ofb';
Query OK, 0 rows affected (0.00 sec)
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
@ -224,11 +221,10 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
- aes-128-gcm, aes-192-gcm, aes-256-gcm
- aes-128-ctr, aes-192-ctr, aes-256-ctr
**Syntax**
@ -265,12 +261,12 @@ SELECT comment, hex(secret) FROM encryption_test;
│ aes-256-gcm │ A8A3CCBC6426CFEEB60E4EAE03D3E94204C1B09E0254 │
│ aes-256-gcm with AAD │ A8A3CCBC6426D9A1017A0A932322F1852260A4AD6837 │
└──────────────────────┴──────────────────────────────────────────────┘
┌─comment─────────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-cfb128 no IV │ B4972BDC4459 │
│ aes-256-cfb128 no IV, different key │ 2FF57C092DC9 │
│ aes-256-cfb128 with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└─────────────────────────────────────┴──────────────────────────────────┘
┌─comment──────────────────────────┬─hex(secret)──────────────────────┐
│ aes-256-ofb no IV │ B4972BDC4459 │
│ aes-256-ofb no IV, different key │ 2FF57C092DC9 │
│ aes-256-ofb with IV │ 5E6CB398F653 │
│ aes-256-cbc no IV │ 1BC0629A92450D9E73A00E7D02CF4142 │
└──────────────────────────────────┴──────────────────────────────────┘
```
Now let's try to decrypt this data:
@ -278,19 +274,25 @@ SELECT comment, hex(secret) FROM encryption_test;
Query:
``` sql
SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test;
SELECT comment, decrypt('aes-256-ofb', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test;
```
Result:
``` text
┌─comment─────────────────────────────┬─plaintext─┐
│ aes-256-cfb128 no IV │ Secret │
│ aes-256-cfb128 no IV, different key │ <20>4<EFBFBD>
<20>
│ aes-256-cfb128 with IV │ <20><><EFBFBD>6<EFBFBD>~ │
│aes-256-cbc no IV │ <20>2*4<>h3c<33>4w<34><77>@
└─────────────────────────────────────┴───────────┘
┌─comment──────────────┬─plaintext──┐
│ aes-256-gcm │ OQ<4F>E
<20>t<EFBFBD>7T<37>\<5C><><EFBFBD>\<5C>
│ aes-256-gcm with AAD │ OQ<4F>E
<20>\<5C><>si<73><69><EFBFBD><EFBFBD>;<3B>o<EFBFBD><6F>
└──────────────────────┴────────────┘
┌─comment──────────────────────────┬─plaintext─┐
│ aes-256-ofb no IV │ Secret │
│ aes-256-ofb no IV, different key │ <20>4<EFBFBD>
<20>
│ aes-256-ofb with IV │ <20><><EFBFBD>6<EFBFBD>~ │
│aes-256-cbc no IV │ <20>2*4<>h3c<33>4w<34><77>@
└──────────────────────────────────┴───────────┘
```
Note that only part of the data was decrypted correctly. The rest is gibberish, since different `mode`, `key`, or `iv` values were used during encryption.
@ -305,9 +307,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920
- aes-128-ecb, aes-192-ecb, aes-256-ecb
- aes-128-cbc, aes-192-cbc, aes-256-cbc
- aes-128-cfb1, aes-192-cfb1, aes-256-cfb1
- aes-128-cfb8, aes-192-cfb8, aes-256-cfb8
- aes-128-cfb128, aes-192-cfb128, aes-256-cfb128
- aes-128-cfb128
- aes-128-ofb, aes-192-ofb, aes-256-ofb
**Syntax**
@ -333,7 +333,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv])
``` sql
mysql> SET block_encryption_mode='aes-256-cfb128';
mysql> SET block_encryption_mode='aes-256-ofb';
Query OK, 0 rows affected (0.00 sec)
mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviviviviviviv123456') as ciphertext;
@ -348,7 +348,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv
Query:
``` sql
SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext;
SELECT aes_decrypt_mysql('aes-256-ofb', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext;
```
Result:

View File

@ -39,7 +39,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try
{
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as<std::string>(), logger);
DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as<std::string>(), logger);

View File

@ -3,7 +3,6 @@
#include <Client/ClientBase.h>
#include <Client/LocalConnection.h>
#include <Common/ProgressIndication.h>
#include <Common/StatusFile.h>
#include <Common/InterruptListener.h>
#include <Loggers/Loggers.h>

View File

@ -1515,7 +1515,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
/// Init trace collector only after trace_log system table was created
/// Disable it if we collect test coverage information, because it will work extremely slow.
#if USE_UNWIND && !WITH_COVERAGE && defined(__x86_64__)
#if USE_UNWIND && !WITH_COVERAGE
/// Profilers cannot work reliably with any other libunwind or without PHDR cache.
if (hasPHDRCache())
{

View File

@ -76,27 +76,27 @@ public:
data(place).~Data();
}
void add(AggregateDataPtr, const IColumn **, size_t, Arena *) const override
void add(AggregateDataPtr __restrict, const IColumn **, size_t, Arena *) const override
{
}
void merge(AggregateDataPtr, ConstAggregateDataPtr, Arena *) const override
void merge(AggregateDataPtr __restrict, ConstAggregateDataPtr, Arena *) const override
{
}
void serialize(ConstAggregateDataPtr, WriteBuffer & buf, std::optional<size_t> /* version */) const override
void serialize(ConstAggregateDataPtr __restrict, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
char c = 0;
buf.write(c);
}
void deserialize(AggregateDataPtr /* place */, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
void deserialize(AggregateDataPtr __restrict /* place */, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
char c = 0;
buf.read(c);
}
void insertResultInto(AggregateDataPtr, IColumn & to, Arena *) const override
void insertResultInto(AggregateDataPtr __restrict, IColumn & to, Arena *) const override
{
to.insertDefault();
}

View File

@ -236,7 +236,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena *,
ssize_t if_argument_pos) const final
@ -260,7 +260,7 @@ public:
void addBatchSinglePlaceNotNull(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena *,

View File

@ -41,7 +41,7 @@ public:
memset(place, 0, sizeOfData());
}
void destroy(AggregateDataPtr) const noexcept override
void destroy(AggregateDataPtr __restrict) const noexcept override
{
// nothing
}
@ -61,7 +61,7 @@ public:
return alignof(T);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto * y_col = static_cast<const ColumnUInt8 *>(columns[category_count]);
bool y = y_col->getData()[row_num];
@ -78,7 +78,7 @@ public:
reinterpret_cast<T *>(place)[category_count * 2 + size_t(y)] += 1;
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
for (size_t i : collections::range(0, category_count + 1))
{
@ -87,12 +87,12 @@ public:
}
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
buf.write(place, sizeOfData());
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
buf.read(place, sizeOfData());
}

View File

@ -65,7 +65,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena *,
ssize_t if_argument_pos) const override
@ -84,7 +84,7 @@ public:
void addBatchSinglePlaceNotNull(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena *,
@ -222,7 +222,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena *,
ssize_t if_argument_pos) const override

View File

@ -122,7 +122,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t) const override

View File

@ -100,7 +100,7 @@ public:
void addBatch(
size_t row_begin,
size_t row_end,
AggregateDataPtr * places,
AggregateDataPtr * __restrict places,
size_t place_offset,
const IColumn ** columns,
Arena * arena,
@ -112,7 +112,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t) const override
@ -123,7 +123,7 @@ public:
void addBatchSinglePlaceNotNull(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,

View File

@ -362,7 +362,7 @@ public:
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override { this->data(place).read(buf); }
void predictValues(
ConstAggregateDataPtr place,
ConstAggregateDataPtr __restrict place,
IColumn & to,
const ColumnsWithTypeAndName & arguments,
size_t offset,

View File

@ -105,7 +105,7 @@ public:
DataTypePtr getReturnType() const override { return std::make_shared<DataTypeMap>(DataTypes{key_type, nested_func->getReturnType()}); }
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
const auto & map_column = assert_cast<const ColumnMap &>(*columns[0]);
const auto & map_nested_tuple = map_column.getNestedData();
@ -160,7 +160,7 @@ public:
}
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & merged_maps = this->data(place).merged_maps;
const auto & rhs_maps = this->data(rhs).merged_maps;
@ -178,7 +178,7 @@ public:
}
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
auto & merged_maps = this->data(place).merged_maps;
writeVarUInt(merged_maps.size(), buf);
@ -190,7 +190,7 @@ public:
}
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
auto & merged_maps = this->data(place).merged_maps;
UInt64 size;
@ -209,7 +209,7 @@ public:
}
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const override
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * arena) const override
{
auto & map_column = assert_cast<ColumnMap &>(to);
auto & nested_column = map_column.getNestedColumn();

View File

@ -33,11 +33,11 @@ public:
bool allocatesMemoryInArena() const override { return false; }
void create(AggregateDataPtr) const override
void create(AggregateDataPtr __restrict) const override
{
}
void destroy(AggregateDataPtr) const noexcept override
void destroy(AggregateDataPtr __restrict) const noexcept override
{
}
@ -56,11 +56,11 @@ public:
return 1;
}
void add(AggregateDataPtr, const IColumn **, size_t, Arena *) const override
void add(AggregateDataPtr __restrict, const IColumn **, size_t, Arena *) const override
{
}
void merge(AggregateDataPtr, ConstAggregateDataPtr, Arena *) const override
void merge(AggregateDataPtr __restrict, ConstAggregateDataPtr, Arena *) const override
{
}
@ -69,14 +69,14 @@ public:
writeChar('\0', buf);
}
void deserialize(AggregateDataPtr, ReadBuffer & buf, std::optional<size_t>, Arena *) const override
void deserialize(AggregateDataPtr __restrict, ReadBuffer & buf, std::optional<size_t>, Arena *) const override
{
[[maybe_unused]] char symbol;
readChar(symbol, buf);
assert(symbol == '\0');
}
void insertResultInto(AggregateDataPtr, IColumn & to, Arena *) const override
void insertResultInto(AggregateDataPtr __restrict, IColumn & to, Arena *) const override
{
to.insertDefault();
}

View File

@ -309,7 +309,7 @@ public:
void addBatchSinglePlace( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const override

View File

@ -99,7 +99,7 @@ public:
}
void add(
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
size_t row_num,
Arena * arena) const override
@ -138,7 +138,7 @@ public:
void addBatchSinglePlace( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const override
@ -169,7 +169,7 @@ public:
void addBatchSinglePlaceNotNull( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
@ -206,7 +206,7 @@ public:
}
void merge(
AggregateDataPtr place,
AggregateDataPtr __restrict place,
ConstAggregateDataPtr rhs,
Arena * arena) const override
{
@ -227,14 +227,14 @@ public:
(places[i] + place_offset)[size_of_data] |= rhs[i][size_of_data];
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf, std::optional<size_t> version) const override
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> version) const override
{
nested_function->serialize(place, buf, version);
writeChar(place[size_of_data], buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> version, Arena * arena) const override
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> version, Arena * arena) const override
{
nested_function->deserialize(place, buf, version, arena);
@ -261,7 +261,7 @@ public:
}
void insertResultInto(
AggregateDataPtr place,
AggregateDataPtr __restrict place,
IColumn & to,
Arena * arena) const override
{

View File

@ -134,7 +134,7 @@ public:
nested_function->destroy(place + i * size_of_data);
}
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Key key;
@ -151,19 +151,19 @@ public:
nested_function->add(place + pos * size_of_data, columns, row_num, arena);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
for (size_t i = 0; i < total; ++i)
nested_function->merge(place + i * size_of_data, rhs + i * size_of_data, arena);
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf, std::optional<size_t> version) const override
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> version) const override
{
for (size_t i = 0; i < total; ++i)
nested_function->serialize(place + i * size_of_data, buf, version);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> version, Arena * arena) const override
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> version, Arena * arena) const override
{
for (size_t i = 0; i < total; ++i)
nested_function->deserialize(place + i * size_of_data, buf, version, arena);
@ -174,7 +174,7 @@ public:
return std::make_shared<DataTypeArray>(nested_function->getReturnType());
}
void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * arena) const override
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * arena) const override
{
auto & col = assert_cast<ColumnArray &>(to);
auto & col_offsets = assert_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());

View File

@ -158,8 +158,8 @@ class SequenceNextNodeImpl final
using Self = SequenceNextNodeImpl<T, Node>;
using Data = SequenceNextNodeGeneralData<Node>;
static Data & data(AggregateDataPtr place) { return *reinterpret_cast<Data *>(place); }
static const Data & data(ConstAggregateDataPtr place) { return *reinterpret_cast<const Data *>(place); }
static Data & data(AggregateDataPtr __restrict place) { return *reinterpret_cast<Data *>(place); }
static const Data & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast<const Data *>(place); }
static constexpr size_t base_cond_column_idx = 2;
static constexpr size_t event_column_idx = 1;
@ -216,7 +216,7 @@ public:
a.value.push_back(v->clone(arena), arena);
}
void create(AggregateDataPtr place) const override /// NOLINT
void create(AggregateDataPtr __restrict place) const override /// NOLINT
{
new (place) Data;
}

View File

@ -110,7 +110,7 @@ public:
}
void add(
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
size_t row_num,
Arena *
@ -125,17 +125,17 @@ public:
this->data(place).add(x, y);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
@ -163,7 +163,7 @@ public:
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(
AggregateDataPtr place,
AggregateDataPtr __restrict place,
IColumn & to,
Arena *) const override
{

View File

@ -298,7 +298,7 @@ public:
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * /*arena*/) const override
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena * /*arena*/) const override
{
this->data(place).merge(this->data(rhs));
}

View File

@ -445,7 +445,7 @@ public:
void addBatchSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena *,
ssize_t if_argument_pos) const override
@ -465,7 +465,7 @@ public:
void addBatchSinglePlaceNotNull(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena *,

View File

@ -150,7 +150,7 @@ public:
/// Used for machine learning methods. Predict result from trained model.
/// Will insert result into `to` column for rows in range [offset, offset + limit).
virtual void predictValues(
ConstAggregateDataPtr /* place */,
ConstAggregateDataPtr __restrict /* place */,
IColumn & /*to*/,
const ColumnsWithTypeAndName & /*arguments*/,
size_t /*offset*/,
@ -209,7 +209,7 @@ public:
virtual void addBatchSinglePlace( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const = 0;
@ -218,7 +218,7 @@ public:
virtual void addBatchSparseSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena) const = 0;
@ -228,7 +228,7 @@ public:
virtual void addBatchSinglePlaceNotNull( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
@ -237,7 +237,7 @@ public:
virtual void addBatchSinglePlaceFromInterval( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1)
@ -370,7 +370,7 @@ template <typename Derived>
class IAggregateFunctionHelper : public IAggregateFunction
{
private:
static void addFree(const IAggregateFunction * that, AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena)
static void addFree(const IAggregateFunction * that, AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena)
{
static_cast<const Derived &>(*that).add(place, columns, row_num, arena);
}
@ -450,7 +450,7 @@ public:
void addBatchSinglePlace( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1) const override
@ -474,7 +474,7 @@ public:
void addBatchSparseSinglePlace(
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena) const override
{
@ -493,7 +493,7 @@ public:
void addBatchSinglePlaceNotNull( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
const UInt8 * null_map,
Arena * arena,
@ -517,7 +517,7 @@ public:
void addBatchSinglePlaceFromInterval( /// NOLINT
size_t row_begin,
size_t row_end,
AggregateDataPtr place,
AggregateDataPtr __restrict place,
const IColumn ** columns,
Arena * arena,
ssize_t if_argument_pos = -1)
@ -661,7 +661,7 @@ public:
IAggregateFunctionDataHelper(const DataTypes & argument_types_, const Array & parameters_)
: IAggregateFunctionHelper<Derived>(argument_types_, parameters_) {}
void create(AggregateDataPtr place) const override /// NOLINT
void create(AggregateDataPtr __restrict place) const override /// NOLINT
{
new (place) Data;
}

View File

@ -1275,7 +1275,7 @@ try
}
/// Check if server send Log packet
receiveLogs(parsed_query);
receiveLogsAndProfileEvents(parsed_query);
/// Check if server send Exception packet
auto packet_type = connection->checkPacket(0);
@ -1328,11 +1328,11 @@ void ClientBase::sendDataFromStdin(Block & sample, const ColumnsDescription & co
/// Process Log packets, used when inserting data by blocks
void ClientBase::receiveLogs(ASTPtr parsed_query)
void ClientBase::receiveLogsAndProfileEvents(ASTPtr parsed_query)
{
auto packet_type = connection->checkPacket(0);
while (packet_type && *packet_type == Protocol::Server::Log)
while (packet_type && (*packet_type == Protocol::Server::Log || *packet_type == Protocol::Server::ProfileEvents))
{
receiveAndProcessPacket(parsed_query, false);
packet_type = connection->checkPacket(0);

View File

@ -117,7 +117,7 @@ protected:
private:
void receiveResult(ASTPtr parsed_query);
bool receiveAndProcessPacket(ASTPtr parsed_query, bool cancelled_);
void receiveLogs(ASTPtr parsed_query);
void receiveLogsAndProfileEvents(ASTPtr parsed_query);
bool receiveSampleBlock(Block & out, ColumnsDescription & columns_description, ASTPtr parsed_query);
bool receiveEndOfQuery();
void cancelQuery();

View File

@ -20,6 +20,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ALL_CONNECTION_TRIES_FAILED;
}
@ -45,6 +46,9 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
const Settings * settings,
bool /*force_connected*/)
{
if (nested_pools.empty())
throw DB::Exception(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get connection from ConnectionPoolWithFailover because nested pools are empty");
TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message)
{
return tryGetEntry(pool, timeouts, fail_message, settings);
@ -167,6 +171,9 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
PoolMode pool_mode,
const TryGetEntryFunc & try_get_entry)
{
if (nested_pools.empty())
throw DB::Exception(DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get connection from ConnectionPoolWithFailover because nested pools are empty");
size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1;
size_t max_tries = (settings ?
size_t{settings->connections_with_failover_max_tries} :

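A minimal sketch of the fail-fast guard added in both places above, assuming a plain std::runtime_error in place of DB::Exception: throw a descriptive error before any selection logic runs on an empty pool list.

#include <memory>
#include <stdexcept>
#include <vector>

struct NestedPool {};

// Fail fast with a descriptive error instead of letting the failover
// selection logic operate on an empty pool list.
void checkPoolsNotEmpty(const std::vector<std::shared_ptr<NestedPool>> & nested_pools)
{
    if (nested_pools.empty())
        throw std::runtime_error(
            "Cannot get connection from ConnectionPoolWithFailover because nested pools are empty");
}

int main()
{
    std::vector<std::shared_ptr<NestedPool>> pools;
    try { checkPoolsNotEmpty(pools); } catch (const std::runtime_error &) { /* expected */ }
}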
View File

@ -18,6 +18,7 @@ namespace ErrorCodes
extern const int UNKNOWN_PACKET_FROM_SERVER;
extern const int UNKNOWN_EXCEPTION;
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
}
LocalConnection::LocalConnection(ContextPtr context_, bool send_progress_, bool send_profile_events_, const String & server_display_name_)
@ -62,9 +63,13 @@ void LocalConnection::updateProgress(const Progress & value)
state->progress.incrementPiecewiseAtomically(value);
}
void LocalConnection::getProfileEvents(Block & block)
void LocalConnection::sendProfileEvents()
{
ProfileEvents::getProfileEvents(server_display_name, state->profile_queue, block, last_sent_snapshots);
Block profile_block;
state->after_send_profile_events.restart();
next_packet_type = Protocol::Server::ProfileEvents;
ProfileEvents::getProfileEvents(server_display_name, state->profile_queue, profile_block, last_sent_snapshots);
state->block.emplace(std::move(profile_block));
}
void LocalConnection::sendQuery(
@ -192,13 +197,14 @@ void LocalConnection::sendData(const Block & block, const String &, bool)
return;
if (state->pushing_async_executor)
{
state->pushing_async_executor->push(block);
}
else if (state->pushing_executor)
{
state->pushing_executor->push(block);
}
else
throw Exception("Unknown executor", ErrorCodes::LOGICAL_ERROR);
if (send_profile_events)
sendProfileEvents();
}
void LocalConnection::sendCancel()
@ -264,11 +270,7 @@ bool LocalConnection::poll(size_t)
if (send_profile_events && (state->after_send_profile_events.elapsedMicroseconds() >= query_context->getSettingsRef().interactive_delay))
{
Block block;
state->after_send_profile_events.restart();
next_packet_type = Protocol::Server::ProfileEvents;
getProfileEvents(block);
state->block.emplace(std::move(block));
sendProfileEvents();
return true;
}
@ -349,11 +351,7 @@ bool LocalConnection::poll(size_t)
if (send_profile_events && state->executor)
{
Block block;
state->after_send_profile_events.restart();
next_packet_type = Protocol::Server::ProfileEvents;
getProfileEvents(block);
state->block.emplace(std::move(block));
sendProfileEvents();
return true;
}
}

View File

@ -142,7 +142,7 @@ private:
void updateProgress(const Progress & value);
void getProfileEvents(Block & block);
void sendProfileEvents();
bool pollImpl();

View File

@ -1,430 +0,0 @@
#pragma once
#include <atomic>
#include <chrono>
#include <list>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <boost/functional/hash.hpp>
#include <boost/noncopyable.hpp>
#include <map>
#include "FileCache_fwd.h"
#include <IO/ReadSettings.h>
#include <Common/logger_useful.h>
#include <Common/FileSegment.h>
#include <Core/Types.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class IFileCache;
using FileCachePtr = std::shared_ptr<IFileCache>;
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
*/
class IFileCache : private boost::noncopyable
{
friend class FileSegment;
friend struct FileSegmentsHolder;
friend class FileSegmentRangeWriter;
public:
using Key = UInt128;
using Downloader = std::unique_ptr<SeekableReadBuffer>;
IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
virtual ~IFileCache() = default;
/// Restore cache from local filesystem.
virtual void initialize() = 0;
virtual void remove(const Key & key) = 0;
virtual void remove() = 0;
static bool isReadOnly();
/// Cache capacity in bytes.
size_t capacity() const { return max_size; }
static Key hash(const String & path);
String getPathInLocalCache(const Key & key, size_t offset);
String getPathInLocalCache(const Key & key);
const String & getBasePath() const { return cache_base_path; }
virtual std::vector<String> tryGetCachePaths(const Key & key) = 0;
/**
* Given an `offset` and `size` representing [offset, offset + size) bytes interval,
* return list of cached non-overlapping non-empty
* file segments `[segment1, ..., segmentN]` which intersect with given interval.
*
* Segments in returned list are ordered in ascending order and represent a full contiguous
* interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY.
*
* As long as pointers to returned file segments are held,
* it is guaranteed that these file segments are not removed from the cache.
*/
virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) = 0;
/**
* Segments in returned list are ordered in ascending order and represent a full contiguous
* interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY.
*
* If a file segment has state EMPTY, then it is also marked as "detached", i.e. it is "detached"
* from the cache (not owned by the cache), and as a result will never change its state and will be destroyed
* with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change
* their state (and become DOWNLOADED).
*/
virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0;
virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) = 0;
virtual FileSegments getSnapshot() const = 0;
/// For debug.
virtual String dumpStructure(const Key & key) = 0;
virtual size_t getUsedCacheSize() const = 0;
virtual size_t getFileSegmentsNum() const = 0;
protected:
String cache_base_path;
size_t max_size;
size_t max_element_size;
size_t max_file_segment_size;
bool is_initialized = false;
mutable std::mutex mutex;
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalCacheSize(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment the size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
/// Used to track and control the cache access of each query.
/// Through it, the cache layer can apply different processing policies to different queries.
struct QueryContext
{
LRUQueue lru_queue;
AccessRecord records;
size_t cache_size = 0;
size_t max_cache_size;
bool skip_download_if_exceeds_query_cache;
QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_)
: max_cache_size(max_cache_size_)
, skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {}
void remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size < size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
{
lru_queue.remove(record->second, cache_lock);
records.erase({key, offset});
}
}
cache_size -= size;
}
void reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size + size > max_cache_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record == records.end())
{
auto queue_iter = lru_queue.add(key, offset, 0, cache_lock);
record = records.insert({{key, offset}, queue_iter}).first;
}
record->second->size += size;
}
cache_size += size;
}
void use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock)
{
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
lru_queue.moveToEnd(record->second, cache_lock);
}
}
size_t getMaxCacheSize() { return max_cache_size; }
size_t getCacheSize() { return cache_size; }
LRUQueue & queue() { return lru_queue; }
bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; }
};
using QueryContextPtr = std::shared_ptr<QueryContext>;
using QueryContextMap = std::unordered_map<String, QueryContextPtr>;
QueryContextMap query_map;
bool enable_filesystem_query_cache_limit;
QueryContextPtr getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock);
QueryContextPtr getQueryContext(const String & query_id, std::lock_guard<std::mutex> & cache_lock);
void removeQueryContext(const String & query_id);
QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> &);
virtual bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) = 0;
virtual void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
virtual bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
/// If a file segment was partially downloaded and then space reservation fails (because no
/// space is left), update the corresponding cache cell metadata (file segment size).
virtual void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
void assertInitialized() const;
public:
/// Saves query context information, and adopts different cache policies
/// for different queries through the context cache layer.
struct QueryContextHolder : private boost::noncopyable
{
explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_);
QueryContextHolder() = default;
~QueryContextHolder();
String query_id {};
IFileCache * cache = nullptr;
QueryContextPtr context = nullptr;
};
QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings);
};
class LRUFileCache final : public IFileCache
{
public:
LRUFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override;
FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override;
FileSegments getSnapshot() const override;
void initialize() override;
void remove(const Key & key) override;
void remove() override;
std::vector<String> tryGetCachePaths(const Key & key) override;
size_t getUsedCacheSize() const override;
size_t getFileSegmentsNum() const override;
private:
struct FileSegmentCell : private boost::noncopyable
{
FileSegmentPtr file_segment;
/// Iterator is put here on first reservation attempt, if successful.
std::optional<LRUQueue::Iterator> queue_iterator;
/// The pointer to the file segment is always held by the cache itself.
/// Apart from the pointer in the cache, it can be held by cache users when they call
/// getOrSet(), but cache users always hold it via FileSegmentsHolder.
bool releasable() const { return file_segment.unique(); }
size_t size() const { return file_segment->reserved_size; }
FileSegmentCell(FileSegmentPtr file_segment_, LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell(FileSegmentCell && other) noexcept
: file_segment(std::move(other.file_segment))
, queue_iterator(other.queue_iterator) {}
};
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
Poco::Logger * log;
FileSegments getImpl(
const Key & key, const FileSegment::Range & range,
std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell * getCell(
const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell * addCell(
const Key & key, size_t offset, size_t size,
FileSegment::State state, std::lock_guard<std::mutex> & cache_lock);
void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard<std::mutex> & cache_lock);
bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) override;
bool tryReserveForMainList(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) override;
bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) override;
void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) override;
size_t getAvailableCacheSize() const;
void loadCacheInfoIntoMemory(std::lock_guard<std::mutex> & cache_lock);
FileSegments splitRangeIntoCells(
const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard<std::mutex> & cache_lock);
String dumpStructureUnlocked(const Key & key_, std::lock_guard<std::mutex> & cache_lock);
void fillHolesWithEmptyFileSegments(
FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, std::lock_guard<std::mutex> & cache_lock);
FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size) override;
size_t getUsedCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
size_t getAvailableCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
size_t getFileSegmentsNumUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard<std::mutex> & cache_lock);
public:
String dumpStructure(const Key & key_) override;
void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
void assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock);
void assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock);
};
}

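A self-contained model of the QueryContext byte accounting seen above: reserve() must not push usage past the per-query limit, and remove() must not underflow it. Types and names below are simplified stand-ins for the real ones.

#include <cstddef>
#include <stdexcept>

// Minimal model of QueryContext's byte accounting: reserve() may not push
// usage past the per-query limit, remove() may not underflow it.
struct QueryCacheAccounting
{
    size_t cache_size = 0;
    size_t max_cache_size;

    explicit QueryCacheAccounting(size_t max) : max_cache_size(max) {}

    void reserve(size_t size)
    {
        if (cache_size + size > max_cache_size)
            throw std::logic_error("Reserved cache size exceeds the remaining cache size");
        cache_size += size;
    }

    void remove(size_t size)
    {
        if (cache_size < size)
            throw std::logic_error("Deleted cache size exceeds existing cache size");
        cache_size -= size;
    }
};

int main()
{
    QueryCacheAccounting acc(1024);
    acc.reserve(512);
    acc.remove(256); // cache_size is now 256
}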
View File

@ -1,5 +1,6 @@
#include "FileCacheFactory.h"
#include "FileCache.h"
#include "IFileCache.h"
#include "LRUFileCache.h"
namespace DB
{

View File

@ -13,6 +13,8 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true);
allow_to_remove_persistent_segments_from_cache_by_default = config.getUInt64(config_prefix + ".allow_to_remove_persistent_segments_from_cache_by_default", true);
}
}

View File

@ -12,10 +12,14 @@ struct FileCacheSettings
size_t max_size = 0;
size_t max_elements = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS;
size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
bool cache_on_write_operations = false;
bool enable_filesystem_query_cache_limit = false;
size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;
bool enable_filesystem_query_cache_limit = false;
bool do_not_evict_index_and_mark_files = true;
bool allow_to_remove_persistent_segments_from_cache_by_default = true;
void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
};

View File

@ -1,11 +1,12 @@
#include "FileSegment.h"
#include <base/getThreadId.h>
#include <Common/FileCache.h>
#include <Common/hex.h>
#include <Common/logger_useful.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <filesystem>
namespace CurrentMetrics
{
extern const Metric CacheDetachedFileSegments;
@ -25,7 +26,8 @@ FileSegment::FileSegment(
size_t size_,
const Key & key_,
IFileCache * cache_,
State download_state_)
State download_state_,
bool is_persistent_)
: segment_range(offset_, offset_ + size_ - 1)
, download_state(download_state_)
, file_key(key_)
@ -35,6 +37,7 @@ FileSegment::FileSegment(
#else
, log(&Poco::Logger::get("FileSegment"))
#endif
, is_persistent(is_persistent_) /// Not really used for now, see PR 36171
{
/// On creation, file segment state can be EMPTY, DOWNLOADED, DOWNLOADING.
switch (download_state)
@ -241,7 +244,7 @@ void FileSegment::write(const char * from, size_t size, size_t offset_)
"Cache writer was finalized (downloaded size: {}, state: {})",
downloaded_size, stateToString(download_state));
auto download_path = cache->getPathInLocalCache(key(), offset());
auto download_path = getPathInLocalCache();
cache_writer = std::make_unique<WriteBufferFromFile>(download_path);
}
@ -271,6 +274,11 @@ void FileSegment::write(const char * from, size_t size, size_t offset_)
assert(getDownloadOffset() == offset_ + size);
}
String FileSegment::getPathInLocalCache() const
{
return cache->getPathInLocalCache(key(), offset(), isPersistent());
}
void FileSegment::writeInMemory(const char * from, size_t size)
{
if (!size)
@ -287,7 +295,7 @@ void FileSegment::writeInMemory(const char * from, size_t size)
if (cache_writer)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache writer already initialized");
auto download_path = cache->getPathInLocalCache(key(), offset());
auto download_path = getPathInLocalCache();
cache_writer = std::make_unique<WriteBufferFromFile>(download_path, size + 1);
try
@ -677,7 +685,7 @@ void FileSegment::assertCorrectnessImpl(std::lock_guard<std::mutex> & /* segment
{
assert(downloader_id.empty() == (download_state != FileSegment::State::DOWNLOADING));
assert(!downloader_id.empty() == (download_state == FileSegment::State::DOWNLOADING));
assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(cache->getPathInLocalCache(key(), offset())) > 0);
assert(download_state != FileSegment::State::DOWNLOADED || std::filesystem::file_size(getPathInLocalCache()) > 0);
}
void FileSegment::throwIfDetached() const
@ -729,6 +737,7 @@ FileSegmentPtr FileSegment::getSnapshot(const FileSegmentPtr & file_segment, std
snapshot->ref_count = file_segment.use_count();
snapshot->downloaded_size = file_segment->getDownloadedSize();
snapshot->download_state = file_segment->state();
snapshot->is_persistent = file_segment->isPersistent();
return snapshot;
}

View File

@ -1,8 +1,9 @@
#pragma once
#include <boost/noncopyable.hpp>
#include <IO/WriteBufferFromFile.h>
#include <Common/IFileCache.h>
#include <Core/Types.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/SeekableReadBuffer.h>
#include <list>
@ -31,7 +32,7 @@ friend struct FileSegmentsHolder;
friend class FileSegmentRangeWriter;
public:
using Key = UInt128;
using Key = IFileCache::Key;
using RemoteFileReaderPtr = std::shared_ptr<SeekableReadBuffer>;
using LocalCacheWriterPtr = std::unique_ptr<WriteBufferFromFile>;
@ -70,8 +71,12 @@ public:
};
FileSegment(
size_t offset_, size_t size_, const Key & key_,
IFileCache * cache_, State download_state_);
size_t offset_,
size_t size_,
const Key & key_,
IFileCache * cache_,
State download_state_,
bool is_persistent_ = false);
~FileSegment();
@ -100,6 +105,8 @@ public:
size_t offset() const { return range().left; }
bool isPersistent() const { return is_persistent; }
State wait();
bool reserve(size_t size);
@ -161,6 +168,8 @@ public:
[[noreturn]] void throwIfDetached() const;
String getPathInLocalCache() const;
private:
size_t availableSize() const { return reserved_size - downloaded_size; }
@ -237,6 +246,9 @@ private:
std::atomic<size_t> hits_count = 0; /// cache hits.
std::atomic<size_t> ref_count = 0; /// Used for getting snapshot state
/// Currently no-op. (will be added in PR 36171)
/// Defines whether a file complies with the eviction policy.
bool is_persistent;
CurrentMetrics::Increment metric_increment{CurrentMetrics::CacheFileSegments};
};

201
src/Common/IFileCache.cpp Normal file
View File

@ -0,0 +1,201 @@
#include "IFileCache.h"
#include <Common/hex.h>
#include <Common/CurrentThread.h>
#include <Common/SipHash.h>
#include <Common/FileCacheSettings.h>
#include <IO/ReadSettings.h>
#include <filesystem>
namespace fs = std::filesystem;
namespace DB
{
namespace ErrorCodes
{
extern const int REMOTE_FS_OBJECT_CACHE_ERROR;
extern const int LOGICAL_ERROR;
}
IFileCache::IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_)
: cache_base_path(cache_base_path_)
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
{
}
String IFileCache::Key::toString() const
{
return getHexUIntLowercase(key);
}
IFileCache::Key IFileCache::hash(const String & path)
{
return Key(sipHash128(path.data(), path.size()));
}
String IFileCache::getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const
{
auto key_str = key.toString();
return fs::path(cache_base_path)
/ key_str.substr(0, 3)
/ key_str
/ (std::to_string(offset) + (is_persistent ? "_persistent" : ""));
}
String IFileCache::getPathInLocalCache(const Key & key) const
{
auto key_str = key.toString();
return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str;
}
static bool isQueryInitialized()
{
return CurrentThread::isInitialized()
&& CurrentThread::get().getQueryContext()
&& CurrentThread::getQueryId().size != 0;
}
bool IFileCache::isReadOnly()
{
return !isQueryInitialized();
}
void IFileCache::assertInitialized() const
{
if (!is_initialized)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized");
}
IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock)
{
if (!isQueryInitialized())
return nullptr;
return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock);
}
IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard<std::mutex> & /* cache_lock */)
{
auto query_iter = query_map.find(query_id);
return (query_iter == query_map.end()) ? nullptr : query_iter->second;
}
void IFileCache::removeQueryContext(const String & query_id)
{
std::lock_guard cache_lock(mutex);
auto query_iter = query_map.find(query_id);
if (query_iter == query_map.end())
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Attempt to release query context that does not exist (query_id: {})",
query_id);
}
query_map.erase(query_iter);
}
IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(
const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> & cache_lock)
{
if (query_id.empty())
return nullptr;
auto context = getQueryContext(query_id, cache_lock);
if (context)
return context;
auto query_context = std::make_shared<QueryContext>(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache);
auto query_iter = query_map.emplace(query_id, query_context).first;
return query_iter->second;
}
IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings)
{
std::lock_guard cache_lock(mutex);
if (!enable_filesystem_query_cache_limit || settings.max_query_cache_size == 0)
return {};
/// If enable_filesystem_query_cache_limit is true and max_query_cache_size is larger than zero,
/// we create a query context for the current query.
auto context = getOrSetQueryContext(query_id, settings, cache_lock);
return QueryContextHolder(query_id, this, context);
}
void IFileCache::QueryContext::remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size < size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
{
lru_queue.remove(record->second, cache_lock);
records.erase({key, offset});
}
}
cache_size -= size;
}
void IFileCache::QueryContext::reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size + size > max_cache_size)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Reserved cache size exceeds the remaining cache size (key: {}, offset: {})",
key.toString(), offset);
}
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record == records.end())
{
auto queue_iter = lru_queue.add(key, offset, 0, cache_lock);
record = records.insert({{key, offset}, queue_iter}).first;
}
record->second->size += size;
}
cache_size += size;
}
void IFileCache::QueryContext::use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock)
{
if (skip_download_if_exceeds_query_cache)
return;
auto record = records.find({key, offset});
if (record != records.end())
lru_queue.moveToEnd(record->second, cache_lock);
}
IFileCache::QueryContextHolder::QueryContextHolder(
const String & query_id_,
IFileCache * cache_,
IFileCache::QueryContextPtr context_)
: query_id(query_id_)
, cache(cache_)
, context(context_)
{
}
IFileCache::QueryContextHolder::~QueryContextHolder()
{
/// If only the query_map and the current holder hold the query context,
/// the query has completed and the query context can be released.
if (context && context.use_count() == 2)
cache->removeQueryContext(query_id);
}
}

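A runnable sketch of the on-disk layout produced by getPathInLocalCache() above: base / first three hex characters of the key / full key / offset, with a "_persistent" suffix for persistent segments. A plain hex string stands in for the real UInt128 key.

#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

// Mirrors getPathInLocalCache(): the first three hex characters of the key
// act as a fan-out directory, avoiding one huge flat directory of keys.
fs::path pathInLocalCache(const fs::path & base, const std::string & key_hex,
                          size_t offset, bool is_persistent)
{
    return base / key_hex.substr(0, 3) / key_hex
                / (std::to_string(offset) + (is_persistent ? "_persistent" : ""));
}

int main()
{
    // Prints "/cache/a3f/a3f9c2d4e5b6a7980112233445566778/131072_persistent"
    std::cout << pathInLocalCache("/cache", "a3f9c2d4e5b6a7980112233445566778", 131072, true) << '\n';
}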
267
src/Common/IFileCache.h Normal file
View File

@ -0,0 +1,267 @@
#pragma once
#include <Core/Types.h>
#include <Common/FileCache_fwd.h>
#include <boost/noncopyable.hpp>
#include <list>
#include <unordered_map>
#include <functional>
namespace DB
{
class FileSegment;
using FileSegmentPtr = std::shared_ptr<FileSegment>;
using FileSegments = std::list<FileSegmentPtr>;
struct FileSegmentsHolder;
struct ReadSettings;
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
*/
class IFileCache : private boost::noncopyable
{
friend class FileSegment;
friend struct FileSegmentsHolder;
friend class FileSegmentRangeWriter;
public:
struct Key
{
UInt128 key;
String toString() const;
Key() = default;
explicit Key(const UInt128 & key_) : key(key_) {}
bool operator==(const Key & other) const { return key == other.key; }
};
IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
virtual ~IFileCache() = default;
/// Restore cache from local filesystem.
virtual void initialize() = 0;
virtual void removeIfExists(const Key & key) = 0;
virtual void removeIfReleasable(bool remove_persistent_files) = 0;
static bool isReadOnly();
/// Cache capacity in bytes.
size_t capacity() const { return max_size; }
static Key hash(const String & path);
String getPathInLocalCache(const Key & key, size_t offset, bool is_persistent) const;
String getPathInLocalCache(const Key & key) const;
const String & getBasePath() const { return cache_base_path; }
virtual std::vector<String> tryGetCachePaths(const Key & key) = 0;
/**
* Given an `offset` and `size` representing [offset, offset + size) bytes interval,
* return list of cached non-overlapping non-empty
* file segments `[segment1, ..., segmentN]` which intersect with given interval.
*
* Segments in returned list are ordered in ascending order and represent a full contiguous
* interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY.
*
* As long as pointers to returned file segments are held,
* it is guaranteed that these file segments are not removed from the cache.
*/
virtual FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) = 0;
/**
* Segments in returned list are ordered in ascending order and represent a full contiguous
* interval (no holes). Each segment in returned list has state: DOWNLOADED, DOWNLOADING or EMPTY.
*
* If a file segment has state EMPTY, then it is also marked as "detached", i.e. it is "detached"
* from the cache (not owned by the cache), and as a result will never change its state and will be destroyed
* with the destruction of the holder, while in getOrSet() EMPTY file segments can eventually change
* their state (and become DOWNLOADED).
*/
virtual FileSegmentsHolder get(const Key & key, size_t offset, size_t size) = 0;
virtual FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) = 0;
virtual FileSegments getSnapshot() const = 0;
/// For debug.
virtual String dumpStructure(const Key & key) = 0;
virtual size_t getUsedCacheSize() const = 0;
virtual size_t getFileSegmentsNum() const = 0;
protected:
String cache_base_path;
size_t max_size;
size_t max_element_size;
size_t max_file_segment_size;
bool is_initialized = false;
mutable std::mutex mutex;
virtual bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) = 0;
virtual void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
virtual bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
virtual void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & /* segment_lock */) = 0;
void assertInitialized() const;
class LRUQueue
{
public:
struct FileKeyAndOffset
{
Key key;
size_t offset;
size_t size;
size_t hits = 0;
FileKeyAndOffset(const Key & key_, size_t offset_, size_t size_) : key(key_), offset(offset_), size(size_) {}
};
using Iterator = typename std::list<FileKeyAndOffset>::iterator;
size_t getTotalCacheSize(std::lock_guard<std::mutex> & /* cache_lock */) const { return cache_size; }
size_t getElementsNum(std::lock_guard<std::mutex> & /* cache_lock */) const { return queue.size(); }
Iterator add(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void remove(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
void moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & cache_lock);
/// Space reservation for a file segment is incremental, so we need to be able to increment the size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
Iterator begin() { return queue.begin(); }
Iterator end() { return queue.end(); }
void removeAll(std::lock_guard<std::mutex> & cache_lock);
private:
std::list<FileKeyAndOffset> queue;
size_t cache_size = 0;
};
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first.key) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
/// Used to track and control the cache access of each query.
/// Through it, the cache layer can apply different processing policies to different queries.
struct QueryContext
{
LRUQueue lru_queue;
AccessRecord records;
size_t cache_size = 0;
size_t max_cache_size;
bool skip_download_if_exceeds_query_cache;
QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_)
: max_cache_size(max_cache_size_)
, skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {}
void remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock);
void use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock);
size_t getMaxCacheSize() const { return max_cache_size; }
size_t getCacheSize() const { return cache_size; }
LRUQueue & queue() { return lru_queue; }
bool isSkipDownloadIfExceed() const { return skip_download_if_exceeds_query_cache; }
};
using QueryContextPtr = std::shared_ptr<QueryContext>;
using QueryContextMap = std::unordered_map<String, QueryContextPtr>;
QueryContextMap query_map;
bool enable_filesystem_query_cache_limit;
QueryContextPtr getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock);
QueryContextPtr getQueryContext(const String & query_id, std::lock_guard<std::mutex> & cache_lock);
void removeQueryContext(const String & query_id);
QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> &);
public:
/// Saves query context information, and adopts different cache policies
/// for different queries through the context cache layer.
struct QueryContextHolder : private boost::noncopyable
{
QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_);
QueryContextHolder() = default;
~QueryContextHolder();
String query_id;
IFileCache * cache = nullptr;
QueryContextPtr context;
};
QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings);
};
using FileCachePtr = std::shared_ptr<IFileCache>;
}
namespace std
{
template <> struct hash<DB::IFileCache::Key>
{
std::size_t operator()(const DB::IFileCache::Key & k) const { return hash<UInt128>()(k.key); }
};
}

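A minimal illustration of why the std::hash specialization above matters: it is what allows Key to serve directly as the key type of unordered containers such as the cache's files map. The 64-bit key below is a simplified stand-in for the real UInt128.

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>

// Simplified stand-in for IFileCache::Key (the real key is a UInt128).
struct Key
{
    uint64_t key;
    bool operator==(const Key & other) const { return key == other.key; }
};

namespace std
{
    // Specializing std::hash lets Key be used directly in unordered containers.
    template <> struct hash<Key>
    {
        size_t operator()(const Key & k) const { return hash<uint64_t>()(k.key); }
    };
}

int main()
{
    std::unordered_map<Key, std::string> files;
    files[Key{42}] = "segment";
}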
View File

@ -1,4 +1,4 @@
#include "FileCache.h"
#include "LRUFileCache.h"
#include <Common/randomSeed.h>
#include <Common/SipHash.h>
@ -22,130 +22,12 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
namespace
{
String keyToStr(const IFileCache::Key & key)
{
return getHexUIntLowercase(key);
}
}
static bool isQueryInitialized()
{
return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && CurrentThread::getQueryId().size != 0;
}
IFileCache::IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_)
: cache_base_path(cache_base_path_)
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
{
}
IFileCache::Key IFileCache::hash(const String & path)
{
return sipHash128(path.data(), path.size());
}
String IFileCache::getPathInLocalCache(const Key & key, size_t offset)
{
auto key_str = keyToStr(key);
return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str / std::to_string(offset);
}
String IFileCache::getPathInLocalCache(const Key & key)
{
auto key_str = keyToStr(key);
return fs::path(cache_base_path) / key_str.substr(0, 3) / key_str;
}
bool IFileCache::isReadOnly()
{
return (!isQueryInitialized());
}
void IFileCache::assertInitialized() const
{
if (!is_initialized)
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized");
}
IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock)
{
if (!isQueryInitialized())
return nullptr;
return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock);
}
IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard<std::mutex> &)
{
auto query_iter = query_map.find(query_id);
return (query_iter == query_map.end()) ? nullptr : query_iter->second;
}
void IFileCache::removeQueryContext(const String & query_id)
{
std::lock_guard cache_lock(mutex);
auto query_iter = query_map.find(query_id);
if (query_iter == query_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to release query context that does not exist");
query_map.erase(query_iter);
}
IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> & cache_lock)
{
if (query_id.empty())
return nullptr;
auto context = getQueryContext(query_id, cache_lock);
if (!context)
{
auto query_iter = query_map.insert({query_id, std::make_shared<QueryContext>(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache)}).first;
context = query_iter->second;
}
return context;
}
IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings)
{
std::lock_guard cache_lock(mutex);
/// If enable_filesystem_query_cache_limit is true and max_query_cache_size is larger than zero,
/// we create a query context for the current query.
if (enable_filesystem_query_cache_limit && settings.max_query_cache_size)
{
auto context = getOrSetQueryContext(query_id, settings, cache_lock);
return QueryContextHolder(query_id, this, context);
}
else
return QueryContextHolder();
}
IFileCache::QueryContextHolder::QueryContextHolder(const String & query_id_, IFileCache * cache_, IFileCache::QueryContextPtr context_)
: query_id(query_id_), cache(cache_), context(context_)
{
}
IFileCache::QueryContextHolder::~QueryContextHolder()
{
/// If only the query_map and the current holder hold the query context,
/// the query has completed and the query context can be released.
if (context && context.use_count() == 2)
cache->removeQueryContext(query_id);
}
LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_)
: IFileCache(cache_base_path_, cache_settings_)
, max_stash_element_size(cache_settings_.max_elements)
, enable_cache_hits_threshold(cache_settings_.enable_cache_hits_threshold)
, log(&Poco::Logger::get("LRUFileCache"))
, allow_to_remove_persistent_segments_from_cache_by_default(cache_settings_.allow_to_remove_persistent_segments_from_cache_by_default)
{
}
@ -155,9 +37,20 @@ void LRUFileCache::initialize()
if (!is_initialized)
{
if (fs::exists(cache_base_path))
loadCacheInfoIntoMemory(cache_lock);
{
try
{
loadCacheInfoIntoMemory(cache_lock);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
return;
}
}
else
fs::create_directories(cache_base_path);
is_initialized = true;
}
}
@ -168,7 +61,7 @@ void LRUFileCache::useCell(
auto file_segment = cell.file_segment;
if (file_segment->isDownloaded()
&& fs::file_size(getPathInLocalCache(file_segment->key(), file_segment->offset())) == 0)
&& fs::file_size(getPathInLocalCache(file_segment->key(), file_segment->offset(), file_segment->isPersistent())) == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Cannot have zero size downloaded file segments. Current file segment: {}",
file_segment->range().toString());
@ -218,8 +111,10 @@ FileSegments LRUFileCache::getImpl(
files.erase(key);
/// Note: it is guaranteed that there is no concurrency with files deletion,
/// because cache files are deleted only inside IFileCache and under cache lock.
if (fs::exists(key_path))
fs::remove(key_path);
fs::remove_all(key_path);
return {};
}
@ -281,7 +176,7 @@ FileSegments LRUFileCache::getImpl(
}
FileSegments LRUFileCache::splitRangeIntoCells(
const Key & key, size_t offset, size_t size, FileSegment::State state, std::lock_guard<std::mutex> & cache_lock)
const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard<std::mutex> & cache_lock)
{
assert(size > 0);
@ -297,7 +192,7 @@ FileSegments LRUFileCache::splitRangeIntoCells(
current_cell_size = std::min(remaining_size, max_file_segment_size);
remaining_size -= current_cell_size;
auto * cell = addCell(key, current_pos, current_cell_size, state, cache_lock);
auto * cell = addCell(key, current_pos, current_cell_size, state, is_persistent, cache_lock);
if (cell)
file_segments.push_back(cell->file_segment);
assert(cell);
@ -314,6 +209,7 @@ void LRUFileCache::fillHolesWithEmptyFileSegments(
const Key & key,
const FileSegment::Range & range,
bool fill_with_detached_file_segments,
bool is_persistent,
std::lock_guard<std::mutex> & cache_lock)
{
/// There are segments [segment1, ..., segmentN]
@ -369,7 +265,7 @@ void LRUFileCache::fillHolesWithEmptyFileSegments(
}
else
{
file_segments.splice(it, splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, cache_lock));
file_segments.splice(it, splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, is_persistent, cache_lock));
}
current_pos = segment_range.right + 1;
@ -397,12 +293,12 @@ void LRUFileCache::fillHolesWithEmptyFileSegments(
else
{
file_segments.splice(
file_segments.end(), splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, cache_lock));
file_segments.end(), splitRangeIntoCells(key, current_pos, hole_size, FileSegment::State::EMPTY, is_persistent, cache_lock));
}
}
}
FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t size)
FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent)
{
assertInitialized();
@ -419,11 +315,11 @@ FileSegmentsHolder LRUFileCache::getOrSet(const Key & key, size_t offset, size_t
if (file_segments.empty())
{
file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::EMPTY, cache_lock);
file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::EMPTY, is_persistent, cache_lock);
}
else
{
fillHolesWithEmptyFileSegments(file_segments, key, range, false, cache_lock);
fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */false, is_persistent, cache_lock);
}
assert(!file_segments.empty());
@ -456,14 +352,15 @@ FileSegmentsHolder LRUFileCache::get(const Key & key, size_t offset, size_t size
}
else
{
fillHolesWithEmptyFileSegments(file_segments, key, range, true, cache_lock);
fillHolesWithEmptyFileSegments(file_segments, key, range, /* fill_with_detached */true, /* is_persistent */false, cache_lock);
}
return FileSegmentsHolder(std::move(file_segments));
}
LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
const Key & key, size_t offset, size_t size, FileSegment::State state,
const Key & key, size_t offset, size_t size,
FileSegment::State state, bool is_persistent,
std::lock_guard<std::mutex> & cache_lock)
{
/// Create a file segment cell and put it in `files` map by [key][offset].
@ -475,10 +372,11 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache already exists for key: `{}`, offset: {}, size: {}.\nCurrent cache structure: {}",
keyToStr(key), offset, size, dumpStructureUnlocked(key, cache_lock));
key.toString(), offset, size, dumpStructureUnlocked(key, cache_lock));
auto skip_or_download = [&]() -> FileSegmentPtr
{
FileSegment::State result_state = state;
if (state == FileSegment::State::EMPTY && enable_cache_hits_threshold)
{
auto record = records.find({key, offset});
@ -496,7 +394,7 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
}
/// For segments that do not reach the download threshold, we do not download them, but directly read them
return std::make_shared<FileSegment>(offset, size, key, this, FileSegment::State::SKIP_CACHE);
result_state = FileSegment::State::SKIP_CACHE;
}
else
{
@ -504,12 +402,11 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
queue_iter->hits++;
stash_queue.moveToEnd(queue_iter, cache_lock);
state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE;
return std::make_shared<FileSegment>(offset, size, key, this, state);
result_state = queue_iter->hits >= enable_cache_hits_threshold ? FileSegment::State::EMPTY : FileSegment::State::SKIP_CACHE;
}
}
else
return std::make_shared<FileSegment>(offset, size, key, this, state);
return std::make_shared<FileSegment>(offset, size, key, this, result_state, is_persistent);
};
FileSegmentCell cell(skip_or_download(), this, cache_lock);
@ -527,12 +424,16 @@ LRUFileCache::FileSegmentCell * LRUFileCache::addCell(
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Failed to insert into cache key: `{}`, offset: {}, size: {}",
keyToStr(key), offset, size);
key.toString(), offset, size);
return &(it->second);
}
FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset, size_t size)
FileSegmentsHolder LRUFileCache::setDownloading(
const Key & key,
size_t offset,
size_t size,
bool is_persistent)
{
std::lock_guard cache_lock(mutex);
@ -545,9 +446,9 @@ FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset,
throw Exception(
ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Cache cell already exists for key `{}` and offset {}",
keyToStr(key), offset);
key.toString(), offset);
auto file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::DOWNLOADING, cache_lock);
auto file_segments = splitRangeIntoCells(key, offset, size, FileSegment::State::DOWNLOADING, is_persistent, cache_lock);
return FileSegmentsHolder(std::move(file_segments));
}
@ -708,7 +609,7 @@ bool LRUFileCache::tryReserveForMainList(
throw Exception(
ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Cache became inconsistent. Key: {}, offset: {}",
keyToStr(key), offset);
key.toString(), offset);
size_t cell_size = cell->size();
assert(entry_size == cell_size);
@ -790,7 +691,7 @@ bool LRUFileCache::tryReserveForMainList(
return true;
}
void LRUFileCache::remove(const Key & key)
void LRUFileCache::removeIfExists(const Key & key)
{
assertInitialized();
@ -825,6 +726,7 @@ void LRUFileCache::remove(const Key & key)
if (file_segment)
{
std::lock_guard<std::mutex> segment_lock(file_segment->mutex);
file_segment->detach(cache_lock, segment_lock);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
}
@ -836,14 +738,16 @@ void LRUFileCache::remove(const Key & key)
files.erase(key);
if (fs::exists(key_path))
fs::remove(key_path);
fs::remove_all(key_path);
}
}
void LRUFileCache::remove()
void LRUFileCache::removeIfReleasable(bool remove_persistent_files)
{
/// Try to remove all cached files under cache_base_path.
/// Only releasable file segments are evicted.
/// `remove_persistent_files` defines whether files that are otherwise non-evictable
/// (because they do not comply with the cache eviction policy) should also be removed.
std::lock_guard cache_lock(mutex);
@ -860,7 +764,10 @@ void LRUFileCache::remove()
if (cell->releasable())
{
auto file_segment = cell->file_segment;
if (file_segment)
if (file_segment
&& (!file_segment->isPersistent()
|| remove_persistent_files
|| allow_to_remove_persistent_segments_from_cache_by_default))
{
std::lock_guard segment_lock(file_segment->mutex);
file_segment->detach(cache_lock, segment_lock);
@ -872,17 +779,23 @@ void LRUFileCache::remove()
/// Remove all access information.
records.clear();
stash_queue.removeAll(cache_lock);
#ifndef NDEBUG
assertCacheCorrectness(cache_lock);
#endif
}
void LRUFileCache::remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & /* segment_lock */)
{
LOG_TEST(log, "Remove. Key: {}, offset: {}", keyToStr(key), offset);
LOG_TEST(log, "Remove. Key: {}, offset: {}", key.toString(), offset);
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
throw Exception(ErrorCodes::LOGICAL_ERROR, "No cache cell for key: {}, offset: {}", keyToStr(key), offset);
throw Exception(ErrorCodes::LOGICAL_ERROR, "No cache cell for key: {}, offset: {}", key.toString(), offset);
bool is_persistent_file_segment = cell->file_segment->isPersistent();
if (cell->queue_iterator)
{
@ -892,7 +805,7 @@ void LRUFileCache::remove(
auto & offsets = files[key];
offsets.erase(offset);
auto cache_file_path = getPathInLocalCache(key, offset);
auto cache_file_path = getPathInLocalCache(key, offset, is_persistent_file_segment);
if (fs::exists(cache_file_path))
{
try
@ -906,14 +819,14 @@ void LRUFileCache::remove(
files.erase(key);
if (fs::exists(key_path))
fs::remove(key_path);
fs::remove_all(key_path);
}
}
catch (...)
{
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR,
"Removal of cached file failed. Key: {}, offset: {}, path: {}, error: {}",
keyToStr(key), offset, cache_file_path, getCurrentExceptionMessage(false));
key.toString(), offset, cache_file_path, getCurrentExceptionMessage(false));
}
}
}
@ -927,18 +840,33 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard<std::mutex> & cache_l
/// cache_base_path / key_prefix / key / offset
if (!files.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache already initialized");
fs::directory_iterator key_prefix_it{cache_base_path};
for (; key_prefix_it != fs::directory_iterator(); ++key_prefix_it)
{
fs::directory_iterator key_it{key_prefix_it->path()};
for (; key_it != fs::directory_iterator(); ++key_it)
{
key = unhexUInt<UInt128>(key_it->path().filename().string().data());
key = Key(unhexUInt<UInt128>(key_it->path().filename().string().data()));
fs::directory_iterator offset_it{key_it->path()};
for (; offset_it != fs::directory_iterator(); ++offset_it)
{
bool parsed = tryParse<UInt64>(offset, offset_it->path().filename());
auto offset_with_suffix = offset_it->path().filename().string();
auto delim_pos = offset_with_suffix.find('_');
bool parsed;
bool is_persistent = false;
if (delim_pos == std::string::npos)
parsed = tryParse<UInt64>(offset, offset_with_suffix);
else
{
parsed = tryParse<UInt64>(offset, offset_with_suffix.substr(0, delim_pos));
is_persistent = offset_with_suffix.substr(delim_pos + 1) == "persistent";
}
if (!parsed)
{
LOG_WARNING(log, "Unexpected file: ", offset_it->path().string());
@ -954,7 +882,7 @@ void LRUFileCache::loadCacheInfoIntoMemory(std::lock_guard<std::mutex> & cache_l
if (tryReserve(key, offset, size, cache_lock))
{
auto * cell = addCell(key, offset, size, FileSegment::State::DOWNLOADED, cache_lock);
auto * cell = addCell(key, offset, size, FileSegment::State::DOWNLOADED, is_persistent, cache_lock);
if (cell)
queue_entries.emplace_back(*cell->queue_iterator, cell->file_segment);
}
@ -1003,7 +931,7 @@ void LRUFileCache::reduceSizeToDownloaded(
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"No cell found for key: {}, offset: {}",
keyToStr(key), offset);
key.toString(), offset);
}
const auto & file_segment = cell->file_segment;
@ -1014,7 +942,7 @@ void LRUFileCache::reduceSizeToDownloaded(
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Nothing to reduce, file segment fully downloaded, key: {}, offset: {}",
keyToStr(key), offset);
key.toString(), offset);
}
cell->file_segment = std::make_shared<FileSegment>(offset, downloaded_size, key, this, FileSegment::State::DOWNLOADED);
@ -1027,7 +955,7 @@ bool LRUFileCache::isLastFileSegmentHolder(
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
throw Exception(ErrorCodes::LOGICAL_ERROR, "No cell found for key: {}, offset: {}", keyToStr(key), offset);
throw Exception(ErrorCodes::LOGICAL_ERROR, "No cell found for key: {}, offset: {}", key.toString(), offset);
/// The caller of this method is last file segment holder if use count is 2 (the second pointer is cache itself)
return cell->file_segment.use_count() == 2;
@ -1058,7 +986,7 @@ std::vector<String> LRUFileCache::tryGetCachePaths(const Key & key)
for (const auto & [offset, cell] : cells_by_offset)
{
if (cell.file_segment->state() == FileSegment::State::DOWNLOADED)
cache_paths.push_back(getPathInLocalCache(key, offset));
cache_paths.push_back(getPathInLocalCache(key, offset, cell.file_segment->isPersistent()));
}
return cache_paths;
@ -1139,7 +1067,7 @@ IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add(
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Attempt to add duplicate queue entry to queue. (Key: {}, offset: {}, size: {})",
keyToStr(key), offset, size);
key.toString(), offset, size);
}
#endif
@ -1190,7 +1118,7 @@ String IFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_loc
{
if (!result.empty())
result += ", ";
result += fmt::format("{}: [{}, {}]", keyToStr(key), offset, offset + size - 1);
result += fmt::format("{}: [{}, {}]", key.toString(), offset, offset + size - 1);
}
return result;
}

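A self-contained sketch of the filename parsing performed in loadCacheInfoIntoMemory() above, which must accept both "131072" and "131072_persistent" and reject anything else; std::from_chars stands in for ClickHouse's tryParse.

#include <charconv>
#include <cstdint>
#include <optional>
#include <string>

struct ParsedName { uint64_t offset; bool is_persistent; };

// Parse "131072" or "131072_persistent", as the cache-loading code does
// when restoring segments from disk; returns nullopt for junk files.
std::optional<ParsedName> parseOffsetFileName(const std::string & name)
{
    auto delim_pos = name.find('_');
    std::string number = (delim_pos == std::string::npos) ? name : name.substr(0, delim_pos);

    uint64_t offset = 0;
    auto [ptr, ec] = std::from_chars(number.data(), number.data() + number.size(), offset);
    if (ec != std::errc() || ptr != number.data() + number.size())
        return std::nullopt;

    bool is_persistent = delim_pos != std::string::npos
        && name.substr(delim_pos + 1) == "persistent";
    return ParsedName{offset, is_persistent};
}

int main()
{
    auto a = parseOffsetFileName("131072_persistent"); // offset 131072, persistent
    auto b = parseOffsetFileName("junk");              // nullopt
    return (a && !b) ? 0 : 1;
}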
157
src/Common/LRUFileCache.h Normal file
View File

@ -0,0 +1,157 @@
#pragma once
#include <atomic>
#include <chrono>
#include <list>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <unordered_set>
#include <boost/functional/hash.hpp>
#include <boost/noncopyable.hpp>
#include <map>
#include <Common/logger_useful.h>
#include <Common/FileSegment.h>
#include <Common/IFileCache.h>
namespace DB
{
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
* Implements LRU eviction policy.
*/
class LRUFileCache final : public IFileCache
{
public:
LRUFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size, bool is_persistent) override;
FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override;
FileSegments getSnapshot() const override;
void initialize() override;
void removeIfExists(const Key & key) override;
void removeIfReleasable(bool remove_persistent_files) override;
std::vector<String> tryGetCachePaths(const Key & key) override;
size_t getUsedCacheSize() const override;
size_t getFileSegmentsNum() const override;
private:
struct FileSegmentCell : private boost::noncopyable
{
FileSegmentPtr file_segment;
/// Iterator is put here on first reservation attempt, if successful.
std::optional<LRUQueue::Iterator> queue_iterator;
/// The pointer to the file segment is always held by the cache itself.
/// Apart from the pointer in the cache, it can be held by cache users when they call
/// getOrSet(), but cache users always hold it via FileSegmentsHolder.
bool releasable() const { return file_segment.unique(); }
size_t size() const { return file_segment->reserved_size; }
FileSegmentCell(FileSegmentPtr file_segment_, LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell(FileSegmentCell && other) noexcept
: file_segment(std::move(other.file_segment))
, queue_iterator(other.queue_iterator) {}
};
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
Poco::Logger * log;
bool allow_to_remove_persistent_segments_from_cache_by_default;
FileSegments getImpl(
const Key & key, const FileSegment::Range & range,
std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell * getCell(
const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock);
FileSegmentCell * addCell(
const Key & key, size_t offset, size_t size,
FileSegment::State state, bool is_persistent,
std::lock_guard<std::mutex> & cache_lock);
void useCell(const FileSegmentCell & cell, FileSegments & result, std::lock_guard<std::mutex> & cache_lock);
bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) override;
bool tryReserveForMainList(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) override;
bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) override;
size_t getAvailableCacheSize() const;
void loadCacheInfoIntoMemory(std::lock_guard<std::mutex> & cache_lock);
FileSegments splitRangeIntoCells(
const Key & key, size_t offset, size_t size, FileSegment::State state, bool is_persistent, std::lock_guard<std::mutex> & cache_lock);
String dumpStructureUnlocked(const Key & key_, std::lock_guard<std::mutex> & cache_lock);
void fillHolesWithEmptyFileSegments(
FileSegments & file_segments, const Key & key, const FileSegment::Range & range, bool fill_with_detached_file_segments, bool is_persistent, std::lock_guard<std::mutex> & cache_lock);
FileSegmentsHolder setDownloading(const Key & key, size_t offset, size_t size, bool is_persistent) override;
size_t getUsedCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
size_t getAvailableCacheSizeUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
size_t getFileSegmentsNumUnlocked(std::lock_guard<std::mutex> & cache_lock) const;
void assertCacheCellsCorrectness(const FileSegmentsByOffset & cells_by_offset, std::lock_guard<std::mutex> & cache_lock);
void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock, std::lock_guard<std::mutex> & /* segment_lock */) override;
public:
String dumpStructure(const Key & key_) override;
void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
void assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock);
void assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock);
};
}
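
The header above describes the cache as an LRU eviction policy over file segments. As a rough, self-contained sketch of the underlying bookkeeping (all names invented; this is not the ClickHouse implementation), an LRU index is typically a recency-ordered list plus a map from key to list iterator, so both "touch" and "evict oldest" are O(1):

#include <cstddef>
#include <list>
#include <unordered_map>

// Standalone LRU bookkeeping sketch: the queue is ordered by recency (front = least
// recently used) and the map gives O(1) access to each element's queue position.
class LruIndex
{
    std::list<int> queue;
    std::unordered_map<int, std::list<int>::iterator> positions;

public:
    void touch(int key)
    {
        if (auto it = positions.find(key); it != positions.end())
            queue.splice(queue.end(), queue, it->second);  // move to the most-recent end
        else
            positions[key] = queue.insert(queue.end(), key);
    }

    bool evictOldest(int & evicted)
    {
        if (queue.empty())
            return false;
        evicted = queue.front();
        positions.erase(evicted);
        queue.pop_front();
        return true;
    }
};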

View File

@ -53,8 +53,11 @@ void ProgressIndication::resetProgress()
show_progress_bar = false;
written_progress_chars = 0;
write_progress_on_update = false;
host_cpu_usage.clear();
thread_data.clear();
{
std::lock_guard lock(profile_events_mutex);
host_cpu_usage.clear();
thread_data.clear();
}
}
void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, bool write_progress_on_update_)
@ -71,6 +74,8 @@ void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, bool
void ProgressIndication::addThreadIdToList(String const & host, UInt64 thread_id)
{
std::lock_guard lock(profile_events_mutex);
auto & thread_to_times = thread_data[host];
if (thread_to_times.contains(thread_id))
return;
@ -79,6 +84,8 @@ void ProgressIndication::addThreadIdToList(String const & host, UInt64 thread_id
void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread_data, UInt64 elapsed_time)
{
std::lock_guard lock(profile_events_mutex);
for (auto & new_host_map : new_thread_data)
{
host_cpu_usage[new_host_map.first] = calculateCPUUsage(new_host_map.second, elapsed_time);
@ -88,6 +95,8 @@ void ProgressIndication::updateThreadEventData(HostToThreadTimesMap & new_thread
size_t ProgressIndication::getUsedThreadsCount() const
{
std::lock_guard lock(profile_events_mutex);
return std::accumulate(thread_data.cbegin(), thread_data.cend(), 0,
[] (size_t acc, auto const & threads)
{
@ -97,6 +106,8 @@ size_t ProgressIndication::getUsedThreadsCount() const
double ProgressIndication::getCPUUsage() const
{
std::lock_guard lock(profile_events_mutex);
double res = 0;
for (const auto & elem : host_cpu_usage)
res += elem.second;
@ -105,6 +116,8 @@ double ProgressIndication::getCPUUsage() const
ProgressIndication::MemoryUsage ProgressIndication::getMemoryUsage() const
{
std::lock_guard lock(profile_events_mutex);
return std::accumulate(thread_data.cbegin(), thread_data.cend(), MemoryUsage{},
[](MemoryUsage const & acc, auto const & host_data)
{
@ -137,6 +150,8 @@ void ProgressIndication::writeFinalProgress()
void ProgressIndication::writeProgress()
{
std::lock_guard lock(progress_mutex);
/// Output all progress bar commands to stderr at once to avoid flicker.
WriteBufferFromFileDescriptor message(STDERR_FILENO, 1024);

View File

@ -2,6 +2,7 @@
#include <unordered_map>
#include <unordered_set>
#include <mutex>
#include <IO/Progress.h>
#include <Interpreters/Context.h>
#include <base/types.h>
@ -92,6 +93,16 @@ private:
std::unordered_map<String, double> host_cpu_usage;
HostToThreadTimesMap thread_data;
/// When all of the following hold:
/// - clickhouse-local
/// - input_format_parallel_parsing=true
/// - write_progress_on_update=true
///
/// concurrent access is possible to:
/// - writeProgress() (class properties) (guarded by progress_mutex)
/// - thread_data/host_cpu_usage (guarded by profile_events_mutex)
mutable std::mutex profile_events_mutex;
mutable std::mutex progress_mutex;
};
}
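
The comment block above motivates the two new mutexes. A standalone sketch of the same split (names illustrative): rendering state and profile-event state are guarded independently, so a slow writeProgress() never blocks threads that only update per-host statistics, and vice versa:

#include <cstddef>
#include <map>
#include <mutex>
#include <string>

// Sketch of the two-mutex split: each piece of state is guarded by exactly one
// mutex, and no method takes both, so the two groups of callers never contend.
class ProgressLike
{
    mutable std::mutex profile_events_mutex;
    mutable std::mutex progress_mutex;

    std::map<std::string, double> host_cpu_usage;  // guarded by profile_events_mutex
    size_t written_progress_chars = 0;             // guarded by progress_mutex

public:
    void updateHostCpu(const std::string & host, double usage)
    {
        std::lock_guard lock(profile_events_mutex);
        host_cpu_usage[host] = usage;
    }

    void writeProgress()
    {
        std::lock_guard lock(progress_mutex);
        ++written_progress_chars;  // stands in for the real terminal output
    }
};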

View File

@ -147,6 +147,11 @@ public:
update(x.data(), x.length());
}
ALWAYS_INLINE void update(const std::string_view x)
{
update(x.data(), x.size());
}
/// Get the result in some form. This can only be done once!
void get128(char * out)

View File

@ -46,17 +46,17 @@ using ACLs = std::vector<ACL>;
struct Stat
{
int64_t czxid;
int64_t mzxid;
int64_t ctime;
int64_t mtime;
int32_t version;
int32_t cversion;
int32_t aversion;
int64_t ephemeralOwner; /// NOLINT
int32_t dataLength; /// NOLINT
int32_t numChildren; /// NOLINT
int64_t pzxid;
int64_t czxid{0};
int64_t mzxid{0};
int64_t ctime{0};
int64_t mtime{0};
int32_t version{0};
int32_t cversion{0};
int32_t aversion{0};
int64_t ephemeralOwner{0}; /// NOLINT
int32_t dataLength{0}; /// NOLINT
int32_t numChildren{0}; /// NOLINT
int64_t pzxid{0};
};
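
The change above gives every Stat member an in-class initializer. The motivation, presumably, is that a default-constructed aggregate otherwise holds indeterminate values, and reading them is undefined behavior; a minimal standalone illustration:

#include <cstdint>

struct StatOld { int64_t czxid; };     // StatOld s; reading s.czxid is undefined behavior
struct StatNew { int64_t czxid{0}; };  // StatNew s; s.czxid is guaranteed to be 0

int main()
{
    StatNew s;
    return static_cast<int>(s.czxid);  // well-defined: returns 0
}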
enum class Error : int32_t

View File

@ -5,6 +5,7 @@
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <IO/ReadHelpers.h>
#include <fmt/format.h>
#include <Common/logger_useful.h>
#include <array>
@ -27,6 +28,17 @@ void ZooKeeperResponse::write(WriteBuffer & out) const
out.next();
}
std::string ZooKeeperRequest::toString() const
{
return fmt::format(
"XID = {}\n"
"OpNum = {}\n"
"Additional info:\n{}",
xid,
getOpNum(),
toStringImpl());
}
void ZooKeeperRequest::write(WriteBuffer & out) const
{
/// Excessive copy to calculate length.
@ -48,6 +60,11 @@ void ZooKeeperSyncRequest::readImpl(ReadBuffer & in)
Coordination::read(path, in);
}
std::string ZooKeeperSyncRequest::toStringImpl() const
{
return fmt::format("path = {}", path);
}
void ZooKeeperSyncResponse::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -93,6 +110,15 @@ void ZooKeeperAuthRequest::readImpl(ReadBuffer & in)
Coordination::read(data, in);
}
std::string ZooKeeperAuthRequest::toStringImpl() const
{
return fmt::format(
"type = {}\n"
"scheme = {}",
type,
scheme);
}
void ZooKeeperCreateRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
@ -124,6 +150,17 @@ void ZooKeeperCreateRequest::readImpl(ReadBuffer & in)
is_sequential = true;
}
std::string ZooKeeperCreateRequest::toStringImpl() const
{
return fmt::format(
"path = {}\n"
"is_ephemeral = {}\n"
"is_sequential = {}",
path,
is_ephemeral,
is_sequential);
}
void ZooKeeperCreateResponse::readImpl(ReadBuffer & in)
{
Coordination::read(path_created, in);
@ -140,6 +177,15 @@ void ZooKeeperRemoveRequest::writeImpl(WriteBuffer & out) const
Coordination::write(version, out);
}
std::string ZooKeeperRemoveRequest::toStringImpl() const
{
return fmt::format(
"path = {}\n"
"version = {}",
path,
version);
}
void ZooKeeperRemoveRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -158,6 +204,11 @@ void ZooKeeperExistsRequest::readImpl(ReadBuffer & in)
Coordination::read(has_watch, in);
}
std::string ZooKeeperExistsRequest::toStringImpl() const
{
return fmt::format("path = {}", path);
}
void ZooKeeperExistsResponse::readImpl(ReadBuffer & in)
{
Coordination::read(stat, in);
@ -180,6 +231,11 @@ void ZooKeeperGetRequest::readImpl(ReadBuffer & in)
Coordination::read(has_watch, in);
}
std::string ZooKeeperGetRequest::toStringImpl() const
{
return fmt::format("path = {}", path);
}
void ZooKeeperGetResponse::readImpl(ReadBuffer & in)
{
Coordination::read(data, in);
@ -206,6 +262,15 @@ void ZooKeeperSetRequest::readImpl(ReadBuffer & in)
Coordination::read(version, in);
}
std::string ZooKeeperSetRequest::toStringImpl() const
{
return fmt::format(
"path = {}\n"
"version = {}",
path,
version);
}
void ZooKeeperSetResponse::readImpl(ReadBuffer & in)
{
Coordination::read(stat, in);
@ -228,6 +293,11 @@ void ZooKeeperListRequest::readImpl(ReadBuffer & in)
Coordination::read(has_watch, in);
}
std::string ZooKeeperListRequest::toStringImpl() const
{
return fmt::format("path = {}", path);
}
void ZooKeeperListResponse::readImpl(ReadBuffer & in)
{
Coordination::read(names, in);
@ -255,6 +325,11 @@ void ZooKeeperSetACLRequest::readImpl(ReadBuffer & in)
Coordination::read(version, in);
}
std::string ZooKeeperSetACLRequest::toStringImpl() const
{
return fmt::format("path = {}\nversion = {}", path, version);
}
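
Note on the two toStringImpl bodies repaired above (this one and ZooKeeperCheckRequest below): as originally written, one format string was split in two, fmt::format("path = {}\n", "version = {}", path, version). fmt consumes the literal "version = {}" as the sole {} argument and, because extra arguments are not an error, silently drops path and version. A standalone sketch of the pitfall and the fix:

#include <fmt/format.h>
#include <iostream>
#include <string>

int main()
{
    std::string path = "/a";
    int version = 3;

    // Buggy form: the second string literal fills the single {}, while `path`
    // and `version` are silently ignored as unused arguments.
    std::cout << fmt::format("path = {}\n", "version = {}", path, version);
    // prints: path = version = {}

    // Repaired form: a single format string, both values are printed.
    std::cout << fmt::format("path = {}\nversion = {}\n", path, version);
    // prints: path = /a
    //         version = 3
}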
void ZooKeeperSetACLResponse::writeImpl(WriteBuffer & out) const
{
Coordination::write(stat, out);
@ -275,6 +350,11 @@ void ZooKeeperGetACLRequest::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
std::string ZooKeeperGetACLRequest::toStringImpl() const
{
return fmt::format("path = {}", path);
}
void ZooKeeperGetACLResponse::writeImpl(WriteBuffer & out) const
{
Coordination::write(acl, out);
@ -299,6 +379,11 @@ void ZooKeeperCheckRequest::readImpl(ReadBuffer & in)
Coordination::read(version, in);
}
std::string ZooKeeperCheckRequest::toStringImpl() const
{
return fmt::format("path = {}\nversion = {}", path, version);
}
void ZooKeeperErrorResponse::readImpl(ReadBuffer & in)
{
Coordination::Error read_error;
@ -401,6 +486,17 @@ void ZooKeeperMultiRequest::readImpl(ReadBuffer & in)
}
}
std::string ZooKeeperMultiRequest::toStringImpl() const
{
auto out = fmt::memory_buffer();
for (const auto & request : requests)
{
const auto & zk_request = dynamic_cast<const ZooKeeperRequest &>(*request);
format_to(std::back_inserter(out), "SubRequest\n{}\n", zk_request.toString());
}
return {out.data(), out.size()};
}
bool ZooKeeperMultiRequest::isReadRequest() const
{
/// Possibly we can do better

View File

@ -68,10 +68,13 @@ struct ZooKeeperRequest : virtual Request
/// Writes length, xid, op_num, then the rest.
void write(WriteBuffer & out) const;
std::string toString() const;
virtual void writeImpl(WriteBuffer &) const = 0;
virtual void readImpl(ReadBuffer &) = 0;
virtual std::string toStringImpl() const { return ""; }
static std::shared_ptr<ZooKeeperRequest> read(ReadBuffer & in);
virtual ZooKeeperResponsePtr makeResponse() const = 0;
@ -100,6 +103,7 @@ struct ZooKeeperSyncRequest final : ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Sync; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -150,6 +154,7 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Auth; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -202,6 +207,7 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Create; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -232,6 +238,7 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Remove; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -255,6 +262,7 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Exists; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return true; }
@ -278,6 +286,7 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Get; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return true; }
@ -304,6 +313,7 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Set; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -328,6 +338,7 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::List; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return true; }
@ -363,6 +374,7 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Check; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return true; }
@ -397,6 +409,7 @@ struct ZooKeeperSetACLRequest final : SetACLRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::SetACL; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -417,6 +430,7 @@ struct ZooKeeperGetACLRequest final : GetACLRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::GetACL; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return true; }
@ -441,6 +455,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl() const override;
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override;

View File

@ -1,7 +1,8 @@
#include <iomanip>
#include <iostream>
#include <gtest/gtest.h>
#include <Common/FileCache.h>
#include <Common/LRUFileCache.h>
#include <Common/FileSegment.h>
#include <Common/CurrentThread.h>
#include <Common/filesystemHelpers.h>
#include <Common/FileCacheSettings.h>
@ -46,14 +47,9 @@ std::vector<DB::FileSegmentPtr> fromHolder(const DB::FileSegmentsHolder & holder
return std::vector<DB::FileSegmentPtr>(holder.file_segments.begin(), holder.file_segments.end());
}
String keyToStr(const DB::IFileCache::Key & key)
{
return getHexUIntLowercase(key);
}
String getFileSegmentPath(const String & base_path, const DB::IFileCache::Key & key, size_t offset)
{
auto key_str = keyToStr(key);
auto key_str = key.toString();
return fs::path(base_path) / key_str.substr(0, 3) / key_str / DB::toString(offset);
}
@ -62,7 +58,7 @@ void download(DB::FileSegmentPtr file_segment)
const auto & key = file_segment->key();
size_t size = file_segment->range().size();
auto key_str = keyToStr(key);
auto key_str = key.toString();
auto subdir = fs::path(cache_base_path) / key_str.substr(0, 3) / key_str;
if (!fs::exists(subdir))
fs::create_directories(subdir);
@ -112,7 +108,7 @@ TEST(LRUFileCache, get)
auto key = cache.hash("key1");
{
auto holder = cache.getOrSet(key, 0, 10); /// Add range [0, 9]
auto holder = cache.getOrSet(key, 0, 10, false); /// Add range [0, 9]
auto segments = fromHolder(holder);
/// Range was not present in the cache. It should be added to the cache as one whole file segment.
ASSERT_EQ(segments.size(), 1);
@ -141,7 +137,7 @@ TEST(LRUFileCache, get)
{
/// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache.
auto holder = cache.getOrSet(key, 5, 10);
auto holder = cache.getOrSet(key, 5, 10, false);
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 2);
@ -161,14 +157,14 @@ TEST(LRUFileCache, get)
ASSERT_EQ(cache.getUsedCacheSize(), 15);
{
auto holder = cache.getOrSet(key, 9, 1); /// Get [9, 9]
auto holder = cache.getOrSet(key, 9, 1, false); /// Get [9, 9]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assertRange(7, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED);
}
{
auto holder = cache.getOrSet(key, 9, 2); /// Get [9, 10]
auto holder = cache.getOrSet(key, 9, 2, false); /// Get [9, 10]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 2);
assertRange(8, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED);
@ -176,16 +172,15 @@ TEST(LRUFileCache, get)
}
{
auto holder = cache.getOrSet(key, 10, 1); /// Get [10, 10]
auto holder = cache.getOrSet(key, 10, 1, false); /// Get [10, 10]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assertRange(10, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED);
}
complete(cache.getOrSet(key, 17, 4)); /// Get [17, 20]
complete(cache.getOrSet(key, 24, 3)); /// Get [24, 26]
// complete(cache.getOrSet(key, 27, 1)); /// Get [27, 27]
complete(cache.getOrSet(key, 17, 4, false)); /// Get [17, 20]
complete(cache.getOrSet(key, 24, 3, false)); /// Get [24, 26]
/// complete(cache.getOrSet(key, 27, 1, false)); /// Get [27, 27]
/// Current cache: [__________][_____] [____] [___][]
/// ^ ^^ ^ ^ ^ ^ ^^^
@ -195,7 +190,7 @@ TEST(LRUFileCache, get)
ASSERT_EQ(cache.getUsedCacheSize(), 22);
{
auto holder = cache.getOrSet(key, 0, 26); /// Get [0, 25]
auto holder = cache.getOrSet(key, 0, 26, false); /// Get [0, 25]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 6);
@ -229,14 +224,14 @@ TEST(LRUFileCache, get)
/// as the maximum number of elements is reached, the next attempt to put something into the cache should fail.
/// This will also check that [27, 27] was indeed evicted.
auto holder1 = cache.getOrSet(key, 27, 1);
auto holder1 = cache.getOrSet(key, 27, 1, false);
auto segments_1 = fromHolder(holder1); /// Get [27, 27]
ASSERT_EQ(segments_1.size(), 1);
assertRange(17, segments_1[0], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY);
}
{
auto holder = cache.getOrSet(key, 12, 10); /// Get [12, 21]
auto holder = cache.getOrSet(key, 12, 10, false); /// Get [12, 21]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 4);
@ -260,7 +255,7 @@ TEST(LRUFileCache, get)
ASSERT_EQ(cache.getFileSegmentsNum(), 5);
{
auto holder = cache.getOrSet(key, 23, 5); /// Get [23, 28]
auto holder = cache.getOrSet(key, 23, 5, false); /// Get [23, 28]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 3);
@ -281,12 +276,12 @@ TEST(LRUFileCache, get)
/// 17 21 2324 26 28
{
auto holder5 = cache.getOrSet(key, 2, 3); /// Get [2, 4]
auto holder5 = cache.getOrSet(key, 2, 3, false); /// Get [2, 4]
auto s5 = fromHolder(holder5);
ASSERT_EQ(s5.size(), 1);
assertRange(25, s5[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::EMPTY);
auto holder1 = cache.getOrSet(key, 30, 2); /// Get [30, 31]
auto holder1 = cache.getOrSet(key, 30, 2, false); /// Get [30, 31]
auto s1 = fromHolder(holder1);
ASSERT_EQ(s1.size(), 1);
assertRange(26, s1[0], DB::FileSegment::Range(30, 31), DB::FileSegment::State::EMPTY);
@ -302,20 +297,20 @@ TEST(LRUFileCache, get)
/// ^ ^ ^ ^ ^ ^ ^ ^
/// 2 4 23 24 26 27 30 31
auto holder2 = cache.getOrSet(key, 23, 1); /// Get [23, 23]
auto holder2 = cache.getOrSet(key, 23, 1, false); /// Get [23, 23]
auto s2 = fromHolder(holder2);
ASSERT_EQ(s2.size(), 1);
auto holder3 = cache.getOrSet(key, 24, 3); /// Get [24, 26]
auto holder3 = cache.getOrSet(key, 24, 3, false); /// Get [24, 26]
auto s3 = fromHolder(holder3);
ASSERT_EQ(s3.size(), 1);
auto holder4 = cache.getOrSet(key, 27, 1); /// Get [27, 27]
auto holder4 = cache.getOrSet(key, 27, 1, false); /// Get [27, 27]
auto s4 = fromHolder(holder4);
ASSERT_EQ(s4.size(), 1);
/// The whole cache is now unreleasable because the pointers are still held
auto holder6 = cache.getOrSet(key, 0, 40);
auto holder6 = cache.getOrSet(key, 0, 40, false);
auto f = fromHolder(holder6);
ASSERT_EQ(f.size(), 9);
@ -336,7 +331,7 @@ TEST(LRUFileCache, get)
}
{
auto holder = cache.getOrSet(key, 2, 3); /// Get [2, 4]
auto holder = cache.getOrSet(key, 2, 3, false); /// Get [2, 4]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 1);
assertRange(31, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED);
@ -347,7 +342,7 @@ TEST(LRUFileCache, get)
/// 2 4 23 24 26 27 30 31
{
auto holder = cache.getOrSet(key, 25, 5); /// Get [25, 29]
auto holder = cache.getOrSet(key, 25, 5, false); /// Get [25, 29]
auto segments = fromHolder(holder);
ASSERT_EQ(segments.size(), 3);
@ -371,7 +366,7 @@ TEST(LRUFileCache, get)
DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1);
thread_status_1.attachQueryContext(query_context_1);
auto holder_2 = cache.getOrSet(key, 25, 5); /// Get [25, 29] once again.
auto holder_2 = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] once again.
auto segments_2 = fromHolder(holder_2);
ASSERT_EQ(segments.size(), 3);
@ -414,7 +409,7 @@ TEST(LRUFileCache, get)
/// and notify_all() is also called from destructor of holder.
std::optional<DB::FileSegmentsHolder> holder;
holder.emplace(cache.getOrSet(key, 3, 23)); /// Get [3, 25]
holder.emplace(cache.getOrSet(key, 3, 23, false)); /// Get [3, 25]
auto segments = fromHolder(*holder);
ASSERT_EQ(segments.size(), 3);
@ -440,7 +435,7 @@ TEST(LRUFileCache, get)
DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1);
thread_status_1.attachQueryContext(query_context_1);
auto holder_2 = cache.getOrSet(key, 3, 23); /// Get [3, 25] once again
auto holder_2 = cache.getOrSet(key, 3, 23, false); /// Get [3, 25] once again
auto segments_2 = fromHolder(*holder);
ASSERT_EQ(segments_2.size(), 3);
@ -487,7 +482,8 @@ TEST(LRUFileCache, get)
auto cache2 = DB::LRUFileCache(cache_base_path, settings);
cache2.initialize();
auto holder1 = cache2.getOrSet(key, 2, 28); /// Get [2, 29]
auto holder1 = cache2.getOrSet(key, 2, 28, false); /// Get [2, 29]
auto segments1 = fromHolder(holder1);
ASSERT_EQ(segments1.size(), 5);
@ -506,7 +502,7 @@ TEST(LRUFileCache, get)
auto cache2 = DB::LRUFileCache(caches_dir / "cache2", settings2);
cache2.initialize();
auto holder1 = cache2.getOrSet(key, 0, 25); /// Get [0, 24]
auto holder1 = cache2.getOrSet(key, 0, 25, false); /// Get [0, 24]
auto segments1 = fromHolder(holder1);
ASSERT_EQ(segments1.size(), 3);

View File

@ -442,14 +442,14 @@ void KeeperDispatcher::finishSession(int64_t session_id)
void KeeperDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error)
{
for (const auto & [session_id, time, request] : requests_for_sessions)
for (const auto & request_for_session : requests_for_sessions)
{
KeeperStorage::ResponsesForSessions responses;
auto response = request->makeResponse();
response->xid = request->xid;
auto response = request_for_session.request->makeResponse();
response->xid = request_for_session.request->xid;
response->zxid = 0;
response->error = error;
if (!responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response}))
if (!responses_queue.push(DB::KeeperStorage::ResponseForSession{request_for_session.session_id, response}))
throw Exception(ErrorCodes::SYSTEM_ERROR,
"Could not push error response xid {} zxid {} error message {} to responses queue",
response->xid,

View File

@ -15,6 +15,8 @@
#include <IO/WriteHelpers.h>
#include <boost/algorithm/string.hpp>
#include <libnuraft/cluster_config.hxx>
#include <libnuraft/log_val_type.hxx>
#include <libnuraft/ptr.hxx>
#include <libnuraft/raft_server.hxx>
#include <Poco/Util/AbstractConfiguration.h>
#include <Poco/Util/Application.h>
@ -108,7 +110,8 @@ KeeperServer::KeeperServer(
snapshots_queue_,
configuration_and_settings_->snapshot_storage_path,
coordination_settings,
checkAndGetSuperdigest(configuration_and_settings_->super_digest)))
checkAndGetSuperdigest(configuration_and_settings_->super_digest),
config.getBool("keeper_server.digest_enabled", true)))
, state_manager(nuraft::cs_new<KeeperStateManager>(
server_id, "keeper_server", configuration_and_settings_->log_storage_path, config, coordination_settings))
, log(&Poco::Logger::get("KeeperServer"))
@ -315,6 +318,23 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items);
auto log_store = state_manager->load_log_store();
auto next_log_idx = log_store->next_slot();
if (next_log_idx > 0 && next_log_idx > state_machine->last_commit_index())
{
auto log_entries = log_store->log_entries(state_machine->last_commit_index() + 1, next_log_idx);
LOG_INFO(log, "Preprocessing {} log entries", log_entries->size());
auto idx = state_machine->last_commit_index() + 1;
for (const auto & entry : *log_entries)
{
if (entry && entry->get_val_type() == nuraft::log_val_type::app_log)
state_machine->pre_commit(idx, entry->get_buf());
++idx;
}
}
loadLatestConfig();
last_local_config = state_manager->parseServersConfiguration(config, true).cluster_config;
@ -367,18 +387,35 @@ void KeeperServer::shutdown()
namespace
{
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(int64_t session_id, int64_t time, const Coordination::ZooKeeperRequestPtr & request)
// Serialize the request with all the necessary information for the leader
// we don't know the ZXID and digest yet, so we don't serialize them
nuraft::ptr<nuraft::buffer> getZooKeeperRequestMessage(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer buf;
DB::writeIntBinary(session_id, buf);
request->write(buf);
DB::writeIntBinary(time, buf);
return buf.getBuffer();
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
return write_buf.getBuffer();
}
// Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
DB::writeIntBinary(request_for_session.zxid, write_buf);
assert(request_for_session.digest);
DB::writeIntBinary(request_for_session.digest->version, write_buf);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
DB::writeIntBinary(request_for_session.digest->value, write_buf);
return write_buf.getBuffer();
}
}
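
getZooKeeperLogEntry above appends the fields new in this commit (zxid, then a versioned digest) after the legacy ones (session_id, request body, time). Keeping new fields strictly at the tail is what lets parseRequest, later in this commit, read old and new entries alike. A standalone sketch of the layout (buffer type and values are invented for illustration):

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Sketch of the log-entry layout:
//   session_id | serialized request | time | zxid | digest version | [digest value]
// Entries from older nodes simply stop after `time`.
struct Writer
{
    std::vector<char> buf;

    template <typename T>
    void writeInt(T value)
    {
        const char * p = reinterpret_cast<const char *>(&value);
        buf.insert(buf.end(), p, p + sizeof(T));
    }
};

int main()
{
    Writer out;
    out.writeInt<int64_t>(42);       // session_id
    /* ... serialized request body would go here ... */
    out.writeInt<int64_t>(1000);     // time
    out.writeInt<int64_t>(7);        // zxid (new)
    out.writeInt<uint8_t>(1);        // digest version; non-zero means a value follows
    out.writeInt<uint64_t>(0xABCD);  // digest value (new)
    assert(out.buf.size() == 8 + 8 + 8 + 1 + 8);
}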
void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & request_for_session)
{
if (!request_for_session.request->isReadRequest())
@ -390,8 +427,10 @@ void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession &
RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions)
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & [session_id, time, request] : requests_for_sessions)
entries.push_back(getZooKeeperLogEntry(session_id, time, request));
for (const auto & request_for_session : requests_for_sessions)
{
entries.push_back(getZooKeeperRequestMessage(request_for_session));
}
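
This loop (like the one in KeeperDispatcher::addErrorResponses earlier in the diff) was rewritten from a structured binding to explicit member access. A plausible reason, consistent with the rest of this commit, is that RequestForSession gains extra fields (zxid, digest), and a structured binding must name every non-static data member, so the old three-name form would stop compiling. Minimal illustration with an invented struct:

#include <cstdint>

struct RequestForSessionLike
{
    int64_t session_id;
    int64_t time;
    int64_t zxid;  // newly added field
};

int main()
{
    RequestForSessionLike r{1, 2, 3};
    // auto [session_id, time] = r;      // error: 3 members, only 2 names
    auto [session_id, time, zxid] = r;   // a binding must cover every member
    return static_cast<int>(session_id + time + zxid);
}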
std::lock_guard lock{server_write_mutex};
if (is_recovering)
@ -501,7 +540,33 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
}
if (initialized_flag)
{
switch (type)
{
// This event is called before a single log entry is appended to the log on the leader node
case nuraft::cb_func::PreAppendLog:
{
// we are relying on the fact that requests are being processed under a mutex
// and not a RW lock
auto & entry = *static_cast<LogEntryPtr *>(param->ctx);
assert(entry->get_val_type() == nuraft::app_log);
auto next_zxid = state_machine->getNextZxid();
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
request_for_session.zxid = next_zxid;
state_machine->preprocess(request_for_session);
request_for_session.digest = state_machine->getNodesDigest();
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), getZooKeeperLogEntry(request_for_session), entry->get_val_type());
break;
}
default:
break;
}
return nuraft::cb_func::ReturnCode::Ok;
}
size_t last_commited = state_machine->last_commit_index();
size_t next_index = state_manager->getLogStore()->next_slot();

View File

@ -1,14 +1,14 @@
#include <Coordination/KeeperSnapshotManager.h>
#include <IO/WriteHelpers.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedWriteBuffer.h>
#include <IO/ReadHelpers.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/pathUtils.h>
#include <filesystem>
#include <memory>
@ -149,6 +149,19 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
{
writeBinary(static_cast<uint8_t>(snapshot.version), out);
serializeSnapshotMetadata(snapshot.snapshot_meta, out);
if (snapshot.version >= SnapshotVersion::V5)
{
writeBinary(snapshot.zxid, out);
if (snapshot.storage->digest_enabled)
{
writeBinary(static_cast<uint8_t>(KeeperStorage::CURRENT_DIGEST_VERSION), out);
writeBinary(snapshot.nodes_digest, out);
}
else
writeBinary(static_cast<uint8_t>(KeeperStorage::NO_DIGEST), out);
}
writeBinary(snapshot.session_id, out);
/// Better to sort before serialization, otherwise snapshots can be different on different replicas
@ -178,7 +191,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
/// A benign race condition is possible while taking a snapshot: NuRaft decides to create a snapshot at some log id,
/// and only after some time do we lock the storage and enable snapshot mode. So snapshot_container_size can be
/// slightly bigger than required.
if (static_cast<size_t>(node.stat.mzxid) > snapshot.snapshot_meta->get_last_log_idx())
if (node.stat.mzxid > snapshot.zxid)
break;
writeBinary(path, out);
@ -194,7 +207,8 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
/// Sessions must be saved in a sorted order,
/// otherwise snapshots will be different
std::vector<std::pair<int64_t, int64_t>> sorted_session_and_timeout(snapshot.session_and_timeout.begin(), snapshot.session_and_timeout.end());
std::vector<std::pair<int64_t, int64_t>> sorted_session_and_timeout(
snapshot.session_and_timeout.begin(), snapshot.session_and_timeout.end());
::sort(sorted_session_and_timeout.begin(), sorted_session_and_timeout.end());
/// Serialize sessions
@ -238,10 +252,34 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in);
KeeperStorage & storage = *deserialization_result.storage;
bool recalculate_digest = storage.digest_enabled;
if (version >= SnapshotVersion::V5)
{
readBinary(storage.zxid, in);
uint8_t digest_version;
readBinary(digest_version, in);
if (digest_version != KeeperStorage::DigestVersion::NO_DIGEST)
{
uint64_t nodes_digest;
readBinary(nodes_digest, in);
if (digest_version == KeeperStorage::CURRENT_DIGEST_VERSION)
{
storage.nodes_digest = nodes_digest;
recalculate_digest = false;
}
}
storage.old_snapshot_zxid = 0;
}
else
{
storage.zxid = deserialization_result.snapshot_meta->get_last_log_idx();
storage.old_snapshot_zxid = storage.zxid;
}
int64_t session_id;
readBinary(session_id, in);
storage.zxid = deserialization_result.snapshot_meta->get_last_log_idx();
storage.session_id_counter = session_id;
/// Before V1 we serialized ACL without acl_map
@ -275,6 +313,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
size_t snapshot_container_size;
readBinary(snapshot_container_size, in);
if (recalculate_digest)
storage.nodes_digest = 0;
size_t current_size = 0;
while (current_size < snapshot_container_size)
{
@ -287,6 +328,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
storage.ephemerals[node.stat.ephemeralOwner].insert(path);
current_size++;
if (recalculate_digest)
storage.nodes_digest += node.getDigest(path);
}
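
The recalculate_digest path above folds every node into storage.nodes_digest as it is loaded, so snapshots written without a usable digest (older snapshot versions, or a different digest version) still end up with a correct running value. Because the fold is a plain sum, it is order-independent, and removing a node later can subtract the same per-node digest back out. A standalone sketch, with std::hash standing in for node.getDigest(path):

#include <cstdint>
#include <functional>
#include <map>
#include <string>

// Rebuild the running digest by folding in every (path, data) pair; addition makes
// the result independent of iteration order and invertible on node removal.
uint64_t recalculateDigest(const std::map<std::string, std::string> & nodes)
{
    uint64_t digest = 0;
    for (const auto & [path, data] : nodes)
        digest += std::hash<std::string>{}(path + data);  // stand-in for node.getDigest(path)
    return digest;
}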
for (const auto & itr : storage.container)
@ -294,7 +338,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
if (itr.key != "/")
{
auto parent_path = parentPath(itr.key);
storage.container.updateValue(parent_path, [path = itr.key] (KeeperStorage::Node & value) { value.addChild(getBaseName(path)); });
storage.container.updateValue(
parent_path, [path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseName(path)); });
}
}
@ -368,6 +413,8 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t
, snapshot_meta(std::make_shared<SnapshotMetadata>(up_to_log_idx_, 0, std::make_shared<nuraft::cluster_config>()))
, session_id(storage->session_id_counter)
, cluster_config(cluster_config_)
, zxid(storage->zxid)
, nodes_digest(storage->nodes_digest)
{
auto [size, ver] = storage->container.snapshotSizeWithVersion();
snapshot_container_size = size;
@ -378,11 +425,14 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t
session_and_auth = storage->session_and_auth;
}
KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_)
KeeperStorageSnapshot::KeeperStorageSnapshot(
KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_)
: storage(storage_)
, snapshot_meta(snapshot_meta_)
, session_id(storage->session_id_counter)
, cluster_config(cluster_config_)
, zxid(storage->zxid)
, nodes_digest(storage->nodes_digest)
{
auto [size, ver] = storage->container.snapshotSizeWithVersion();
snapshot_container_size = size;
@ -399,14 +449,18 @@ KeeperStorageSnapshot::~KeeperStorageSnapshot()
}
KeeperSnapshotManager::KeeperSnapshotManager(
const std::string & snapshots_path_, size_t snapshots_to_keep_,
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
bool compress_snapshots_zstd_,
const std::string & superdigest_, size_t storage_tick_time_)
const std::string & superdigest_,
size_t storage_tick_time_,
const bool digest_enabled_)
: snapshots_path(snapshots_path_)
, snapshots_to_keep(snapshots_to_keep_)
, compress_snapshots_zstd(compress_snapshots_zstd_)
, superdigest(superdigest_)
, storage_tick_time(storage_tick_time_)
, digest_enabled(digest_enabled_)
{
namespace fs = std::filesystem;
@ -529,7 +583,7 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff
compressed_reader = std::make_unique<CompressedReadBuffer>(*reader);
SnapshotDeserializationResult result;
result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest);
result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, digest_enabled);
KeeperStorageSnapshot::deserialize(result, *compressed_reader);
return result;
}
@ -568,7 +622,7 @@ std::pair<std::string, std::error_code> KeeperSnapshotManager::serializeSnapshot
std::string tmp_snapshot_path = std::filesystem::path{snapshots_path} / tmp_snapshot_file_name;
std::string new_snapshot_path = std::filesystem::path{snapshots_path} / snapshot_file_name;
auto writer = std::make_unique<WriteBufferFromFile>(tmp_snapshot_path, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC| O_APPEND);
auto writer = std::make_unique<WriteBufferFromFile>(tmp_snapshot_path, O_WRONLY | O_TRUNC | O_CREAT | O_CLOEXEC | O_APPEND);
std::unique_ptr<WriteBuffer> compressed_writer;
if (compress_snapshots_zstd)
compressed_writer = wrapWriteBufferWithCompressionMethod(std::move(writer), CompressionMethod::Zstd, 3);

View File

@ -1,10 +1,10 @@
#pragma once
#include <filesystem>
#include <system_error>
#include <libnuraft/nuraft.hxx>
#include <Coordination/KeeperStorage.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
#include <libnuraft/nuraft.hxx>
namespace DB
{
@ -21,9 +21,10 @@ enum SnapshotVersion : uint8_t
V2 = 2, /// with 64 bit buffer header
V3 = 3, /// compress snapshots with ZSTD codec
V4 = 4, /// add Node size to snapshots
V5 = 5, /// add ZXID and digest to snapshots
};
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V4;
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V5;
/// What is stored in a binary snapshot
struct SnapshotDeserializationResult
@ -49,7 +50,8 @@ struct KeeperStorageSnapshot
public:
KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t up_to_log_idx_, const ClusterConfigPtr & cluster_config_ = nullptr);
KeeperStorageSnapshot(KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_ = nullptr);
KeeperStorageSnapshot(
KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_ = nullptr);
~KeeperStorageSnapshot();
@ -77,6 +79,10 @@ public:
std::unordered_map<uint64_t, Coordination::ACLs> acl_map;
/// Cluster config from snapshot, can be empty
ClusterConfigPtr cluster_config;
/// Last committed ZXID
int64_t zxid;
/// Current digest of committed nodes
uint64_t nodes_digest;
};
using KeeperStorageSnapshotPtr = std::shared_ptr<KeeperStorageSnapshot>;
@ -91,8 +97,12 @@ class KeeperSnapshotManager
{
public:
KeeperSnapshotManager(
const std::string & snapshots_path_, size_t snapshots_to_keep_,
bool compress_snapshots_zstd_ = true, const std::string & superdigest_ = "", size_t storage_tick_time_ = 500);
const std::string & snapshots_path_,
size_t snapshots_to_keep_,
bool compress_snapshots_zstd_ = true,
const std::string & superdigest_ = "",
size_t storage_tick_time_ = 500,
bool digest_enabled_ = true);
/// Restore storage from latest available snapshot
SnapshotDeserializationResult restoreFromLatestSnapshot();
@ -118,10 +128,7 @@ public:
void removeSnapshot(uint64_t log_idx);
/// Total amount of snapshots
size_t totalSnapshots() const
{
return existing_snapshots.size();
}
size_t totalSnapshots() const { return existing_snapshots.size(); }
/// The most fresh snapshot log index we have
size_t getLatestSnapshotIndex() const
@ -161,6 +168,7 @@ private:
const std::string superdigest;
/// Storage session timeout check interval (also used for deserialization)
size_t storage_tick_time;
const bool digest_enabled;
};
/// Keeper create snapshots in background thread. KeeperStateMachine just create

View File

@ -1,12 +1,14 @@
#include <sys/mman.h>
#include <cerrno>
#include <future>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <sys/mman.h>
#include "Common/ZooKeeper/ZooKeeperCommon.h"
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Coordination/KeeperSnapshotManager.h>
#include <future>
#include "Coordination/KeeperStorage.h"
namespace DB
{
@ -19,52 +21,29 @@ namespace ErrorCodes
namespace
{
KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data)
{
ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session;
readIntBinary(request_for_session.session_id, buffer);
int32_t length;
Coordination::read(length, buffer);
int32_t xid;
Coordination::read(xid, buffer);
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid;
request_for_session.request->readImpl(buffer);
if (!buffer.eof())
readIntBinary(request_for_session.time, buffer);
else /// backward compatibility
request_for_session.time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
return request_for_session;
}
}
KeeperStateMachine::KeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_)
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_,
const bool digest_enabled_)
: coordination_settings(coordination_settings_)
, snapshot_manager(
snapshots_path_, coordination_settings->snapshots_to_keep,
coordination_settings->compress_snapshots_with_zstd_format, superdigest_,
coordination_settings->dead_session_check_period_ms.totalMilliseconds())
snapshots_path_,
coordination_settings->snapshots_to_keep,
coordination_settings->compress_snapshots_with_zstd_format,
superdigest_,
coordination_settings->dead_session_check_period_ms.totalMilliseconds(),
digest_enabled_)
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_)
, digest_enabled(digest_enabled_)
{
}
@ -82,7 +61,8 @@ void KeeperStateMachine::init()
try
{
auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index));
auto snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index));
latest_snapshot_path = snapshot_manager.getLatestSnapshotPath();
storage = std::move(snapshot_deserialization_result.storage);
latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta;
@ -93,7 +73,11 @@ void KeeperStateMachine::init()
}
catch (const DB::Exception & ex)
{
LOG_WARNING(log, "Failed to load from snapshot with index {}, with error {}, will remove it from disk", latest_log_index, ex.displayText());
LOG_WARNING(
log,
"Failed to load from snapshot with index {}, with error {}, will remove it from disk",
latest_log_index,
ex.displayText());
snapshot_manager.removeSnapshot(latest_log_index);
}
}
@ -111,16 +95,117 @@ void KeeperStateMachine::init()
}
if (!storage)
storage = std::make_unique<KeeperStorage>(coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest);
storage = std::make_unique<KeeperStorage>(
coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, digest_enabled);
}
namespace
{
void assertDigest(
const KeeperStorage::Digest & first,
const KeeperStorage::Digest & second,
const Coordination::ZooKeeperRequest & request,
bool committing)
{
if (!KeeperStorage::checkDigest(first, second))
{
LOG_FATAL(
&Poco::Logger::get("KeeperStateMachine"),
"Digest for nodes is not matching after {} request of type '{}'.\nExpected digest - {}, actual digest - {} (digest version "
"{}). Keeper will "
"terminate to avoid inconsistencies.\nExtra information about the request:\n{}",
committing ? "committing" : "preprocessing",
request.getOpNum(),
first.value,
second.value,
first.version,
request.toString());
std::terminate();
}
}
}
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
preprocess(request_for_session);
return nullptr;
}
KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer & data)
{
ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session;
readIntBinary(request_for_session.session_id, buffer);
int32_t length;
Coordination::read(length, buffer);
int32_t xid;
Coordination::read(xid, buffer);
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid;
request_for_session.request->readImpl(buffer);
if (!buffer.eof())
readIntBinary(request_for_session.time, buffer);
else /// backward compatibility
request_for_session.time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
readIntBinary(request_for_session.zxid, buffer);
if (!buffer.eof())
{
request_for_session.digest.emplace();
readIntBinary(request_for_session.digest->version, buffer);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
readIntBinary(request_for_session.digest->value, buffer);
}
return request_for_session;
}
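
parseRequest above probes buffer.eof() before each of the newer trailing fields (time, zxid, digest), so entries serialized by older nodes, which simply stop earlier, still parse correctly. A standalone sketch of the pattern with invented types (it assumes the buffer holds at least the legacy fields):

#include <cstdint>
#include <cstring>
#include <optional>
#include <vector>

struct Reader
{
    const std::vector<char> & buf;
    size_t pos = 0;

    bool eof() const { return pos >= buf.size(); }

    template <typename T>
    T readInt()
    {
        T v;
        std::memcpy(&v, buf.data() + pos, sizeof(T));
        pos += sizeof(T);
        return v;
    }
};

struct Parsed
{
    int64_t session_id = 0;
    int64_t time = 0;
    std::optional<int64_t> zxid;  // absent in entries written by older nodes
};

Parsed parse(const std::vector<char> & buf)
{
    Reader in{buf};
    Parsed p;
    p.session_id = in.readInt<int64_t>();
    p.time = in.readInt<int64_t>();
    if (!in.eof())                    // newer entry: a zxid follows
        p.zxid = in.readInt<int64_t>();
    return p;
}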
void KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & request_for_session)
{
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
std::lock_guard lock(storage_and_responses_lock);
storage->preprocessRequest(
request_for_session.request,
request_for_session.session_id,
request_for_session.time,
request_for_session.zxid,
true /* check_acl */,
request_for_session.digest);
if (digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false);
}
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
/// Special processing of session_id request
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
{
const Coordination::ZooKeeperSessionIDRequest & session_id_request = dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
const Coordination::ZooKeeperSessionIDRequest & session_id_request
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
@ -140,10 +225,19 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
else
{
std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, log_idx);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(
request_for_session.request, request_for_session.session_id, request_for_session.zxid);
for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session))
throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", response_for_session.session_id);
throw Exception(
ErrorCodes::SYSTEM_ERROR,
"Could not push response with session id {} into responses queue",
response_for_session.session_id);
if (digest_enabled && request_for_session.digest)
{
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
}
}
last_committed_idx = log_idx;
@ -157,14 +251,18 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{ /// save snapshot into memory
std::lock_guard lock(snapshots_lock);
if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Required to apply snapshot with last log index {}, but our last log index is {}",
s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx());
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Required to apply snapshot with last log index {}, but our last log index is {}",
s.get_last_log_idx(),
latest_snapshot_meta->get_last_log_idx());
latest_snapshot_ptr = latest_snapshot_buf;
}
{ /// deserialize and apply snapshot to storage
std::lock_guard lock(storage_and_responses_lock);
auto snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx()));
auto snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx()));
storage = std::move(snapshot_deserialization_result.storage);
latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta;
cluster_config = snapshot_deserialization_result.cluster_config;
@ -182,6 +280,19 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
cluster_config = ClusterConfig::deserialize(*tmp);
}
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
// If we received a log from an older node, use the log_idx as the zxid
// log_idx will always be larger than or equal to the zxid, so we can safely do this
// (log_idx is increased for all logs, while zxid is only increased for requests)
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
std::lock_guard lock(storage_and_responses_lock);
storage->rollbackRequest(request_for_session.zxid);
}
nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
{
/// Just return the latest snapshot.
@ -189,9 +300,7 @@ nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
return latest_snapshot_meta;
}
void KeeperStateMachine::create_snapshot(
nuraft::snapshot & s,
nuraft::async_result<bool>::handler_type & when_done)
void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done)
{
LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx());
@ -204,19 +313,22 @@ void KeeperStateMachine::create_snapshot(
}
/// create snapshot task for background execution (in snapshot thread)
snapshot_task.create_snapshot = [this, when_done] (KeeperStorageSnapshotPtr && snapshot)
snapshot_task.create_snapshot = [this, when_done](KeeperStorageSnapshotPtr && snapshot)
{
nuraft::ptr<std::exception> exception(nullptr);
bool ret = true;
try
{
{ /// Read storage data without locks and create snapshot
{ /// Read storage data without locks and create snapshot
std::lock_guard lock(snapshots_lock);
auto [path, error_code]= snapshot_manager.serializeSnapshotToDisk(*snapshot);
auto [path, error_code] = snapshot_manager.serializeSnapshotToDisk(*snapshot);
if (error_code)
{
throw Exception(ErrorCodes::SYSTEM_ERROR, "Snapshot {} was created failed, error: {}",
snapshot->snapshot_meta->get_last_log_idx(), error_code.message());
throw Exception(
ErrorCodes::SYSTEM_ERROR,
"Failed to create snapshot {}, error: {}",
snapshot->snapshot_meta->get_last_log_idx(),
error_code.message());
}
latest_snapshot_path = path;
latest_snapshot_meta = snapshot->snapshot_meta;
@ -251,11 +363,7 @@ void KeeperStateMachine::create_snapshot(
}
void KeeperStateMachine::save_logical_snp_obj(
nuraft::snapshot & s,
uint64_t & obj_id,
nuraft::buffer & data,
bool /*is_first_obj*/,
bool /*is_last_obj*/)
nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool /*is_first_obj*/, bool /*is_last_obj*/)
{
LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
@ -311,13 +419,8 @@ static int bufferFromFile(Poco::Logger * log, const std::string & path, nuraft::
}
int KeeperStateMachine::read_logical_snp_obj(
nuraft::snapshot & s,
void* & /*user_snp_ctx*/,
uint64_t obj_id,
nuraft::ptr<nuraft::buffer> & data_out,
bool & is_last_obj)
nuraft::snapshot & s, void *& /*user_snp_ctx*/, uint64_t obj_id, nuraft::ptr<nuraft::buffer> & data_out, bool & is_last_obj)
{
LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
std::lock_guard lock(snapshots_lock);
@ -325,8 +428,11 @@ int KeeperStateMachine::read_logical_snp_obj(
/// Let's wait and NuRaft will retry this call.
if (s.get_last_log_idx() != latest_snapshot_meta->get_last_log_idx())
{
LOG_WARNING(log, "Required to apply snapshot with last log index {}, but our last log index is {}. Will ignore this one and retry",
s.get_last_log_idx(), latest_snapshot_meta->get_last_log_idx());
LOG_WARNING(
log,
"Required to apply snapshot with last log index {}, but our last log index is {}. Will ignore this one and retry",
s.get_last_log_idx(),
latest_snapshot_meta->get_last_log_idx());
return -1;
}
if (bufferFromFile(log, latest_snapshot_path, data_out))
@ -343,10 +449,16 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
{
/// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest(request_for_session.request, request_for_session.session_id, request_for_session.time, std::nullopt);
auto responses = storage->processRequest(
request_for_session.request,
request_for_session.session_id,
std::nullopt,
true /*check_acl*/,
true /*is_local*/);
for (const auto & response : responses)
if (!responses_queue.push(response))
throw Exception(ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", response.session_id);
throw Exception(
ErrorCodes::SYSTEM_ERROR, "Could not push response with session id {} into responses queue", response.session_id);
}
void KeeperStateMachine::shutdownStorage()
@ -361,6 +473,18 @@ std::vector<int64_t> KeeperStateMachine::getDeadSessions()
return storage->getDeadSessions();
}
int64_t KeeperStateMachine::getNextZxid() const
{
std::lock_guard lock(storage_and_responses_lock);
return storage->getNextZXID();
}
KeeperStorage::Digest KeeperStateMachine::getNodesDigest() const
{
std::lock_guard lock(storage_and_responses_lock);
return storage->getNodesDigest(false);
}
uint64_t KeeperStateMachine::getLastProcessedZxid() const
{
std::lock_guard lock(storage_and_responses_lock);

View File

@ -1,11 +1,11 @@
#pragma once
#include <Common/ConcurrentBoundedQueue.h>
#include <Coordination/KeeperStorage.h>
#include <libnuraft/nuraft.hxx>
#include <Common/logger_useful.h>
#include <Coordination/CoordinationSettings.h>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStorage.h>
#include <libnuraft/nuraft.hxx>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/logger_useful.h>
namespace DB
@ -20,23 +20,28 @@ class KeeperStateMachine : public nuraft::state_machine
{
public:
KeeperStateMachine(
ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_ = "");
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const std::string & snapshots_path_,
const CoordinationSettingsPtr & coordination_settings_,
const std::string & superdigest_ = "",
bool digest_enabled_ = true);
/// Read state from the latest snapshot
void init();
/// Currently not supported
nuraft::ptr<nuraft::buffer> pre_commit(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override { return nullptr; }
static KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data);
void preprocess(const KeeperStorage::RequestForSession & request_for_session);
nuraft::ptr<nuraft::buffer> pre_commit(uint64_t log_idx, nuraft::buffer & data) override;
nuraft::ptr<nuraft::buffer> commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT
/// Save new cluster config to our snapshot (copy of the config stored in StateManager)
void commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf) override; /// NOLINT
/// Currently not supported
void rollback(const uint64_t /*log_idx*/, nuraft::buffer & /*data*/) override {}
void rollback(uint64_t log_idx, nuraft::buffer & data) override;
uint64_t last_commit_index() override { return last_committed_idx; }
@ -46,32 +51,18 @@ public:
nuraft::ptr<nuraft::snapshot> last_snapshot() override;
/// Create new snapshot from current state.
void create_snapshot(
nuraft::snapshot & s,
nuraft::async_result<bool>::handler_type & when_done) override;
void create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done) override;
/// Save snapshot which was sent by the leader to us. After that we will apply it in apply_snapshot.
void save_logical_snp_obj(
nuraft::snapshot & s,
uint64_t & obj_id,
nuraft::buffer & data,
bool is_first_obj,
bool is_last_obj) override;
void save_logical_snp_obj(nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool is_first_obj, bool is_last_obj) override;
/// Better name is `serialize snapshot` -- save existing snapshot (created by create_snapshot) into
/// in-memory buffer data_out.
int read_logical_snp_obj(
nuraft::snapshot & s,
void* & user_snp_ctx,
uint64_t obj_id,
nuraft::ptr<nuraft::buffer> & data_out,
bool & is_last_obj) override;
nuraft::snapshot & s, void *& user_snp_ctx, uint64_t obj_id, nuraft::ptr<nuraft::buffer> & data_out, bool & is_last_obj) override;
/// just for test
KeeperStorage & getStorage()
{
return *storage;
}
KeeperStorage & getStorage() { return *storage; }
void shutdownStorage();
@ -82,6 +73,10 @@ public:
std::vector<int64_t> getDeadSessions();
int64_t getNextZxid() const;
KeeperStorage::Digest getNodesDigest() const;
/// Introspection functions for 4lw commands
uint64_t getLastProcessedZxid() const;
@ -101,7 +96,6 @@ public:
uint64_t getLatestSnapshotBufSize() const;
private:
/// In our state machine we always have a single snapshot which is stored
/// in memory in compressed (serialized) format.
SnapshotMetadataPtr latest_snapshot_meta = nullptr;
@ -145,6 +139,8 @@ private:
/// Special part of ACL system -- superdigest specified in server config.
const std::string superdigest;
const bool digest_enabled;
};
}

File diff suppressed because it is too large.


@ -1,14 +1,14 @@
#pragma once
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Coordination/SessionExpiryQueue.h>
#include <Coordination/ACLMap.h>
#include <Coordination/SnapshotableHashTable.h>
#include <IO/WriteBufferFromString.h>
#include <unordered_map>
#include <vector>
#include <Coordination/ACLMap.h>
#include <Coordination/SessionExpiryQueue.h>
#include <Coordination/SnapshotableHashTable.h>
#include <IO/WriteBufferFromString.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <absl/container/flat_hash_set.h>
@ -29,7 +29,6 @@ struct KeeperStorageSnapshot;
class KeeperStorage
{
public:
struct Node
{
uint64_t acl_id = 0; /// 0 -- no ACL by default
@ -41,31 +40,43 @@ public:
Node() : size_bytes(sizeof(Node)) { }
/// Object memory size
uint64_t sizeInBytes() const
{
return size_bytes;
}
uint64_t sizeInBytes() const { return size_bytes; }
void setData(String new_data);
const auto & getData() const noexcept
{
return data;
}
const auto & getData() const noexcept { return data; }
void addChild(StringRef child_path);
void removeChild(StringRef child_path);
const auto & getChildren() const noexcept
{
return children;
}
const auto & getChildren() const noexcept { return children; }
// Invalidate the calculated digest so it's recalculated again on the next
// getDigest call
void invalidateDigestCache() const;
// get the calculated digest of the node
UInt64 getDigest(std::string_view path) const;
// copy only necessary information for preprocessing and digest calculation
// (e.g. we don't need to copy list of children)
void shallowCopy(const Node & other);
private:
String data;
ChildrenSet children{};
mutable std::optional<UInt64> cached_digest;
};
enum DigestVersion : uint8_t
{
NO_DIGEST = 0,
V0 = 1
};
static constexpr auto CURRENT_DIGEST_VERSION = DigestVersion::V0;
struct ResponseForSession
{
int64_t session_id;
@ -73,11 +84,30 @@ public:
};
using ResponsesForSessions = std::vector<ResponseForSession>;
struct Digest
{
DigestVersion version{DigestVersion::NO_DIGEST};
uint64_t value{0};
};
static bool checkDigest(const Digest & first, const Digest & second)
{
if (first.version != second.version)
return true;
if (first.version == DigestVersion::NO_DIGEST)
return true;
return first.value == second.value;
}
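// Illustrative expectations for checkDigest (a hedged sketch in comment form;
// the digest values are hypothetical, not taken from real snapshots):
// - digests with different versions are incomparable, so the check passes:
//     checkDigest({DigestVersion::NO_DIGEST, 1}, {DigestVersion::V0, 2}) == true
// - NO_DIGEST on both sides means digests are disabled, so the check passes:
//     checkDigest({DigestVersion::NO_DIGEST, 1}, {DigestVersion::NO_DIGEST, 2}) == true
// - otherwise the raw values must match:
//     checkDigest({DigestVersion::V0, 1}, {DigestVersion::V0, 2}) == false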
struct RequestForSession
{
int64_t session_id;
int64_t time;
Coordination::ZooKeeperRequestPtr request;
int64_t zxid{0};
std::optional<Digest> digest;
};
struct AuthID
@ -85,10 +115,7 @@ public:
std::string scheme;
std::string id;
bool operator==(const AuthID & other) const
{
return scheme == other.scheme && id == other.id;
}
bool operator==(const AuthID & other) const { return scheme == other.scheme && id == other.id; }
};
using RequestsForSessions = std::vector<RequestForSession>;
@ -112,6 +139,150 @@ public:
/// container.
Container container;
// Applying a ZooKeeper request to storage consists of two steps:
// - preprocessing which, instead of applying the changes directly to storage,
// generates deltas with those changes, denoted with the request ZXID
// - processing which applies deltas with the correct ZXID to the storage
//
// Delta objects allow us to do two things:
// - fetch the latest, uncommitted state of an object by getting the committed
// state of that same object from the storage and applying the deltas
// in the same order as they are defined
// - quickly commit the changes to the storage
// (an illustrative sketch of the first point follows the Delta definition below)
struct CreateNodeDelta
{
Coordination::Stat stat;
bool is_sequental;
Coordination::ACLs acls;
String data;
};
struct RemoveNodeDelta
{
int32_t version{-1};
int64_t ephemeral_owner{0};
};
struct UpdateNodeDelta
{
std::function<void(Node &)> update_fn;
int32_t version{-1};
};
struct SetACLDelta
{
Coordination::ACLs acls;
int32_t version{-1};
};
struct ErrorDelta
{
Coordination::Error error;
};
struct FailedMultiDelta
{
std::vector<Coordination::Error> error_codes;
};
// Denotes the end of a subrequest in a multi request
struct SubDeltaEnd
{
};
struct AddAuthDelta
{
int64_t session_id;
AuthID auth_id;
};
using Operation = std::
variant<CreateNodeDelta, RemoveNodeDelta, UpdateNodeDelta, SetACLDelta, AddAuthDelta, ErrorDelta, SubDeltaEnd, FailedMultiDelta>;
struct Delta
{
Delta(String path_, int64_t zxid_, Operation operation_) : path(std::move(path_)), zxid(zxid_), operation(std::move(operation_)) { }
Delta(int64_t zxid_, Coordination::Error error) : Delta("", zxid_, ErrorDelta{error}) { }
Delta(int64_t zxid_, Operation subdelta) : Delta("", zxid_, subdelta) { }
String path;
int64_t zxid;
Operation operation;
};
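// A minimal, self-contained sketch of the delta-resolution idea described
// above. The Simple* types and resolveUncommitted are simplified stand-ins
// invented for illustration, not the real KeeperStorage classes: the latest
// uncommitted state of a node is its committed state with every matching
// delta applied in order.
#include <cstdint>
#include <functional>
#include <map>
#include <optional>
#include <string>
#include <variant>
#include <vector>
struct SimpleNode { std::string data; };
struct SimpleCreate { std::string data; };
struct SimpleRemove { };
struct SimpleUpdate { std::function<void(SimpleNode &)> update_fn; };
using SimpleOp = std::variant<SimpleCreate, SimpleRemove, SimpleUpdate>;
struct SimpleDelta { std::string path; int64_t zxid; SimpleOp op; };
std::optional<SimpleNode> resolveUncommitted(
    const std::map<std::string, SimpleNode> & committed,
    const std::vector<SimpleDelta> & deltas,
    const std::string & path)
{
    /// Start from the committed state, if the node exists there at all.
    std::optional<SimpleNode> node;
    if (auto it = committed.find(path); it != committed.end())
        node = it->second;
    /// Replay the uncommitted deltas for this path, oldest first.
    for (const auto & delta : deltas)
    {
        if (delta.path != path)
            continue;
        if (const auto * create = std::get_if<SimpleCreate>(&delta.op))
            node = SimpleNode{create->data};
        else if (std::get_if<SimpleRemove>(&delta.op))
            node.reset();
        else if (const auto * update = std::get_if<SimpleUpdate>(&delta.op); update && node)
            update->update_fn(*node);
    }
    return node;
}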
struct UncommittedState
{
explicit UncommittedState(KeeperStorage & storage_) : storage(storage_) { }
void addDeltas(std::vector<Delta> new_deltas);
void commit(int64_t commit_zxid);
void rollback(int64_t rollback_zxid);
std::shared_ptr<Node> getNode(StringRef path) const;
Coordination::ACLs getACLs(StringRef path) const;
void applyDelta(const Delta & delta);
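// For local (read-only) requests only the committed session auths below are
// consulted; otherwise the uncommitted AddAuthDelta entries of the session
// are checked as well.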
bool hasACL(int64_t session_id, bool is_local, std::function<bool(const AuthID &)> predicate)
{
for (const auto & session_auth : storage.session_and_auth[session_id])
{
if (predicate(session_auth))
return true;
}
if (is_local)
return false;
for (const auto & delta : deltas)
{
if (const auto * auth_delta = std::get_if<KeeperStorage::AddAuthDelta>(&delta.operation);
auth_delta && auth_delta->session_id == session_id && predicate(auth_delta->auth_id))
return true;
}
return false;
}
std::shared_ptr<Node> tryGetNodeFromStorage(StringRef path) const;
struct UncommittedNode
{
std::shared_ptr<Node> node{nullptr};
Coordination::ACLs acls{};
int64_t zxid{0};
};
mutable std::unordered_map<std::string, UncommittedNode> nodes;
std::list<Delta> deltas;
KeeperStorage & storage;
};
UncommittedState uncommitted_state{*this};
Coordination::Error commit(int64_t zxid);
// Create node in the storage
// Returns false if it failed to create the node, true otherwise
// We don't care about the exact failure because we should've caught it during preprocessing
bool createNode(
const std::string & path,
String data,
const Coordination::Stat & stat,
bool is_sequental,
Coordination::ACLs node_acls);
// Remove node in the storage
// Returns false if it failed to remove the node, true otherwise
// We don't care about the exact failure because we should've caught it during preprocessing
bool removeNode(const std::string & path, int32_t version);
bool checkACL(StringRef path, int32_t permissions, int64_t session_id, bool is_local);
void unregisterEphemeralPath(int64_t session_id, const std::string & path);
/// Mapping session_id -> set of ephemeral nodes paths
Ephemerals ephemerals;
/// Mapping session_id -> set of watched nodes paths
@ -126,23 +297,50 @@ public:
/// Global id of all requests applied to storage
int64_t zxid{0};
// an older Keeper node (pre V5 snapshots) can create snapshots and receive logs from newer Keeper nodes
// this can lead to some inconsistencies, e.g. from a snapshot it will use log_idx as the zxid
// while the log will have a smaller zxid because it was generated by the newer nodes
// we save the value loaded from the snapshot to know when it is okay to have
// a smaller zxid in newer requests
struct TransactionInfo
{
int64_t zxid;
Digest nodes_digest;
};
std::deque<TransactionInfo> uncommitted_transactions;
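// One entry per preprocessed-but-not-yet-committed transaction, recording its
// zxid and the nodes digest expected once it is applied.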
uint64_t nodes_digest{0};
bool finalized{false};
/// Currently active watches (node_path -> subscribed sessions)
Watches watches;
Watches list_watches; /// Watches for 'list' request (watches on children).
Watches list_watches; /// Watches for 'list' request (watches on children).
void clearDeadWatches(int64_t session_id);
/// Get current zxid
int64_t getZXID() const
/// Get current committed zxid
int64_t getZXID() const { return zxid; }
int64_t getNextZXID() const
{
return zxid;
if (uncommitted_transactions.empty())
return zxid + 1;
return uncommitted_transactions.back().zxid + 1;
}
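// i.e. the next zxid continues from the newest uncommitted transaction when
// one exists, and from the committed zxid otherwise.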
Digest getNodesDigest(bool committed) const;
const bool digest_enabled;
const String superdigest;
KeeperStorage(int64_t tick_time_ms, const String & superdigest_);
KeeperStorage(int64_t tick_time_ms, const String & superdigest_, bool digest_enabled_);
/// Allocate new session id with the specified timeouts
int64_t getSessionID(int64_t session_timeout_ms)
@ -160,85 +358,69 @@ public:
session_expiry_queue.addNewSessionOrUpdate(session_id, session_timeout_ms);
}
UInt64 calculateNodesDigest(UInt64 current_digest, const std::vector<Delta> & new_deltas) const;
/// Process user request and return response.
/// check_acl = false only when converting data from ZooKeeper.
ResponsesForSessions processRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id, int64_t time, std::optional<int64_t> new_last_zxid, bool check_acl = true);
ResponsesForSessions processRequest(
const Coordination::ZooKeeperRequestPtr & request,
int64_t session_id,
std::optional<int64_t> new_last_zxid,
bool check_acl = true,
bool is_local = false);
void preprocessRequest(
const Coordination::ZooKeeperRequestPtr & request,
int64_t session_id,
int64_t time,
int64_t new_last_zxid,
bool check_acl = true,
std::optional<Digest> digest = std::nullopt);
void rollbackRequest(int64_t rollback_zxid);
void finalize();
/// Set of methods for creating snapshots
/// Turn on snapshot mode, so data inside Container is not deleted, but replaced with new version.
void enableSnapshotMode(size_t up_to_version)
{
container.enableSnapshotMode(up_to_version);
}
void enableSnapshotMode(size_t up_to_version) { container.enableSnapshotMode(up_to_version); }
/// Turn off snapshot mode.
void disableSnapshotMode()
{
container.disableSnapshotMode();
}
void disableSnapshotMode() { container.disableSnapshotMode(); }
Container::const_iterator getSnapshotIteratorBegin() const
{
return container.begin();
}
Container::const_iterator getSnapshotIteratorBegin() const { return container.begin(); }
/// Clear outdated data from internal container.
void clearGarbageAfterSnapshot()
{
container.clearOutdatedNodes();
}
void clearGarbageAfterSnapshot() { container.clearOutdatedNodes(); }
/// Get all active sessions
const SessionAndTimeout & getActiveSessions() const
{
return session_and_timeout;
}
const SessionAndTimeout & getActiveSessions() const { return session_and_timeout; }
/// Get all dead sessions
std::vector<int64_t> getDeadSessions() const
{
return session_expiry_queue.getExpiredSessions();
}
std::vector<int64_t> getDeadSessions() const { return session_expiry_queue.getExpiredSessions(); }
/// Introspection functions mostly used in 4-letter commands
uint64_t getNodesCount() const
{
return container.size();
}
uint64_t getNodesCount() const { return container.size(); }
uint64_t getApproximateDataSize() const
{
return container.getApproximateDataSize();
}
uint64_t getArenaDataSize() const
{
return container.keyArenaSize();
}
uint64_t getApproximateDataSize() const { return container.getApproximateDataSize(); }
uint64_t getArenaDataSize() const { return container.keyArenaSize(); }
uint64_t getTotalWatchesCount() const;
uint64_t getWatchedPathsCount() const
{
return watches.size() + list_watches.size();
}
uint64_t getWatchedPathsCount() const { return watches.size() + list_watches.size(); }
uint64_t getSessionsWithWatchesCount() const;
uint64_t getSessionWithEphemeralNodesCount() const
{
return ephemerals.size();
}
uint64_t getSessionWithEphemeralNodesCount() const { return ephemerals.size(); }
uint64_t getTotalEphemeralNodesCount() const;
void dumpWatches(WriteBufferFromOwnString & buf) const;
void dumpWatchesByPath(WriteBufferFromOwnString & buf) const;
void dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const;
private:
void removeDigest(const Node & node, std::string_view path);
void addDigest(const Node & node, std::string_view path);
};
using KeeperStoragePtr = std::unique_ptr<KeeperStorage>;


@ -12,7 +12,6 @@ public:
WriteBufferFromNuraftBuffer();
nuraft::ptr<nuraft::buffer> getBuffer();
bool isFinished() const { return finalized; }
~WriteBufferFromNuraftBuffer() override;


@ -520,7 +520,8 @@ bool deserializeTxn(KeeperStorage & storage, ReadBuffer & in, Poco::Logger * /*l
if (request->getOpNum() == Coordination::OpNum::Multi && hasErrorsInMultiRequest(request))
return true;
storage.processRequest(request, session_id, time, zxid, /* check_acl = */ false);
storage.preprocessRequest(request, session_id, time, zxid, /* check_acl = */ false);
storage.processRequest(request, session_id, zxid, /* check_acl = */ false);
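// With the preprocess/process split, every deserialized request is first
// preprocessed (generating deltas for the given zxid) and then processed
// (committing those deltas).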
}
}


@ -1,6 +1,8 @@
#include <chrono>
#include <gtest/gtest.h>
#include "Common/ZooKeeper/IKeeper.h"
#include "Coordination/KeeperStorage.h"
#include "config_core.h"
#if USE_NURAFT
@ -1031,24 +1033,24 @@ TEST_P(CoordinationTest, SnapshotableHashMapDataSize)
world.disableSnapshotMode();
world.insert("world", n1);
EXPECT_EQ(world.getApproximateDataSize(), 177);
EXPECT_EQ(world.getApproximateDataSize(), 193);
world.updateValue("world", [&](Node & value) { value = n2; });
EXPECT_EQ(world.getApproximateDataSize(), 195);
EXPECT_EQ(world.getApproximateDataSize(), 211);
world.erase("world");
EXPECT_EQ(world.getApproximateDataSize(), 0);
world.enableSnapshotMode(100000);
world.insert("world", n1);
EXPECT_EQ(world.getApproximateDataSize(), 177);
EXPECT_EQ(world.getApproximateDataSize(), 193);
world.updateValue("world", [&](Node & value) { value = n2; });
EXPECT_EQ(world.getApproximateDataSize(), 372);
EXPECT_EQ(world.getApproximateDataSize(), 404);
world.clearOutdatedNodes();
EXPECT_EQ(world.getApproximateDataSize(), 195);
EXPECT_EQ(world.getApproximateDataSize(), 211);
world.erase("world");
EXPECT_EQ(world.getApproximateDataSize(), 195);
EXPECT_EQ(world.getApproximateDataSize(), 211);
world.clear();
EXPECT_EQ(world.getApproximateDataSize(), 0);
@ -1069,7 +1071,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotSimple)
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
addNode(storage, "/hello", "world", 1);
addNode(storage, "/hello/somepath", "somedata", 3);
storage.session_id_counter = 5;
@ -1117,7 +1119,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMoreWrites)
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
storage.getSessionID(130);
for (size_t i = 0; i < 50; ++i)
@ -1158,7 +1160,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotManySnapshots)
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
storage.getSessionID(130);
for (size_t j = 1; j <= 5; ++j)
@ -1196,7 +1198,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotMode)
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1249,7 +1251,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken)
auto params = GetParam();
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
for (size_t i = 0; i < 50; ++i)
{
addNode(storage, "/hello_" + std::to_string(i), "world_" + std::to_string(i));
@ -1269,7 +1271,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotBroken)
EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception);
}
nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, int64_t zxid, const Coordination::ZooKeeperRequestPtr & request)
{
DB::WriteBufferFromNuraftBuffer buf;
DB::writeIntBinary(session_id, buf);
@ -1277,12 +1279,14 @@ nuraft::ptr<nuraft::buffer> getBufferFromZKRequest(int64_t session_id, const Coo
using namespace std::chrono;
auto time = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
DB::writeIntBinary(time, buf);
DB::writeIntBinary(zxid, buf);
DB::writeIntBinary(DB::KeeperStorage::DigestVersion::NO_DIGEST, buf);
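// The serialized entry now also carries the request zxid and a digest
// version (NO_DIGEST here, i.e. no digest check is requested for the entry).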
return buf.getBuffer();
}
nuraft::ptr<nuraft::log_entry> getLogEntryFromZKRequest(size_t term, int64_t session_id, const Coordination::ZooKeeperRequestPtr & request)
nuraft::ptr<nuraft::log_entry> getLogEntryFromZKRequest(size_t term, int64_t session_id, int64_t zxid, const Coordination::ZooKeeperRequestPtr & request)
{
auto buffer = getBufferFromZKRequest(session_id, request);
auto buffer = getBufferFromZKRequest(session_id, zxid, request);
return nuraft::cs_new<nuraft::log_entry>(term, buffer);
}
@ -1304,10 +1308,11 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
state_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
state_machine->commit(i, changelog.entry_at(i)->get_buf());
bool snapshot_created = false;
if (i % settings->snapshot_distance == 0)
@ -1352,6 +1357,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint
for (size_t i = restore_machine->last_commit_index() + 1; i < restore_changelog.next_slot(); ++i)
{
restore_machine->pre_commit(i, changelog.entry_at(i)->get_buf());
restore_machine->commit(i, changelog.entry_at(i)->get_buf());
}
@ -1453,7 +1459,8 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
request_c->path = "/hello";
request_c->is_ephemeral = true;
auto entry_c = getLogEntryFromZKRequest(0, 1, request_c);
auto entry_c = getLogEntryFromZKRequest(0, 1, state_machine->getNextZxid(), request_c);
state_machine->pre_commit(1, entry_c->get_buf());
state_machine->commit(1, entry_c->get_buf());
const auto & storage = state_machine->getStorage();
@ -1461,7 +1468,8 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
std::shared_ptr<ZooKeeperRemoveRequest> request_d = std::make_shared<ZooKeeperRemoveRequest>();
request_d->path = "/hello";
/// Delete from other session
auto entry_d = getLogEntryFromZKRequest(0, 2, request_d);
auto entry_d = getLogEntryFromZKRequest(0, 2, state_machine->getNextZxid(), request_d);
state_machine->pre_commit(2, entry_d->get_buf());
state_machine->commit(2, entry_d->get_buf());
EXPECT_EQ(storage.ephemerals.size(), 0);
@ -1481,7 +1489,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
}
@ -1496,7 +1504,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(100 + i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog_1.append(entry);
changelog_1.end_of_append_batch(0, 0);
}
@ -1511,7 +1519,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(200 + i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog_2.append(entry);
changelog_2.end_of_append_batch(0, 0);
}
@ -1531,7 +1539,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(300 + i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog_3.append(entry);
changelog_3.end_of_append_batch(0, 0);
}
@ -1578,7 +1586,7 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
}
@ -1590,7 +1598,7 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog1.append(entry);
changelog1.end_of_append_batch(0, 0);
}
@ -1601,7 +1609,7 @@ TEST_P(CoordinationTest, TestCompressedLogsMultipleRewrite)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog2.append(entry);
changelog2.end_of_append_batch(0, 0);
}
@ -1614,7 +1622,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotDifferentCompressions)
ChangelogDirTest test("./snapshots");
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
addNode(storage, "/hello", "world", 1);
addNode(storage, "/hello/somepath", "somedata", 3);
storage.session_id_counter = 5;
@ -1766,7 +1774,7 @@ TEST_P(CoordinationTest, TestStorageSnapshotEqual)
{
DB::KeeperSnapshotManager manager("./snapshots", 3, params.enable_compression);
DB::KeeperStorage storage(500, "");
DB::KeeperStorage storage(500, "", true);
for (size_t j = 0; j < 5000; ++j)
{
addNode(storage, "/hello_" + std::to_string(j), "world", 1);
@ -1810,7 +1818,7 @@ TEST_P(CoordinationTest, TestLogGap)
{
std::shared_ptr<ZooKeeperCreateRequest> request = std::make_shared<ZooKeeperCreateRequest>();
request->path = "/hello_" + std::to_string(i);
auto entry = getLogEntryFromZKRequest(0, 1, request);
auto entry = getLogEntryFromZKRequest(0, 1, i, request);
changelog.append(entry);
changelog.end_of_append_batch(0, 0);
}
@ -1824,6 +1832,129 @@ TEST_P(CoordinationTest, TestLogGap)
EXPECT_EQ(changelog1.next_slot(), 61);
}
template <typename ResponseType>
ResponseType getSingleResponse(const auto & responses)
{
EXPECT_FALSE(responses.empty());
return dynamic_cast<ResponseType &>(*responses[0].response);
}
TEST_P(CoordinationTest, TestUncommittedStateBasicCrud)
{
using namespace DB;
using namespace Coordination;
DB::KeeperStorage storage{500, "", true};
constexpr std::string_view path = "/test";
const auto get_committed_data = [&]() -> std::optional<String>
{
auto request = std::make_shared<ZooKeeperGetRequest>();
request->path = path;
auto responses = storage.processRequest(request, 0, std::nullopt, true, true);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
if (get_response.error != Error::ZOK)
return std::nullopt;
return get_response.data;
};
const auto preprocess_get = [&](int64_t zxid)
{
auto get_request = std::make_shared<ZooKeeperGetRequest>();
get_request->path = path;
storage.preprocessRequest(get_request, 0, 0, zxid);
return get_request;
};
const auto create_request = std::make_shared<ZooKeeperCreateRequest>();
create_request->path = path;
create_request->data = "initial_data";
storage.preprocessRequest(create_request, 0, 0, 1);
storage.preprocessRequest(create_request, 0, 0, 2);
ASSERT_FALSE(get_committed_data());
const auto after_create_get = preprocess_get(3);
ASSERT_FALSE(get_committed_data());
const auto set_request = std::make_shared<ZooKeeperSetRequest>();
set_request->path = path;
set_request->data = "new_data";
storage.preprocessRequest(set_request, 0, 0, 4);
const auto after_set_get = preprocess_get(5);
ASSERT_FALSE(get_committed_data());
const auto remove_request = std::make_shared<ZooKeeperRemoveRequest>();
remove_request->path = path;
storage.preprocessRequest(remove_request, 0, 0, 6);
storage.preprocessRequest(remove_request, 0, 0, 7);
const auto after_remove_get = preprocess_get(8);
ASSERT_FALSE(get_committed_data());
{
const auto responses = storage.processRequest(create_request, 0, 1);
const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
const auto responses = storage.processRequest(create_request, 0, 2);
const auto & create_response = getSingleResponse<ZooKeeperCreateResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZNODEEXISTS);
}
{
const auto responses = storage.processRequest(after_create_get, 0, 3);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZOK);
ASSERT_EQ(get_response.data, "initial_data");
}
ASSERT_EQ(get_committed_data(), "initial_data");
{
const auto responses = storage.processRequest(set_request, 0, 4);
const auto & create_response = getSingleResponse<ZooKeeperSetResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
const auto responses = storage.processRequest(after_set_get, 0, 5);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZOK);
ASSERT_EQ(get_response.data, "new_data");
}
ASSERT_EQ(get_committed_data(), "new_data");
{
const auto responses = storage.processRequest(remove_request, 0, 6);
const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZOK);
}
{
const auto responses = storage.processRequest(remove_request, 0, 7);
const auto & create_response = getSingleResponse<ZooKeeperRemoveResponse>(responses);
ASSERT_EQ(create_response.error, Error::ZNONODE);
}
{
const auto responses = storage.processRequest(after_remove_get, 0, 8);
const auto & get_response = getSingleResponse<ZooKeeperGetResponse>(responses);
ASSERT_EQ(get_response.error, Error::ZNONODE);
}
ASSERT_FALSE(get_committed_data());
}
INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
CoordinationTest,


@ -52,6 +52,8 @@
/// NOTE: DBMS_TCP_PROTOCOL_VERSION has nothing in common with VERSION_REVISION,
/// the latter is just a number for server version (one number instead of commit SHA)
/// for simplicity (sometimes it may be more convenient in some use cases).
#define DBMS_TCP_PROTOCOL_VERSION 54455
#define DBMS_TCP_PROTOCOL_VERSION 54456
#define DBMS_MIN_PROTOCOL_VERSION_WITH_INITIAL_QUERY_START_TIME 54449
#define DBMS_MIN_PROTOCOL_VERSION_WITH_PROFILE_EVENTS_IN_INSERT 54456


@ -183,6 +183,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
\
M(Int64, network_zstd_compression_level, 1, "Allows you to select the level of ZSTD compression.", 0) \
\
M(Int64, zstd_window_log_max, 0, "Allows you to select the max window log of ZSTD (it will not be used for MergeTree family)", 0) \
\
M(UInt64, priority, 0, "Priority of the query. 1 - the highest, higher value - lower priority; 0 - do not use priorities.", 0) \
M(Int64, os_thread_priority, 0, "If non zero - set corresponding 'nice' value for query processing threads. Can be used to adjust query priority for OS scheduler.", 0) \
\
@ -481,7 +483,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Seconds, lock_acquire_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "How long locking request should wait before failing", 0) \
M(Bool, materialize_ttl_after_modify, true, "Apply TTL for old data, after ALTER MODIFY TTL query", 0) \
M(String, function_implementation, "", "Choose function implementation for specific target or variant (experimental). If empty enable all of them.", 0) \
M(Bool, allow_experimental_geo_types, false, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \
M(Bool, allow_experimental_geo_types, true, "Allow geo data types such as Point, Ring, Polygon, MultiPolygon", 0) \
M(Bool, data_type_default_nullable, false, "Data types without NULL or NOT NULL will make Nullable", 0) \
M(Bool, cast_keep_nullable, false, "CAST operator keep Nullable for result data type", 0) \
M(Bool, cast_ipv4_ipv6_default_on_conversion_error, false, "CAST operator into IPv4, CAST operator into IPV6 type, toIPv4, toIPv6 functions will return default value instead of throwing exception on conversion error.", 0) \
@ -494,8 +496,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \
M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \
M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \
M(Bool, optimize_syntax_fuse_functions, false, "Allow apply syntax optimisation: fuse aggregate functions", 0) \
M(Bool, optimize_fuse_sum_count_avg, false, "Fuse functions `sum, avg, count` with identical arguments into one `sumCount` (`optimize_syntax_fuse_functions should be enabled)", 0) \
M(Bool, optimize_syntax_fuse_functions, false, "Not ready for production, do not use. Allow apply syntax optimisation: fuse aggregate functions", 0) \
M(Bool, optimize_fuse_sum_count_avg, false, "Not ready for production, do not use. Fuse functions `sum, avg, count` with identical arguments into one `sumCount` (`optimize_syntax_fuse_functions should be enabled)", 0) \
M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \
M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \
M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \
@ -550,7 +552,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(UInt64, function_range_max_elements_in_block, 500000000, "Maximum number of values generated by function 'range' per block of data (sum of array sizes for every row in a block, see also 'max_block_size' and 'min_insert_block_size_rows'). It is a safety threshold.", 0) \
M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. Possible values: 'enable' - use short-circuit function evaluation for functions that are suitable for it, 'disable' - disable short-circuit function evaluation, 'force_enable' - use short-circuit function evaluation for all functions.", 0) \
\
M(String, local_filesystem_read_method, "pread", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \
M(String, local_filesystem_read_method, "pread_threadpool", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \
M(String, remote_filesystem_read_method, "threadpool", "Method of reading data from remote filesystem, one of: read, threadpool.", 0) \
M(Bool, local_filesystem_read_prefetch, false, "Should use prefetching when reading data from local filesystem.", 0) \
M(Bool, remote_filesystem_read_prefetch, true, "Should use prefetching when reading data from remote filesystem.", 0) \
@ -600,6 +602,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \
M(Bool, throw_if_no_data_to_insert, true, "Enables or disables empty INSERTs, enabled by default", 0) \
M(Bool, compatibility_ignore_auto_increment_in_create_table, false, "Ignore AUTO_INCREMENT keyword in column declaration if true, otherwise return error. It simplifies migration from MySQL", 0) \
M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.


@ -82,7 +82,7 @@ static Poco::Logger * getLogger()
return &logger;
}
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attemps)
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attempts)
{
static std::unordered_map<UInt128, UInt64, UInt128Hash> counter;
static std::mutex mutex;
@ -109,7 +109,7 @@ void compileSortDescriptionIfNeeded(SortDescription & description, const DataTyp
UInt64 & current_counter = counter[sort_description_hash_key];
if (current_counter < description.min_count_to_compile_sort_description)
{
current_counter += static_cast<UInt64>(increase_compile_attemps);
current_counter += static_cast<UInt64>(increase_compile_attempts);
return;
}
}
@ -142,11 +142,11 @@ void compileSortDescriptionIfNeeded(SortDescription & description, const DataTyp
#else
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attemps)
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attempts)
{
(void)(description);
(void)(sort_description_types);
(void)(increase_compile_attemps);
(void)(increase_compile_attempts);
}
#endif
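// A minimal sketch of the count-gated compilation pattern used above,
// assuming hypothetical names (shouldCompile, min_count): an entity is
// compiled only after identical requests have been seen at least min_count
// times. Simplified, not the real JIT pipeline.
#include <cstdint>
#include <mutex>
#include <string>
#include <unordered_map>
bool shouldCompile(const std::string & key, uint64_t min_count, bool increase_compile_attempts)
{
    /// Shared across calls, hence the mutex.
    static std::unordered_map<std::string, uint64_t> counter;
    static std::mutex mutex;
    std::lock_guard lock(mutex);
    uint64_t & attempts = counter[key];
    if (attempts < min_count)
    {
        /// Callers may pass false to probe without counting an attempt.
        attempts += static_cast<uint64_t>(increase_compile_attempts);
        return false;
    }
    return true;
}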


@ -107,7 +107,7 @@ public:
/** Compile sort description for header_types.
* Description is compiled only if the number of attempts to compile an identical description exceeds min_count_to_compile_sort_description.
*/
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attemps);
void compileSortDescriptionIfNeeded(SortDescription & description, const DataTypes & sort_description_types, bool increase_compile_attempts);
/// Outputs user-readable description into `out`.
void dumpSortDescription(const SortDescription & description, WriteBuffer & out);


@ -185,8 +185,11 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
}
prefetch_buffer.swap(memory);
/// Adjust the working buffer so that it ignores `offset` bytes.
setWithBytesToIgnore(memory.data(), size, offset);
internal_buffer = Buffer(memory.data(), memory.data() + memory.size());
working_buffer = Buffer(memory.data() + offset, memory.data() + size);
pos = working_buffer.begin();
}
else
{
@ -202,7 +205,9 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
if (size)
{
/// Adjust the working buffer so that it ignores `offset` bytes.
setWithBytesToIgnore(memory.data(), size, offset);
internal_buffer = Buffer(memory.data(), memory.data() + memory.size());
working_buffer = Buffer(memory.data() + offset, memory.data() + size);
pos = working_buffer.begin();
}
}
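// A minimal sketch of the buffer adjustment above. Buffer and BufferWindow
// are simplified stand-ins (the real class hierarchy uses BufferBase and its
// internal Buffer type): the internal buffer spans the whole allocation,
// while the working buffer exposes only the valid bytes after skipping
// `offset`.
#include <cstddef>
struct Buffer
{
    char * begin_pos;
    char * end_pos;
    Buffer(char * begin_, char * end_) : begin_pos(begin_), end_pos(end_) { }
};
struct BufferWindow
{
    Buffer internal_buffer{nullptr, nullptr};
    Buffer working_buffer{nullptr, nullptr};
    char * pos = nullptr;
    void adjust(char * data, size_t capacity, size_t size, size_t offset)
    {
        internal_buffer = Buffer(data, data + capacity);      /// whole allocation
        working_buffer = Buffer(data + offset, data + size);  /// valid payload only
        pos = working_buffer.begin_pos;                       /// next byte to read
    }
};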

Some files were not shown because too many files have changed in this diff.