Merge branch 'master' into pp-system-unfreeze

This commit is contained in:
Antonio Andelic 2022-06-13 14:46:30 +00:00
commit b9cf6fe367
247 changed files with 5671 additions and 1305 deletions

View File

@ -13,9 +13,7 @@ max-statements=200
ignore-long-lines = (# )?<?https?://\S+>?$
[MESSAGES CONTROL]
disable = bad-continuation,
missing-docstring,
bad-whitespace,
disable = missing-docstring,
too-few-public-methods,
invalid-name,
too-many-arguments,

View File

@ -9,7 +9,7 @@ std::string errnoToString(int code, int the_errno)
char buf[buf_size];
#ifndef _GNU_SOURCE
int rc = strerror_r(the_errno, buf, buf_size);
#ifdef __APPLE__
#ifdef OS_DARWIN
if (rc != 0 && rc != EINVAL)
#else
if (rc != 0)
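For context on the two `strerror_r` variants the `#ifdef` above distinguishes, here is a minimal self-contained sketch of the same pattern (the buffer size and fallback message are illustrative, not taken from the file):

``` cpp
#include <cerrno>
#include <cstring>
#include <string>

/// XSI strerror_r returns an int error code and fills buf;
/// the GNU variant returns a char * that may or may not point into buf.
std::string describeErrno(int the_errno)
{
    char buf[128];
#if defined(_GNU_SOURCE)
    return strerror_r(the_errno, buf, sizeof(buf));
#else
    if (strerror_r(the_errno, buf, sizeof(buf)) != 0)
        return "Unknown error " + std::to_string(the_errno);
    return buf;
#endif
}
```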

View File

@ -16,7 +16,7 @@ uint64_t getAvailableMemoryAmountOrZero()
{
#if defined(_SC_PHYS_PAGES) // linux
return getPageSize() * sysconf(_SC_PHYS_PAGES);
#elif defined(__FreeBSD__)
#elif defined(OS_FREEBSD)
struct vmtotal vmt;
size_t vmt_size = sizeof(vmt);
if (sysctlbyname("vm.vmtotal", &vmt, &vmt_size, NULL, 0) == 0)

View File

@ -6,7 +6,7 @@
#include <base/defines.h>
#if defined(__linux__) && !defined(THREAD_SANITIZER) && !defined(USE_MUSL)
#if defined(OS_LINUX) && !defined(THREAD_SANITIZER) && !defined(USE_MUSL)
#define USE_PHDR_CACHE 1
#endif

View File

@ -705,3 +705,109 @@ target_compile_options(_crypto PRIVATE -Wno-gnu-anonymous-struct)
add_library(OpenSSL::Crypto ALIAS _crypto)
add_library(OpenSSL::SSL ALIAS _ssl)
# Helper function used in the populate_openssl_vars function below
function(from_hex HEX DEC)
string(TOUPPER "${HEX}" HEX)
set(_res 0)
string(LENGTH "${HEX}" _strlen)
while (_strlen GREATER 0)
math(EXPR _res "${_res} * 16")
string(SUBSTRING "${HEX}" 0 1 NIBBLE)
string(SUBSTRING "${HEX}" 1 -1 HEX)
if (NIBBLE STREQUAL "A")
math(EXPR _res "${_res} + 10")
elseif (NIBBLE STREQUAL "B")
math(EXPR _res "${_res} + 11")
elseif (NIBBLE STREQUAL "C")
math(EXPR _res "${_res} + 12")
elseif (NIBBLE STREQUAL "D")
math(EXPR _res "${_res} + 13")
elseif (NIBBLE STREQUAL "E")
math(EXPR _res "${_res} + 14")
elseif (NIBBLE STREQUAL "F")
math(EXPR _res "${_res} + 15")
else ()
math(EXPR _res "${_res} + ${NIBBLE}")
endif ()
string(LENGTH "${HEX}" _strlen)
endwhile ()
set(${DEC} ${_res} PARENT_SCOPE)
endfunction()
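# Illustrative usage of the helper above (not part of the original file):
# from_hex("2A" result) sets ${result} to 42 in the caller's scope via PARENT_SCOPE.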
# ClickHouse uses BoringSSL which is a fork of OpenSSL.
# This populates CMAKE var OPENSSL_VERSION from the OPENSSL_VERSION_NUMBER defined
# in contrib/boringssl/include/openssl/base.h. It also sets the CMAKE var OPENSSL_IS_BORING_SSL
# if it's defined in the file. Both OPENSSL_VERSION and OPENSSL_IS_BORING_SSL variables will be
# used to populate flags in the `system.build_options` table for more context on ssl version used.
# This cmake script is adapted from the FindOpenSSL cmake module and slightly modified for this use-case.
if (EXISTS "${BORINGSSL_SOURCE_DIR}/include/openssl/base.h")
file(STRINGS "${BORINGSSL_SOURCE_DIR}/include/openssl/base.h" openssl_version_str
REGEX "^#[\t ]*define[\t ]+OPENSSL_VERSION_NUMBER[\t ]+0x([0-9a-fA-F])+.*")
file(STRINGS "${BORINGSSL_SOURCE_DIR}/include/openssl/base.h" openssl_is_boringssl
REGEX "^#[\t ]*define[\t ]+OPENSSL_IS_BORINGSSL.*")
# Set to true if OPENSSL_IS_BORING_SSL is defined
if (openssl_is_boringssl)
set(OPENSSL_IS_BORING_SSL 1)
endif ()
# If openssl_version_str is defined extrapolate and set OPENSSL_VERSION
if (openssl_version_str)
# The version number is encoded as 0xMNNFFPPS: major minor fix patch status
# The status gives if this is a developer or prerelease and is ignored here.
# Major, minor, and fix directly translate into the version numbers shown in
# the string. The patch field translates to the single character suffix that
# indicates the bug fix state, which 00 -> nothing, 01 -> a, 02 -> b and so
# on.
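# For example, 0x1010107f decodes as major 1, minor 01, fix 01, patch 07 -> 'g',
# i.e. the version string "1.1.1g" (worked example added for illustration).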
string(REGEX REPLACE "^.*OPENSSL_VERSION_NUMBER[\t ]+0x([0-9a-fA-F])([0-9a-fA-F][0-9a-fA-F])([0-9a-fA-F][0-9a-fA-F])([0-9a-fA-F][0-9a-fA-F])([0-9a-fA-F]).*$"
"\\1;\\2;\\3;\\4;\\5" OPENSSL_VERSION_LIST "${openssl_version_str}")
list(GET OPENSSL_VERSION_LIST 0 OPENSSL_VERSION_MAJOR)
list(GET OPENSSL_VERSION_LIST 1 OPENSSL_VERSION_MINOR)
from_hex("${OPENSSL_VERSION_MINOR}" OPENSSL_VERSION_MINOR)
list(GET OPENSSL_VERSION_LIST 2 OPENSSL_VERSION_FIX)
from_hex("${OPENSSL_VERSION_FIX}" OPENSSL_VERSION_FIX)
list(GET OPENSSL_VERSION_LIST 3 OPENSSL_VERSION_PATCH)
if (NOT OPENSSL_VERSION_PATCH STREQUAL "00")
from_hex("${OPENSSL_VERSION_PATCH}" _tmp)
# 96 is the ASCII code of 'a' minus 1
math(EXPR OPENSSL_VERSION_PATCH_ASCII "${_tmp} + 96")
unset(_tmp)
# Once anyone knows how OpenSSL would call the patch versions beyond 'z'
# this should be updated to handle that, too. This has not happened yet
# so it is simply ignored here for now.
string(ASCII "${OPENSSL_VERSION_PATCH_ASCII}" OPENSSL_VERSION_PATCH_STRING)
endif ()
set(OPENSSL_VERSION "${OPENSSL_VERSION_MAJOR}.${OPENSSL_VERSION_MINOR}.${OPENSSL_VERSION_FIX}${OPENSSL_VERSION_PATCH_STRING}")
else ()
# Since OpenSSL 3.0.0, the new version format is MAJOR.MINOR.PATCH and
# a new OPENSSL_VERSION_STR macro contains exactly that
file(STRINGS "${BORINGSSL_SOURCE_DIR}/include/openssl/base.h" OPENSSL_VERSION_STR
REGEX "^#[\t ]*define[\t ]+OPENSSL_VERSION_STR[\t ]+\"([0-9])+\\.([0-9])+\\.([0-9])+\".*")
string(REGEX REPLACE "^.*OPENSSL_VERSION_STR[\t ]+\"([0-9]+\\.[0-9]+\\.[0-9]+)\".*$"
"\\1" OPENSSL_VERSION_STR "${OPENSSL_VERSION_STR}")
set(OPENSSL_VERSION "${OPENSSL_VERSION_STR}")
# Setting OPENSSL_VERSION_MAJOR OPENSSL_VERSION_MINOR and OPENSSL_VERSION_FIX
string(REGEX MATCHALL "([0-9])+" OPENSSL_VERSION_NUMBER "${OPENSSL_VERSION}")
list(POP_FRONT OPENSSL_VERSION_NUMBER
OPENSSL_VERSION_MAJOR
OPENSSL_VERSION_MINOR
OPENSSL_VERSION_FIX)
unset(OPENSSL_VERSION_NUMBER)
unset(OPENSSL_VERSION_STR)
endif ()
endif ()
# Set CMAKE variables so that they can be referenced properly from everywhere
set(OPENSSL_VERSION "${OPENSSL_VERSION}" CACHE INTERNAL "")
set(OPENSSL_IS_BORING_SSL "${OPENSSL_IS_BORING_SSL}" CACHE INTERNAL 0)

View File

@ -1,15 +1,16 @@
# rebuild in #36968
# docker build -t clickhouse/docs-builder .
# nodejs 17 prefers ipv6 and is broken in our environment
FROM node:16.14.2-alpine3.15
FROM node:16-alpine
RUN apk add --no-cache git openssh bash
# TODO: clean before merge!
ARG DOCS_BRANCH=main
# At this point we want to really update /opt/clickhouse-docs
# despite the cached images
ARG CACHE_INVALIDATOR=0
RUN git clone https://github.com/ClickHouse/clickhouse-docs.git \
--depth=1 --branch=${DOCS_BRANCH} /opt/clickhouse-docs
--depth=1 --branch=main /opt/clickhouse-docs
WORKDIR /opt/clickhouse-docs

View File

@ -8,8 +8,6 @@ if [ "$GIT_DOCS_BRANCH" ] && ! [ "$GIT_DOCS_BRANCH" == "$GIT_BRANCH" ]; then
git fetch origin --depth=1 -- "$GIT_DOCS_BRANCH:$GIT_DOCS_BRANCH"
git checkout "$GIT_DOCS_BRANCH"
else
# Untracked yarn.lock could cause pull to fail
git clean -fdx
# Update docs repo
git pull
fi

View File

@ -42,6 +42,7 @@ DATA_DIR="${CLICKHOUSE_DATA_DIR:-/var/lib/clickhouse}"
LOG_DIR="${LOG_DIR:-/var/log/clickhouse-keeper}"
LOG_PATH="${LOG_DIR}/clickhouse-keeper.log"
ERROR_LOG_PATH="${LOG_DIR}/clickhouse-keeper.err.log"
COORDINATION_DIR="${DATA_DIR}/coordination"
COORDINATION_LOG_DIR="${DATA_DIR}/coordination/log"
COORDINATION_SNAPSHOT_DIR="${DATA_DIR}/coordination/snapshots"
CLICKHOUSE_WATCHDOG_ENABLE=${CLICKHOUSE_WATCHDOG_ENABLE:-0}
@ -49,6 +50,7 @@ CLICKHOUSE_WATCHDOG_ENABLE=${CLICKHOUSE_WATCHDOG_ENABLE:-0}
for dir in "$DATA_DIR" \
"$LOG_DIR" \
"$TMP_DIR" \
"$COORDINATION_DIR" \
"$COORDINATION_LOG_DIR" \
"$COORDINATION_SNAPSHOT_DIR"
do

View File

@ -8,16 +8,16 @@ ARG apt_archive="http://archive.ubuntu.com"
RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list
RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
aspell \
curl \
git \
libxml2-utils \
moreutils \
pylint \
python3-fuzzywuzzy \
python3-pip \
shellcheck \
yamllint \
&& pip3 install black boto3 codespell dohq-artifactory PyGithub unidiff
&& pip3 install black boto3 codespell dohq-artifactory PyGithub unidiff pylint==2.6.2
# Architecture of the image when BuildKit/buildx is used
ARG TARGETARCH

View File

@ -18,6 +18,7 @@ def process_result(result_folder):
("typos", "typos_output.txt"),
("whitespaces", "whitespaces_output.txt"),
("workflows", "workflows_output.txt"),
("doc typos", "doc_spell_output.txt"),
)
for name, out_file in checks:

View File

@ -11,6 +11,8 @@ echo "Check python formatting with black" | ts
./check-black -n |& tee /test_output/black_output.txt
echo "Check typos" | ts
./check-typos |& tee /test_output/typos_output.txt
echo "Check docs spelling" | ts
./check-doc-aspell |& tee /test_output/doc_spell_output.txt
echo "Check whitespaces" | ts
./check-whitespaces -n |& tee /test_output/whitespaces_output.txt
echo "Check workflows" | ts

View File

@ -138,7 +138,7 @@ It's important to name tests correctly, so one could turn some tests subset off
| Tester flag| What should be in test name | When flag should be added |
|---|---|---|
| `--[no-]zookeeper`| "zookeeper" or "replica" | Test uses tables from ReplicatedMergeTree family |
| `--[no-]zookeeper`| "zookeeper" or "replica" | Test uses tables from `ReplicatedMergeTree` family |
| `--[no-]shard` | "shard" or "distributed" or "global"| Test using connections to 127.0.0.2 or similar |
| `--[no-]long` | "long" or "deadlock" or "race" | Test runs longer than 60 seconds |

View File

@ -5,7 +5,7 @@ sidebar_position: 62
# Overview of ClickHouse Architecture
ClickHouse is a true column-oriented DBMS. Data is stored by columns, and during the execution of arrays (vectors or chunks of columns).
Whenever possible, operations are dispatched on arrays, rather than on individual values. It is called “vectorized query execution” and it helps lower the cost of actual data processing.
> This idea is nothing new. It dates back to the `APL` (A programming language, 1957) and its descendants: `A +` (APL dialect), `J` (1990), `K` (1993), and `Q` (programming language from Kx Systems, 2003). Array programming is used in scientific data processing. Neither is this idea something new in relational databases: for example, it is used in the `VectorWise` system (also known as Actian Vector Analytic Database by Actian Corporation).
@ -149,13 +149,13 @@ The server implements several different interfaces:
- A TCP interface for the native ClickHouse client and for cross-server communication during distributed query execution.
- An interface for transferring data for replication.
Internally, it is just a primitive multithreaded server without coroutines or fibers. Since the server is not designed to process a high rate of simple queries but to process a relatively low rate of complex queries, each of them can process a vast amount of data for analytics.
Internally, it is just a primitive multithread server without coroutines or fibers. Since the server is not designed to process a high rate of simple queries but to process a relatively low rate of complex queries, each of them can process a vast amount of data for analytics.
The server initializes the `Context` class with the necessary environment for query execution: the list of available databases, users and access rights, settings, clusters, the process list, the query log, and so on. Interpreters use this environment.
We maintain full backward and forward compatibility for the server TCP protocol: old clients can talk to new servers, and new clients can talk to old servers. But we do not want to maintain it eternally, and we are removing support for old versions after about one year.
:::note
For most external applications, we recommend using the HTTP interface because it is simple and easy to use. The TCP protocol is more tightly linked to internal data structures: it uses an internal format for passing blocks of data, and it uses custom framing for compressed data. We haven't released a C library for that protocol because it requires linking most of the ClickHouse codebase, which is not practical.
:::
@ -178,7 +178,7 @@ To execute queries and do side activities ClickHouse allocates threads from one
Server pool is a `Poco::ThreadPool` class instance defined in `Server::main()` method. It can have at most `max_connection` threads. Every thread is dedicated to a single active connection.
Global thread pool is `GlobalThreadPool` singleton class. To allocate thread from it `ThreadFromGlobalPool` is used. It has an interface similar to `std::thread`, but pulls thread from the global pool and does all necessary initializations. It is configured with the following settings:
Global thread pool is `GlobalThreadPool` singleton class. To allocate thread from it `ThreadFromGlobalPool` is used. It has an interface similar to `std::thread`, but pulls thread from the global pool and does all necessary initialization. It is configured with the following settings:
* `max_thread_pool_size` - limit on thread count in pool.
* `max_thread_pool_free_size` - limit on idle thread count waiting for new jobs.
* `thread_pool_queue_size` - limit on scheduled job count.
@ -189,7 +189,7 @@ IO thread pool is implemented as a plain `ThreadPool` accessible via `IOThreadPo
For periodic task execution there is `BackgroundSchedulePool` class. You can register tasks using `BackgroundSchedulePool::TaskHolder` objects and the pool ensures that no task runs two jobs at the same time. It also allows you to postpone task execution to a specific instant in the future or temporarily deactivate task. Global `Context` provides a few instances of this class for different purposes. For general purpose tasks `Context::getSchedulePool()` is used.
There are also specialized thread pools for preemptable tasks. Such `IExecutableTask` task can be split into ordered sequence of jobs, called steps. To schedule these tasks in a manner allowing short tasks to be prioritied over long ones `MergeTreeBackgroundExecutor` is used. As name suggests it is used for background MergeTree related operations such as merges, mutations, fetches and moves. Pool instances are available using `Context::getCommonExecutor()` and other similar methods.
There are also specialized thread pools for preemptable tasks. Such `IExecutableTask` task can be split into ordered sequence of jobs, called steps. To schedule these tasks in a manner allowing short tasks to be prioritized over long ones `MergeTreeBackgroundExecutor` is used. As name suggests it is used for background MergeTree related operations such as merges, mutations, fetches and moves. Pool instances are available using `Context::getCommonExecutor()` and other similar methods.
No matter what pool is used for a job, at start `ThreadStatus` instance is created for this job. It encapsulates all per-thread information: thread id, query id, performance counters, resource consumption and many other useful data. Job can access it via thread local pointer by `CurrentThread::get()` call, so we do not need to pass it to every function.
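As a rough sketch of the `std::thread`-like usage described above (the header path and exact semantics are assumptions made for illustration, not verified against the source):

``` cpp
#include <Common/ThreadPool.h>  // assumed location of ThreadFromGlobalPool

/// Illustrative only: run a job on a thread borrowed from the global pool
/// instead of spawning a raw std::thread.
void runJobInBackground()
{
    ThreadFromGlobalPool worker([]
    {
        // ... do some work on the pooled thread ...
    });
    worker.join();  // joinable, like std::thread
}
```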
@ -201,7 +201,7 @@ Servers in a cluster setup are mostly independent. You can create a `Distributed
Things become more complicated when you have subqueries in IN or JOIN clauses, and each of them uses a `Distributed` table. We have different strategies for the execution of these queries.
There is no global query plan for distributed query execution. Each node has its local query plan for its part of the job. We only have simple one-pass distributed query execution: we send queries for remote nodes and then merge the results. But this is not feasible for complicated queries with high cardinality GROUP BYs or with a large amount of temporary data for JOIN. In such cases, we need to “reshuffle” data between servers, which requires additional coordination. ClickHouse does not support that kind of query execution, and we need to work on it.
There is no global query plan for distributed query execution. Each node has its local query plan for its part of the job. We only have simple one-pass distributed query execution: we send queries for remote nodes and then merge the results. But this is not feasible for complicated queries with high cardinality `GROUP BY`s or with a large amount of temporary data for JOIN. In such cases, we need to “reshuffle” data between servers, which requires additional coordination. ClickHouse does not support that kind of query execution, and we need to work on it.
## Merge Tree {#merge-tree}
@ -231,7 +231,7 @@ Replication is physical: only compressed parts are transferred between nodes, no
Besides, each replica stores its state in ZooKeeper as the set of parts and its checksums. When the state on the local filesystem diverges from the reference state in ZooKeeper, the replica restores its consistency by downloading missing and broken parts from other replicas. When there is some unexpected or broken data in the local filesystem, ClickHouse does not remove it, but moves it to a separate directory and forgets it.
:::note
The ClickHouse cluster consists of independent shards, and each shard consists of replicas. The cluster is **not elastic**, so after adding a new shard, data is not rebalanced between shards automatically. Instead, the cluster load is supposed to be adjusted to be uneven. This implementation gives you more control, and it is ok for relatively small clusters, such as tens of nodes. But for clusters with hundreds of nodes that we are using in production, this approach becomes a significant drawback. We should implement a table engine that spans across the cluster with dynamically replicated regions that could be split and balanced between clusters automatically.
:::

View File

@ -4,7 +4,7 @@ sidebar_label: Build on Mac OS X
description: How to build ClickHouse on Mac OS X
---
# How to Build ClickHouse on Mac OS X
:::info You don't have to build ClickHouse yourself!
You can install pre-built ClickHouse as described in [Quick Start](https://clickhouse.com/#quick-start). Follow **macOS (Intel)** or **macOS (Apple silicon)** installation instructions.
@ -20,9 +20,9 @@ It is also possible to compile with Apple's XCode `apple-clang` or Homebrew's `g
First install [Homebrew](https://brew.sh/)
## For Apple's Clang (discouraged): Install Xcode and Command Line Tools {#install-xcode-and-command-line-tools}
## For Apple's Clang (discouraged): Install XCode and Command Line Tools {#install-xcode-and-command-line-tools}
Install the latest [Xcode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store.
Install the latest [XCode](https://apps.apple.com/am/app/xcode/id497799835?mt=12) from App Store.
Open it at least once to accept the end-user license agreement and automatically install the required components.
@ -62,7 +62,7 @@ cmake --build build
# The resulting binary will be created at: build/programs/clickhouse
```
To build using Xcode's native AppleClang compiler in Xcode IDE (this option is only for development builds and workflows, and is **not recommended** unless you know what you are doing):
To build using XCode native AppleClang compiler in XCode IDE (this option is only for development builds and workflows, and is **not recommended** unless you know what you are doing):
``` bash
cd ClickHouse
@ -71,7 +71,7 @@ mkdir build
cd build
XCODE_IDE=1 ALLOW_APPLECLANG=1 cmake -G Xcode -DCMAKE_BUILD_TYPE=Debug -DENABLE_JEMALLOC=OFF ..
cmake --open .
# ...then, in Xcode IDE select ALL_BUILD scheme and start the building process.
# ...then, in XCode IDE select ALL_BUILD scheme and start the building process.
# The resulting binary will be created at: ./programs/Debug/clickhouse
```
@ -91,9 +91,9 @@ cmake --build build
## Caveats {#caveats}
If you intend to run `clickhouse-server`, make sure to increase the system's maxfiles variable.
If you intend to run `clickhouse-server`, make sure to increase the system's `maxfiles` variable.
:::note
You'll need to use sudo.
:::

View File

@ -19,7 +19,7 @@ The following tutorial is based on the Ubuntu Linux system. With appropriate cha
### Install Git, CMake, Python and Ninja {#install-git-cmake-python-and-ninja}
``` bash
sudo apt-get install git cmake python ninja-build
sudo apt-get install git cmake ccache python3 ninja-build
```
Or cmake3 instead of cmake on older systems.
@ -130,7 +130,7 @@ Here is an example of how to install the new `cmake` from the official website:
```
wget https://github.com/Kitware/CMake/releases/download/v3.22.2/cmake-3.22.2-linux-x86_64.sh
chmod +x cmake-3.22.2-linux-x86_64.sh
./cmake-3.22.2-linux-x86_64.sh
export PATH=/home/milovidov/work/cmake-3.22.2-linux-x86_64/bin/:${PATH}
hash cmake
```
@ -163,7 +163,7 @@ ClickHouse is available in pre-built binaries and packages. Binaries are portabl
They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request.
To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green checkmark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
## Faster builds for development: Split build configuration {#split-build}

View File

@ -19,7 +19,7 @@ cmake .. \
## CMake files types
1. ClickHouse's source CMake files (located in the root directory and in /src).
1. ClickHouse source CMake files (located in the root directory and in /src).
2. Arch-dependent CMake files (located in /cmake/*os_name*).
3. Libraries finders (search for contrib libraries, located in /contrib/*/CMakeLists.txt).
4. Contrib build CMake files (used instead of libraries' own CMake files, located in /cmake/modules)
@ -456,7 +456,7 @@ option(ENABLE_TESTS "Provide unit_test_dbms target with Google.test unit tests"
#### If the option's state could produce unwanted (or unusual) result, explicitly warn the user.
Suppose you have an option that may strip debug symbols from the ClickHouse's part.
Suppose you have an option that may strip debug symbols from the ClickHouse part.
This can speed up the linking process, but produces a binary that cannot be debugged.
In that case, prefer explicitly raising a warning telling the developer that he may be doing something wrong.
Also, such options should be disabled where applicable.

View File

@ -31,7 +31,7 @@ If you are not sure what to do, ask a maintainer for help.
## Merge With Master
Verifies that the PR can be merged to master. If not, it will fail with the
message 'Cannot fetch mergecommit'. To fix this check, resolve the conflict as
message `Cannot fetch mergecommit`. To fix this check, resolve the conflict as
described in the [GitHub
documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/resolving-a-merge-conflict-on-github),
or merge the `master` branch to your pull request branch using git.
@ -57,7 +57,7 @@ You have to specify a changelog category for your change (e.g., Bug Fix), and
write a user-readable message describing the change for [CHANGELOG.md](../whats-new/changelog/)
## Push To Dockerhub
## Push To DockerHub
Builds docker images used for build and tests, then pushes them to DockerHub.
@ -118,7 +118,7 @@ Builds ClickHouse in various configurations for use in further steps. You have t
- **Compiler**: `gcc-9` or `clang-10` (or `clang-10-xx` for other architectures e.g. `clang-10-freebsd`).
- **Build type**: `Debug` or `RelWithDebInfo` (cmake).
- **Sanitizer**: `none` (without sanitizers), `address` (ASan), `memory` (MSan), `undefined` (UBSan), or `thread` (TSan).
- **Splitted** `splitted` is a [split build](../development/build.md#split-build)
- **Split** `splitted` is a [split build](../development/build.md#split-build)
- **Status**: `success` or `fail`
- **Build log**: link to the building and files copying log, useful when build failed.
- **Build time**.

View File

@ -96,9 +96,9 @@ SELECT library_name, license_type, license_path FROM system.licenses ORDER BY li
## Adding new third-party libraries and maintaining patches in third-party libraries {#adding-third-party-libraries}
1. Each third-party libary must reside in a dedicated directory under the `contrib/` directory of the ClickHouse repository. Avoid dumps/copies of external code, instead use Git's submodule feature to pull third-party code from an external upstream repository.
2. Submodules are listed in `.gitmodule`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. the external libary requires patching/customization, create a fork of the official repository in the [Clickhouse organization in GitHub](https://github.com/ClickHouse).
1. Each third-party library must reside in a dedicated directory under the `contrib/` directory of the ClickHouse repository. Avoid dumps/copies of external code, instead use Git submodule feature to pull third-party code from an external upstream repository.
2. Submodules are listed in `.gitmodule`. If the external library can be used as-is, you may reference the upstream repository directly. Otherwise, i.e. the external library requires patching/customization, create a fork of the official repository in the [Clickhouse organization in GitHub](https://github.com/ClickHouse).
3. In the latter case, create a branch with `clickhouse/` prefix from the branch you want to integrate, e.g. `clickhouse/master` (for `master`) or `clickhouse/release/vX.Y.Z` (for a `release/vX.Y.Z` tag). The purpose of this branch is to isolate customization of the library from upstream work. For example, pulls from the upstream repository into the fork will leave all `clickhouse/` branches unaffected. Submodules in `contrib/` must only track `clickhouse/` branches of forked third-party repositories.
4. To patch a fork of a third-party library, create a dedicated branch with `clickhouse/` prefix in the fork, e.g. `clickhouse/fix-some-desaster`. Finally, merge the patch branch into the custom tracking branch (e.g. `clickhouse/master` or `clickhouse/release/vX.Y.Z`) using a PR.
5. Always create patches of third-party libraries with the official repository in mind. Once a PR of a patch branch to the `clickhouse/` branch in the fork repository is done and the submodule version in ClickHouse's official repository is bumped, consider opening another PR from the patch branch to the upstream library repository. This ensures, that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers.
5. Always create patches of third-party libraries with the official repository in mind. Once a PR of a patch branch to the `clickhouse/` branch in the fork repository is done and the submodule version in ClickHouse official repository is bumped, consider opening another PR from the patch branch to the upstream library repository. This ensures, that 1) the contribution has more than a single use case and importance, 2) others will also benefit from it, 3) the change will not remain a maintenance burden solely on ClickHouse developers.
9. To update a submodule with changes in the upstream repository, first merge upstream `master` (or a new `versionX.Y.Z` tag) into the `clickhouse`-tracking branch in the fork repository. Conflicts with patches/customization will need to be resolved in this merge (see Step 4.). Once the merge is done, bump the submodule in ClickHouse to point to the new hash in the fork.

View File

@ -70,7 +70,7 @@ You can also clone the repository via https protocol:
This, however, will not let you send your changes to the server. You can still use it temporarily and add the SSH keys later replacing the remote address of the repository with `git remote` command.
You can also add original ClickHouse repos address to your local repository to pull updates from there:
You can also add original ClickHouse repo address to your local repository to pull updates from there:
git remote add upstream git@github.com:ClickHouse/ClickHouse.git
@ -177,7 +177,7 @@ If you require to build all the binaries (utilities and tests), you should run n
Full build requires about 30GB of free disk space or 15GB to build the main binaries.
When a large amount of RAM is available on build machine you should limit the number of build tasks run in parallel with `-j` param:
When a large amount of RAM is available on build machine you should limit the number of build tasks run in parallel with `-j` parameter:
ninja -j 1 clickhouse-server clickhouse-client
@ -269,7 +269,7 @@ Developing ClickHouse often requires loading realistic datasets. It is particula
Navigate to your fork repository in GitHub's UI. If you have been developing in a branch, you need to select that branch. There will be a “Pull request” button located on the screen. In essence, this means “create a request for accepting my changes into the main repository”.
A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelogs.
A pull request can be created even if the work is not completed yet. In this case please put the word “WIP” (work in progress) at the beginning of the title, it can be changed later. This is useful for cooperative reviewing and discussion of changes as well as for running all of the available tests. It is important that you provide a brief description of your changes, it will later be used for generating release changelog.
Testing will commence as soon as ClickHouse employees label your PR with a tag “can be tested”. The results of some first checks (e.g. code style) will come in within several minutes. Build check results will arrive within half an hour. And the main set of tests will report itself within an hour.

View File

@ -2,7 +2,7 @@
Rust library integration will be described based on BLAKE3 hash-function integration.
The first step is forking a library and making neccessary changes for Rust and C/C++ compatibility.
The first step is forking a library and making necessary changes for Rust and C/C++ compatibility.
After forking the library repository you need to change target settings in the Cargo.toml file. Firstly, you need to switch the build to a static library. Secondly, you need to add the cbindgen crate to the crate list. We will use it later to generate a C-header automatically.
@ -51,9 +51,9 @@ pub unsafe extern "C" fn blake3_apply_shim(
}
```
This method gets C-compatible string, its size and output string pointer as input. Then, it converts C-compatible inputs into types that are used by actual library methods and calls them. After that, it should convert library methods' outputs back into C-compatible type. In that particular case library supported direct writing into pointer by method fill(), so the convertion was not needed. The main advice here is to create less methods, so you will need to do less convertions on each method call and won't create much overhead.
This method gets C-compatible string, its size and output string pointer as input. Then, it converts C-compatible inputs into types that are used by actual library methods and calls them. After that, it should convert library methods' outputs back into C-compatible type. In that particular case library supported direct writing into pointer by method fill(), so the conversion was not needed. The main advice here is to create less methods, so you will need to do less conversions on each method call and won't create much overhead.
Also, you should use attribute #[no_mangle] and extern "C" for every C-compatible attribute. Without it library can compile incorrectly and cbindgen won't launch header autogeneration.
Also, you should use attribute #[no_mangle] and `extern "C"` for every C-compatible attribute. Without it library can compile incorrectly and cbindgen won't launch header autogeneration.
After all these steps you can test your library in a small project to find all problems with compatibility or header generation. If any problems occur during header generation, you can try to configure it with cbindgen.toml file (you can find an example of it in BLAKE3 directory or a template here: [https://github.com/eqrion/cbindgen/blob/master/template.toml](https://github.com/eqrion/cbindgen/blob/master/template.toml)). If everything works correctly, you can finally integrate its methods into ClickHouse.

View File

@ -4,7 +4,7 @@ sidebar_label: C++ Guide
description: A list of recommendations regarding coding style, naming convention, formatting and more
---
# How to Write C++ Code
## General Recommendations {#general-recommendations}
@ -196,7 +196,7 @@ std::cerr << static_cast<int>(c) << std::endl;
The same is true for small methods in any classes or structs.
For templated classes and structs, do not separate the method declarations from the implementation (because otherwise they must be defined in the same translation unit).
For template classes and structs, do not separate the method declarations from the implementation (because otherwise they must be defined in the same translation unit).
**31.** You can wrap lines at 140 characters, instead of 80.
@ -285,7 +285,7 @@ Note: You can use Doxygen to generate documentation from these comments. But Dox
/// WHAT THE FAIL???
```
**14.** Do not use comments to make delimeters.
**14.** Do not use comments to make delimiters.
``` cpp
///******************************************************
@ -491,7 +491,7 @@ if (0 != close(fd))
throwFromErrno("Cannot close file " + file_name, ErrorCodes::CANNOT_CLOSE_FILE);
```
You can use assert to check invariants in code.
You can use assert to check invariant in code.
**4.** Exception types.
@ -552,9 +552,9 @@ Do not try to implement lock-free data structures unless it is your primary area
In most cases, prefer references.
**10.** const.
**10.** `const`.
Use constant references, pointers to constants, `const_iterator`, and const methods.
Use constant references, pointers to constants, `const_iterator`, and `const` methods.
Consider `const` to be default and use non-`const` only when necessary.
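A small illustration of the const-by-default guidance above (example code, not from the guide):

``` cpp
#include <cstddef>
#include <vector>

/// Constant reference parameter and const iteration variable;
/// only the accumulator is non-const because it has to change.
size_t countPositive(const std::vector<int> & values)
{
    size_t result = 0;
    for (const int value : values)
        if (value > 0)
            ++result;
    return result;
}
```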
@ -596,7 +596,7 @@ public:
AggregateFunctionPtr get(const String & name, const DataTypes & argument_types) const;
```
**15.** namespace.
**15.** `namespace`.
There is no need to use a separate `namespace` for application code.
@ -606,7 +606,7 @@ For medium to large libraries, put everything in a `namespace`.
In the library's `.h` file, you can use `namespace detail` to hide implementation details not needed for the application code.
In a `.cpp` file, you can use a `static` or anonymous namespace to hide symbols.
In a `.cpp` file, you can use a `static` or anonymous `namespace` to hide symbols.
Also, a `namespace` can be used for an `enum` to prevent the corresponding names from falling into an external `namespace` (but it's better to use an `enum class`).
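A brief sketch of that last point (illustrative names):

``` cpp
/// Preferred: a scoped enumeration keeps its names out of the enclosing scope.
enum class LogLevel { Debug, Info, Warning, Error };

/// The alternative mentioned above: wrap a plain enum in a namespace.
namespace log_level
{
    enum Value { Debug, Info, Warning, Error };
}
```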

View File

@ -4,7 +4,7 @@ sidebar_label: Testing
description: Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way.
---
# ClickHouse Testing
## Functional Tests
@ -85,7 +85,7 @@ Performance tests allow to measure and compare performance of some isolated part
Each test runs one or multiple queries (possibly with combinations of parameters) in a loop.
If you want to improve performance of ClickHouse in some scenario, and if improvements can be observed on simple queries, it is highly recommended to write a performance test. It always makes sense to use `perf top` or other perf tools during your tests.
If you want to improve performance of ClickHouse in some scenario, and if improvements can be observed on simple queries, it is highly recommended to write a performance test. It always makes sense to use `perf top` or other `perf` tools during your tests.
## Test Tools and Scripts {#test-tools-and-scripts}
@ -228,7 +228,7 @@ Our Security Team did some basic overview of ClickHouse capabilities from the se
We run `clang-tidy` on per-commit basis. `clang-static-analyzer` checks are also enabled. `clang-tidy` is also used for some style checks.
We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in `tests/instructions/` directory.
If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box.
@ -244,7 +244,7 @@ In debug build we also involve a customization of libc that ensures that no "har
Debug assertions are used extensively.
In debug build, if exception with "logical error" code (implies a bug) is being thrown, the program is terminated prematurally. It allows to use exceptions in release build but make it an assertion in debug build.
In debug build, if exception with "logical error" code (implies a bug) is being thrown, the program is terminated prematurely. It allows to use exceptions in release build but make it an assertion in debug build.
Debug version of jemalloc is used for debug builds.
Debug version of libc++ is used for debug builds.
@ -253,7 +253,7 @@ Debug version of libc++ is used for debug builds.
Data stored on disk is checksummed. Data in MergeTree tables is checksummed in three ways simultaneously* (compressed data blocks, uncompressed data blocks, the total checksum across blocks). Data transferred over network between client and server or between servers is also checksummed. Replication ensures bit-identical data on replicas.
It is required to protect from faulty hardware (bit rot on storage media, bit flips in RAM on server, bit flips in RAM of network controller, bit flips in RAM of network switch, bit flips in RAM of client, bit flips on the wire). Note that bit flips are common and likely to occur even for ECC RAM and in presense of TCP checksums (if you manage to run thousands of servers processing petabytes of data each day). [See the video (russian)](https://www.youtube.com/watch?v=ooBAQIe0KlQ).
It is required to protect from faulty hardware (bit rot on storage media, bit flips in RAM on server, bit flips in RAM of network controller, bit flips in RAM of network switch, bit flips in RAM of client, bit flips on the wire). Note that bit flips are common and likely to occur even for ECC RAM and in presence of TCP checksums (if you manage to run thousands of servers processing petabytes of data each day). [See the video (russian)](https://www.youtube.com/watch?v=ooBAQIe0KlQ).
ClickHouse provides diagnostics that will help ops engineers to find faulty hardware.

View File

@ -12,7 +12,7 @@ The table engine (type of table) determines:
- Which queries are supported, and how.
- Concurrent data access.
- Use of indexes, if present.
- Whether multithreaded request execution is possible.
- Whether multithread request execution is possible.
- Data replication parameters.
## Engine Families {#engine-families}

View File

@ -40,7 +40,7 @@ Uniqueness of rows is determined by the `ORDER BY` table section, not `PRIMARY K
When merging, `ReplacingMergeTree` from all the rows with the same sorting key leaves only one:
- The last in the selection, if `ver` not set. A selection is a set of rows in a set of parts participating in the merge. The most recently created part (the last insert) will be the last one in the selection. Thus, after deduplication, the very last row from the most recent insert will remain for each unique sorting key.
- With the maximum version, if `ver` specified.
- With the maximum version, if `ver` is specified. If `ver` is the same for several rows, then the "if `ver` is not specified" rule is applied to them, i.e. the most recently inserted row will remain.
**Query clauses**

View File

@ -1,78 +1,139 @@
---
sidebar_label: Web Analytics Data
description: Dataset consists of two tables containing anonymized web analytics data with hits and visits
description: Dataset consisting of two tables containing anonymized web analytics data with hits and visits
---
# Anonymized Web Analytics Data
Dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
This dataset consists of two tables containing anonymized web analytics data with hits (`hits_v1`) and visits (`visits_v1`).
The dataset consists of two tables, either of them can be downloaded as a compressed `tsv.xz` file or as prepared partitions. In addition to that, an extended version of the `hits` table containing 100 million rows is available as TSV at https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz and as prepared partitions at https://datasets.clickhouse.com/hits/partitions/hits_100m_obfuscated_v1.tar.xz.
The tables can be downloaded as compressed `tsv.xz` files. In addition to the sample worked with in this document, an extended (7.5GB) version of the `hits` table containing 100 million rows is available as TSV at [https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz](https://datasets.clickhouse.com/hits/tsv/hits_100m_obfuscated_v1.tsv.xz).
## Obtaining Tables from Prepared Partitions {#obtaining-tables-from-prepared-partitions}
## Download and ingest the data
Download and import hits table:
``` bash
curl -O https://datasets.clickhouse.com/hits/partitions/hits_v1.tar
tar xvf hits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits:
``` bash
curl -O https://datasets.clickhouse.com/visits/partitions/visits_v1.tar
tar xvf visits_v1.tar -C /var/lib/clickhouse # path to ClickHouse data directory
# check permissions on unpacked data, fix if required
sudo service clickhouse-server restart
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
## Obtaining Tables from Compressed TSV File {#obtaining-tables-from-compressed-tsv-file}
Download and import hits from compressed TSV file:
### Download the hits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/hits/tsv/hits_v1.tsv.xz | unxz --threads=`nproc` > hits_v1.tsv
# Validate the checksum
md5sum hits_v1.tsv
# Checksum should be equal to: f3631b6295bf06989c1437491f7592cb
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
# for hits_v1
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# for hits_100m_obfuscated
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
# import data
### Create the database and table
```bash
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
For hits_v1
```bash
clickhouse-client --query "CREATE TABLE datasets.hits_v1 ( WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
Or for hits_100m_obfuscated
```bash
clickhouse-client --query="CREATE TABLE default.hits_100m_obfuscated (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, Refresh UInt8, RefererCategoryID UInt16, RefererRegionID UInt32, URLCategoryID UInt16, URLRegionID UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, OriginalURL String, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), LocalEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, RemoteIP UInt32, WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming UInt32, DNSTiming UInt32, ConnectTiming UInt32, ResponseStartTiming UInt32, ResponseEndTiming UInt32, FetchTiming UInt32, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
```
### Import the hits data:
```bash
cat hits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.hits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.hits_v1 FINAL"
```
Verify the count of rows
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.hits_v1"
```
Download and import visits from compressed tsv-file:
```response
8873898
```
### Download the visits compressed TSV file:
``` bash
curl https://datasets.clickhouse.com/visits/tsv/visits_v1.tsv.xz | unxz --threads=`nproc` > visits_v1.tsv
# Validate the checksum
md5sum visits_v1.tsv
# Checksum should be equal to: 6dafe1a0f24e59e3fc2d0fed85601de6
# now create table
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS datasets"
```
### Create the visits table
```bash
clickhouse-client --query "CREATE TABLE datasets.visits_v1 ( CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, 
ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192"
# import data
```
### Import the visits data
```bash
cat visits_v1.tsv | clickhouse-client --query "INSERT INTO datasets.visits_v1 FORMAT TSV" --max_insert_block_size=100000
# optionally you can optimize table
clickhouse-client --query "OPTIMIZE TABLE datasets.visits_v1 FINAL"
```
Verify the count
```bash
clickhouse-client --query "SELECT COUNT(*) FROM datasets.visits_v1"
```
```response
1680609
```
## Example Queries {#example-queries}
[The ClickHouse tutorial](../../tutorial.md) is based on this web analytics dataset, and the recommended way to get started with this dataset is to go through the tutorial.
## An example JOIN
The hits and visits datasets are used in the ClickHouse test routines; the query below is one of the queries from the test suite. The rest of the tests are referenced in the *Next Steps* section at the end of this page. Additional examples of queries to these tables can be found among the [stateful tests](https://github.com/ClickHouse/ClickHouse/tree/master/tests/queries/1_stateful) of ClickHouse (they are named `test.hits` and `test.visits` there).
```bash
clickhouse-client --query "SELECT
EventDate,
hits,
visits
FROM
(
SELECT
EventDate,
count() AS hits
FROM datasets.hits_v1
GROUP BY EventDate
) ANY LEFT JOIN
(
SELECT
StartDate AS EventDate,
sum(Sign) AS visits
FROM datasets.visits_v1
GROUP BY EventDate
) USING EventDate
ORDER BY hits DESC
LIMIT 10
SETTINGS joined_subquery_requires_alias = 0
FORMAT PrettyCompact"
```
```response
┌──EventDate─┬────hits─┬─visits─┐
│ 2014-03-17 │ 1406958 │ 265108 │
│ 2014-03-19 │ 1405797 │ 261624 │
│ 2014-03-18 │ 1383658 │ 258723 │
│ 2014-03-20 │ 1353623 │ 255328 │
│ 2014-03-21 │ 1245779 │ 236232 │
│ 2014-03-23 │ 1046491 │ 202212 │
│ 2014-03-22 │ 1031592 │ 197354 │
└────────────┴─────────┴────────┘
```
## Next Steps
[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) uses the hits dataset to discuss the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
Additional examples of queries to these tables can be found among the ClickHouse [stateful tests](https://github.com/ClickHouse/ClickHouse/blob/d7129855757f38ceec3e4ecc6dafacdabe9b178f/tests/queries/1_stateful/00172_parallel_join.sql).
:::note
The test suite uses a database named `test`, and the tables are named `hits` and `visits`. You can rename your database and tables, or edit the SQL from the test file.
:::
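If you prefer to match the names used by the test suite, a minimal sketch follows (it assumes the data was imported into `datasets.hits_v1` and `datasets.visits_v1` as shown above; the renames themselves are a suggestion, not part of the original instructions):
```bash
# Hypothetical renames so queries copied from the stateful tests run unmodified.
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits, datasets.visits_v1 TO test.visits"
```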

View File

@ -190,8 +190,7 @@ sudo ./clickhouse install
### From Precompiled Binaries for Non-Standard Environments {#from-binaries-non-linux}
For non-Linux operating systems and for AArch64 CPU arhitecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay).
For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse builds are provided as a cross-compiled binary from the latest commit of the `master` branch (with a few hours delay).
- [MacOS x86_64](https://builds.clickhouse.com/master/macos/clickhouse)
```bash

View File

@ -119,7 +119,7 @@ Dates with times are written in the format `YYYY-MM-DD hh:mm:ss` and parsed in t
This all occurs in the system time zone at the time the client or server starts (depending on which of them formats data). For dates with times, daylight saving time is not specified. So if a dump has times during daylight saving time, the dump does not unequivocally match the data, and parsing will select one of the two times.
During a read operation, incorrect dates and dates with times can be parsed with natural overflow or as null dates and times, without an error message.
As an exception, parsing dates with times is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats YYYY-MM-DD hh:mm:ss and NNNNNNNNNN are differentiated automatically.
As an exception, parsing dates with times is also supported in Unix timestamp format, if it consists of exactly 10 decimal digits. The result is not time zone-dependent. The formats `YYYY-MM-DD hh:mm:ss` and `NNNNNNNNNN` are differentiated automatically.
Strings are output with backslash-escaped special characters. The following escape sequences are used for output: `\b`, `\f`, `\r`, `\n`, `\t`, `\0`, `\'`, `\\`. Parsing also supports the sequences `\a`, `\v`, and `\xHH` (hex escape sequences) and any `\c` sequences, where `c` is any character (these sequences are converted to `c`). Thus, reading data supports formats where a line feed can be written as `\n` or `\`, or as a line feed. For example, the string `Hello world` with a line feed between the words instead of space can be parsed in any of the following variations:
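As a rough illustration of the escaping rules above (the table `t` with a single `String` column is an assumption introduced for this sketch, not part of the original example), a backslash-escaped line feed survives a TSV round trip:
```bash
clickhouse-client --query "CREATE TABLE IF NOT EXISTS t (s String) ENGINE = Memory"
# The data line contains the two characters '\' and 'n'; TSV parsing turns them into a real line feed.
printf 'Hello\\nworld\n' | clickhouse-client --query "INSERT INTO t FORMAT TabSeparated"
clickhouse-client --query "SELECT s FROM t FORMAT TSVRaw"
```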
@ -333,8 +333,9 @@ Total rows: 2
```
``` sql
INSERT INTO UserActivity FORMAT Template SETTINGS
INSERT INTO UserActivity SETTINGS
format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format'
FORMAT Template
```
`/some/path/resultset.format`:
@ -359,8 +360,9 @@ Similar to `Template`, but skips whitespace characters between delimiters and va
Its possible to read `JSON` using this format, if values of columns have the same order in all rows. For example, the following request can be used for inserting data from output example of format [JSON](#json):
``` sql
INSERT INTO table_name FORMAT TemplateIgnoreSpaces SETTINGS
INSERT INTO table_name SETTINGS
format_template_resultset = '/some/path/resultset.format', format_template_row = '/some/path/row.format', format_template_rows_between_delimiter = ','
FORMAT TemplateIgnoreSpaces
```
`/some/path/resultset.format`:
@ -816,7 +818,7 @@ Columns that are not present in the block will be filled with default values (yo
## JSONEachRow {#jsoneachrow}
In this format, CliskHouse outputs each row as a separated, newline-delimited JSON Object.
In this format, ClickHouse outputs each row as a separated, newline-delimited JSON Object.
Example:
@ -1337,7 +1339,7 @@ Arrays can be nested and can have a value of the `Nullable` type as an argument.
You can insert CapnProto data from a file into ClickHouse table by the following command:
``` bash
$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits FORMAT CapnProto SETTINGS format_schema = 'schema:Message'"
$ cat capnproto_messages.bin | clickhouse-client --query "INSERT INTO test.hits SETTINGS format_schema = 'schema:Message' FORMAT CapnProto"
```
Where `schema.capnp` looks like this:
@ -1363,9 +1365,9 @@ Columns `name` ([String](../sql-reference/data-types/string.md)) and `value` (nu
Rows may optionally contain `help` ([String](../sql-reference/data-types/string.md)) and `timestamp` (number).
Column `type` ([String](../sql-reference/data-types/string.md)) is either `counter`, `gauge`, `histogram`, `summary`, `untyped` or empty.
Each metric value may also have some `labels` ([Map(String, String)](../sql-reference/data-types/map.md)).
Several consequent rows may refer to the one metric with different lables. The table should be sorted by metric name (e.g., with `ORDER BY name`).
Several consequent rows may refer to the one metric with different labels. The table should be sorted by metric name (e.g., with `ORDER BY name`).
There's special requirements for labels for `histogram` and `summary`, see [Prometheus doc](https://prometheus.io/docs/instrumenting/exposition_formats/#histograms-and-summaries) for the details. Special rules applied to row with labels `{'count':''}` and `{'sum':''}`, they'll be convered to `<metric_name>_count` and `<metric_name>_sum` respectively.
There's special requirements for labels for `histogram` and `summary`, see [Prometheus doc](https://prometheus.io/docs/instrumenting/exposition_formats/#histograms-and-summaries) for the details. Special rules applied to row with labels `{'count':''}` and `{'sum':''}`, they'll be converted to `<metric_name>_count` and `<metric_name>_sum` respectively.
**Example:**
@ -1439,7 +1441,7 @@ SELECT * FROM test.table FORMAT Protobuf SETTINGS format_schema = 'schemafile:Me
```
``` bash
cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT Protobuf SETTINGS format_schema='schemafile:MessageType'"
cat protobuf_messages.bin | clickhouse-client --query "INSERT INTO test.table SETTINGS format_schema='schemafile:MessageType' FORMAT Protobuf"
```
where the file `schemafile.proto` looks like this:
@ -1665,7 +1667,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e
### Parquet format settings {#parquet-format-settings}
- [output_format_parquet_row_group_size](../operations/settings/settings.md#output_format_parquet_row_group_size) - row group size in rows while data output. Default value - `1000000`.
- [output_format_parquet_row_group_size](../operations/settings/settings.md#output_format_parquet_row_group_size) - row group size in rows while data output. Default value - `1000000`.
- [output_format_parquet_string_as_string](../operations/settings/settings.md#output_format_parquet_string_as_string) - use Parquet String type instead of Binary for String columns. Default value - `false`.
- [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested) - allow inserting array of structs into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) table in Parquet input format. Default value - `false`.
- [input_format_parquet_case_insensitive_column_matching](../operations/settings/settings.md#input_format_parquet_case_insensitive_column_matching) - ignore case when matching Parquet columns with ClickHouse columns. Default value - `false`.
@ -1845,7 +1847,7 @@ When working with the `Regexp` format, you can use the following settings:
- Quoted (similarly to [Values](#data-format-values))
- Raw (extracts subpatterns as a whole, no escaping rules, similarly to [TSVRaw](#tabseparatedraw))
- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines the need to throw an exeption in case the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`.
- `format_regexp_skip_unmatched` — [UInt8](../sql-reference/data-types/int-uint.md). Defines the need to throw an exception in case the `format_regexp` expression does not match the imported data. Can be set to `0` or `1`.
**Usage**
@ -1875,7 +1877,7 @@ CREATE TABLE imp_regex_table (id UInt32, array Array(UInt32), string String, dat
Import command:
```bash
$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table FORMAT Regexp SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0;"
$ cat data.tsv | clickhouse-client --query "INSERT INTO imp_regex_table SETTINGS format_regexp='id: (.+?) array: (.+?) string: (.+?) date: (.+?)', format_regexp_escaping_rule='Escaped', format_regexp_skip_unmatched=0 FORMAT Regexp;"
```
Query:

View File

@ -422,7 +422,7 @@ Now `rule` can configure `method`, `headers`, `url`, `handler`:
- `query` — use with `predefined_query_handler` type, executes query when the handler is called.
- `query_param_name` — use with `dynamic_query_handler` type, extracts and executes the value corresponding to the `query_param_name` value in HTTP request params.
- `query_param_name` — use with `dynamic_query_handler` type, extracts and executes the value corresponding to the `query_param_name` value in HTTP request parameters.
- `status` — use with `static` type, response status code.
@ -477,9 +477,9 @@ In one `predefined_query_handler` only supports one `query` of an insert type.
### dynamic_query_handler {#dynamic_query_handler}
In `dynamic_query_handler`, the query is written in the form of param of the HTTP request. The difference is that in `predefined_query_handler`, the query is written in the configuration file. You can configure `query_param_name` in `dynamic_query_handler`.
In `dynamic_query_handler`, the query is written in the form of parameter of the HTTP request. The difference is that in `predefined_query_handler`, the query is written in the configuration file. You can configure `query_param_name` in `dynamic_query_handler`.
ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the param is not passed in.
ClickHouse extracts and executes the value corresponding to the `query_param_name` value in the URL of the HTTP request. The default value of `query_param_name` is `/query` . It is an optional configuration. If there is no definition in the configuration file, the parameter is not passed in.
To experiment with this functionality, the example defines the values of [max_threads](../operations/settings/settings.md#settings-max_threads) and `max_final_threads` and queries whether the settings were set successfully.
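A hedged sketch of calling such a handler; the URL `/run` and the parameter name `my_query` are assumptions that must match the `<url>` and `<query_param_name>` of your own rule:
```bash
# Assumes a dynamic_query_handler rule matching /run with <query_param_name>my_query</query_param_name>
curl "http://localhost:8123/run?my_query=SELECT%201"
```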

View File

@ -5,7 +5,7 @@ sidebar_label: PostgreSQL Interface
# PostgreSQL Interface
ClickHouse supports the PostgreSQL wire protocol, which allows you to use Postgres clients to connect to ClickHouse. In a sense, ClickHouse can pretend to be a PostgreSQL instance - allowing you to connect a PostgreSQL client application to ClickHouse that is not already directy supported by ClickHouse (for example, Amazon Redshift).
ClickHouse supports the PostgreSQL wire protocol, which allows you to use Postgres clients to connect to ClickHouse. In a sense, ClickHouse can pretend to be a PostgreSQL instance - allowing you to connect a PostgreSQL client application to ClickHouse that is not already directly supported by ClickHouse (for example, Amazon Redshift).
To enable the PostgreSQL wire protocol, add the [postgresql_port](../operations/server-configuration-parameters/settings#server_configuration_parameters-postgresql_port) setting to your server's configuration file. For example, you could define the port in a new XML file in your `config.d` folder:
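A minimal sketch of such a file (the file name and the port `9005` are illustrative choices, not requirements):
```bash
cat > /etc/clickhouse-server/config.d/postgresql.xml <<'EOF'
<clickhouse>
    <postgresql_port>9005</postgresql_port>
</clickhouse>
EOF
# Restart with whatever mechanism your installation uses; systemd is assumed here.
sudo systemctl restart clickhouse-server
```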
@ -59,7 +59,7 @@ The PostgreSQL protocol currently only supports plain-text passwords.
## Using SSL
If you have SSL/TLS configured on your ClickHouse instance, then `postgresql_port` will use the same settings (the port is shared for both secure and unsecure clients).
If you have SSL/TLS configured on your ClickHouse instance, then `postgresql_port` will use the same settings (the port is shared for both secure and insecure clients).
Each client has their own method of how to connect using SSL. The following command demonstrates how to pass in the certificates and key to securely connect `psql` to ClickHouse:

View File

@ -47,6 +47,8 @@ ClickHouse Inc does **not** maintain the libraries listed below and hasnt don
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -53,7 +53,7 @@ Internal coordination settings are located in the `<keeper_server>.<coordination
- `auto_forwarding` — Allow to forward write requests from followers to the leader (default: true).
- `shutdown_timeout` — Wait to finish internal connections and shutdown (ms) (default: 5000).
- `startup_timeout` — If the server doesn't connect to other quorum participants in the specified timeout it will terminate (ms) (default: 30000).
- `four_letter_word_white_list` — White list of 4lw commands (default: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro").
- `four_letter_word_white_list` — White list of 4lw commands (default: `conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro`).
Quorum configuration is located in the `<keeper_server>.<raft_configuration>` section and contain servers description.
@ -122,7 +122,7 @@ clickhouse keeper --config /etc/your_path_to_config/config.xml
ClickHouse Keeper also provides 4lw commands which are almost the same with Zookeeper. Each command is composed of four letters such as `mntr`, `stat` etc. There are some more interesting commands: `stat` gives some general information about the server and connected clients, while `srvr` and `cons` give extended details on server and connections respectively.
The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro".
The 4lw commands has a white list configuration `four_letter_word_white_list` which has default value `conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro`.
You can issue the commands to ClickHouse Keeper via telnet or nc, at the client port.
@ -132,7 +132,7 @@ echo mntr | nc localhost 9181
Below are the detailed 4lw commands:
- `ruok`: Tests if server is running in a non-error state. The server will respond with imok if it is running. Otherwise it will not respond at all. A response of "imok" does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
- `ruok`: Tests if server is running in a non-error state. The server will respond with `imok` if it is running. Otherwise it will not respond at all. A response of `imok` does not necessarily indicate that the server has joined the quorum, just that the server process is active and bound to the specified client port. Use "stat" for details on state wrt quorum and client connection information.
```
imok
@ -330,9 +330,9 @@ E.g. for a 3-node cluster, it will continue working correctly if only 1 node cra
Cluster configuration can be dynamically configured but there are some limitations. Reconfiguration relies on Raft also
so to add/remove a node from the cluster you need to have a quorum. If you lose too many nodes in your cluster at the same time without any chance
of starting them again, Raft will stop working and not allow you to reconfigure your cluster using the convenvtional way.
of starting them again, Raft will stop working and not allow you to reconfigure your cluster using the conventional way.
Nevertheless, Clickhouse Keeper has a recovery mode which allows you to forcfully reconfigure your cluster with only 1 node.
Nevertheless, Clickhouse Keeper has a recovery mode which allows you to forcefully reconfigure your cluster with only 1 node.
This should be done only as your last resort if you cannot start your nodes again, or start a new instance on the same endpoint.
Important things to note before continuing:

View File

@ -57,7 +57,7 @@ Substitutions can also be performed from ZooKeeper. To do this, specify the attr
The `config.xml` file can specify a separate config with user settings, profiles, and quotas. The relative path to this config is set in the `users_config` element. By default, it is `users.xml`. If `users_config` is omitted, the user settings, profiles, and quotas are specified directly in `config.xml`.
Users configuration can be splitted into separate files similar to `config.xml` and `config.d/`.
Users configuration can be split into separate files similar to `config.xml` and `config.d/`.
Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`.
Directory `users.d` is used by default, as `users_config` defaults to `users.xml`.
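A hedged sketch of splitting one user out into `users.d` (the file name and the `alice` user are purely illustrative):
```bash
cat > /etc/clickhouse-server/users.d/alice.xml <<'EOF'
<clickhouse>
    <users>
        <alice>
            <password></password>
            <networks><ip>::1</ip></networks>
            <profile>default</profile>
            <quota>default</quota>
        </alice>
    </users>
</clickhouse>
EOF
```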

View File

@ -70,7 +70,7 @@ Regardless of RAID use, always use replication for data security.
Enable NCQ with a long queue. For HDD, choose the CFQ scheduler, and for SSD, choose noop. Dont reduce the readahead setting.
For HDD, enable the write cache.
Make sure that [fstrim](https://en.wikipedia.org/wiki/Trim_(computing)) is enabled for NVME and SSD disks in your OS (usually it's implemented using a cronjob or systemd service).
Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is enabled for NVME and SSD disks in your OS (usually it's implemented using a cronjob or systemd service).
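On systemd-based distributions, one common way to do this is the `fstrim.timer` unit shipped with util-linux (a sketch; your distribution may schedule TRIM differently):
```bash
sudo systemctl enable --now fstrim.timer
systemctl status fstrim.timer   # should show the timer as active
```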
## File System {#file-system}
@ -94,7 +94,7 @@ Use at least a 10 GB network, if possible. 1 Gb will also work, but it will be m
## Huge Pages {#huge-pages}
If you are using old Linux kernel, disable transparent huge pages. It interferes with memory allocators, which leads to significant performance degradation.
If you are using old Linux kernel, disable transparent huge pages. It interferes with memory allocator, which leads to significant performance degradation.
On newer Linux kernels transparent huge pages are alright.
``` bash
@ -107,7 +107,7 @@ If you are using OpenStack, set
```
cpu_mode=host-passthrough
```
in nova.conf.
in `nova.conf`.
If you are using libvirt, set
```
@ -136,7 +136,7 @@ Do not change `minSessionTimeout` setting, large values may affect ClickHouse re
With the default settings, ZooKeeper is a time bomb:
> The ZooKeeper server wont delete files from old snapshots and logs when using the default configuration (see autopurge), and this is the responsibility of the operator.
> The ZooKeeper server wont delete files from old snapshots and logs when using the default configuration (see `autopurge`), and this is the responsibility of the operator.
This bomb must be defused.
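A sketch of the corresponding `zoo.cfg` settings (the retention values are illustrative, not a recommendation from this guide):
```bash
cat >> /etc/zookeeper/conf/zoo.cfg <<'EOF'
autopurge.snapRetainCount=3
autopurge.purgeInterval=1
EOF
```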
@ -241,7 +241,7 @@ JAVA_OPTS="-Xms{{ '{{' }} cluster.get('xms','128M') {{ '}}' }} \
-XX:MaxGCPauseMillis=50"
```
Salt init:
Salt initialization:
``` text
description "zookeeper-{{ '{{' }} cluster['name'] {{ '}}' }} centralized coordination service"

View File

@ -3,7 +3,7 @@ sidebar_position: 46
sidebar_label: Troubleshooting
---
# Troubleshooting
# Troubleshooting
- [Installation](#troubleshooting-installation-errors)
- [Connecting to the server](#troubleshooting-accepts-no-connections)
@ -26,7 +26,7 @@ Possible issues:
### Server Is Not Running {#server-is-not-running}
**Check if server is runnnig**
**Check if server is running**
Command:

View File

@ -4,7 +4,7 @@ sidebar_label: H3 Indexes
# Functions for Working with H3 Indexes
[H3](https://eng.uber.com/h3/) is a geographical indexing system where Earths surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be splitted into seven even but smaller ones ("children"), and so on.
[H3](https://eng.uber.com/h3/) is a geographical indexing system where Earths surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be split into seven even but smaller ones ("children"), and so on.
The level of the hierarchy is called `resolution` and can receive a value from `0` till `15`, where `0` is the `base` level with the largest and coarsest cells.
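For instance, a point can be indexed at a given resolution with `geoToH3` (a sketch; the coordinates are arbitrary):
```bash
# geoToH3 takes (lon, lat, resolution) and returns the UInt64 H3 index.
clickhouse-client --query "SELECT geoToH3(37.79506683, 55.71290588, 10)"
```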
@ -1398,4 +1398,4 @@ Result:
│ [(37.42012867767779,-122.03773496427027),(37.33755608435299,-122.090428929044)] │
└─────────────────────────────────────────────────────────────────────────────────┘
```
[Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) <!--hide-->
[Original article](https://clickhouse.com/docs/en/sql-reference/functions/geo/h3) <!--hide-->

View File

@ -174,22 +174,24 @@ Result:
Creating `test_function_sum_json` with named arguments and format [JSONEachRow](../../interfaces/formats.md#jsoneachrow) using XML configuration.
File test_function.xml.
```xml
<function>
<type>executable</type>
<name>test_function_sum_json</name>
<return_type>UInt64</return_type>
<return_name>result_name</return_name>
<argument>
<type>UInt64</type>
<name>argument_1</name>
</argument>
<argument>
<type>UInt64</type>
<name>argument_2</name>
</argument>
<format>JSONEachRow</format>
<command>test_function_sum_json.py</command>
</function>
<functions>
<function>
<type>executable</type>
<name>test_function_sum_json</name>
<return_type>UInt64</return_type>
<return_name>result_name</return_name>
<argument>
<type>UInt64</type>
<name>argument_1</name>
</argument>
<argument>
<type>UInt64</type>
<name>argument_2</name>
</argument>
<format>JSONEachRow</format>
<command>test_function_sum_json.py</command>
</function>
</functions>
```
Script file inside `user_scripts` folder `test_function_sum_json.py`.
@ -224,6 +226,50 @@ Result:
└──────────────────────────────┘
```
Executable user defined functions can take constant parameters configured in `command` setting (works only for user defined functions with `executable` type).
File test_function_parameter_python.xml.
```xml
<functions>
<function>
<type>executable</type>
<name>test_function_parameter_python</name>
<return_type>String</return_type>
<argument>
<type>UInt64</type>
</argument>
<format>TabSeparated</format>
<command>test_function_parameter_python.py {test_parameter:UInt64}</command>
</function>
</functions>
```
Script file inside `user_scripts` folder `test_function_parameter_python.py`.
```python
#!/usr/bin/python3
import sys
if __name__ == "__main__":
for line in sys.stdin:
print("Parameter " + str(sys.argv[1]) + " value " + str(line), end="")
sys.stdout.flush()
```
Query:
``` sql
SELECT test_function_parameter_python(1)(2);
```
Result:
``` text
┌─test_function_parameter_python(1)(2)─┐
│ Parameter 1 value 2 │
└──────────────────────────────────────┘
```
## Error Handling
Some functions might throw an exception if the data is invalid. In this case, the query is canceled and an error text is returned to the client. For distributed processing, when an exception occurs on one of the servers, the other servers also attempt to abort the query.

View File

@ -32,7 +32,7 @@ Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` dat
Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers.
The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. Remember about [numeric convertions issues](#numeric-conversion-issues), when using the functions.
The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions.
**Example**
@ -131,7 +131,7 @@ Integer value in the `UInt8`, `UInt16`, `UInt32`, `UInt64` or `UInt256` data typ
Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers.
The behavior of functions for negative agruments and for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric convertions issues](#numeric-conversion-issues), when using the functions.
The behavior of functions for negative arguments and for the [NaN and Inf](../../sql-reference/data-types/float.md#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions.
**Example**
@ -689,7 +689,7 @@ x::t
- Converted value.
:::note
:::note
If the input value does not fit the bounds of the target type, the result overflows. For example, `CAST(-1, 'UInt8')` returns `255`.
:::
@ -1433,7 +1433,7 @@ Result:
Converts a `DateTime64` to an `Int64` value with fixed sub-second precision. The input value is scaled up or down appropriately depending on its precision.
:::note
:::note
The output value is a timestamp in UTC, not in the timezone of `DateTime64`.
:::
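For example, with one of the functions in this family (assuming `toUnixTimestamp64Milli`; the timestamp literal is arbitrary):
```bash
# The returned Int64 counts milliseconds since the Unix epoch in UTC,
# regardless of the time zone attached to the DateTime64 value.
clickhouse-client --query "SELECT toUnixTimestamp64Milli(toDateTime64('2019-09-16 19:20:12.345', 3, 'Europe/Moscow'))"
```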

View File

@ -38,7 +38,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
При слиянии `ReplacingMergeTree` оставляет только строку для каждого уникального ключа сортировки:
- Последнюю в выборке, если `ver` не задан. Под выборкой здесь понимается набор строк в наборе кусков данных, участвующих в слиянии. Последний по времени создания кусок (последняя вставка) будет последним в выборке. Таким образом, после дедупликации для каждого значения ключа сортировки останется самая последняя строка из самой последней вставки.
- С максимальной версией, если `ver` задан.
- С максимальной версией, если `ver` задан. Если `ver` одинаковый у нескольких строк, то для них используется правило -- если `ver` не задан, т.е. в результате слияния останется самая последняя строка из самой последней вставки.
**Секции запроса**

View File

@ -41,6 +41,8 @@ sidebar_label: "Клиентские библиотеки от сторонни
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)

View File

@ -3,23 +3,21 @@ sidebar_position: 66
sidebar_label: ClickHouse Keeper
---
# [пре-продакшн] ClickHouse Keeper {#clickHouse-keeper}
# ClickHouse Keeper {#clickHouse-keeper}
Сервер ClickHouse использует сервис координации [ZooKeeper](https://zookeeper.apache.org/) для [репликации](../engines/table-engines/mergetree-family/replication.md) данных и выполнения [распределенных DDL запросов](../sql-reference/distributed-ddl.md). ClickHouse Keeper — это альтернативный сервис координации, совместимый с ZooKeeper.
:::danger "Предупреждение"
ClickHouse Keeper находится в стадии пре-продакшн и тестируется в CI ClickHouse и на нескольких внутренних инсталляциях.
## Детали реализации {#implementation-details}
ZooKeeper — один из первых широко известных сервисов координации с открытым исходным кодом. Он реализован на языке программирования Java, имеет достаточно простую и мощную модель данных. Алгоритм координации Zookeeper называется ZAB (ZooKeeper Atomic Broadcast). Он не гарантирует линеаризуемость операций чтения, поскольку каждый узел ZooKeeper обслуживает чтения локально. В отличие от ZooKeeper, ClickHouse Keeper реализован на C++ и использует алгоритм [RAFT](https://raft.github.io/), [реализация](https://github.com/eBay/NuRaft). Этот алгоритм позволяет достичь линеаризуемости чтения и записи, имеет несколько реализаций с открытым исходным кодом на разных языках.
По умолчанию ClickHouse Keeper предоставляет те же гарантии, что и ZooKeeper (линеаризуемость записей, последовательная согласованность чтений). У него есть совместимый клиент-серверный протокол, поэтому любой стандартный клиент ZooKeeper может использоваться для взаимодействия с ClickHouse Keeper. Снэпшоты и журналы имеют несовместимый с ZooKeeper формат, однако можно конвертировать данные Zookeeper в снэпшот ClickHouse Keeper с помощью `clickhouse-keeper-converter`. Межсерверный протокол ClickHouse Keeper также несовместим с ZooKeeper, поэтому создание смешанного кластера ZooKeeper / ClickHouse Keeper невозможно.
По умолчанию ClickHouse Keeper предоставляет те же гарантии, что и ZooKeeper (линеаризуемость записей, нелинеаризуемость чтений). ClickHouse Keeper предоставляет совместимый клиент-серверный протокол, поэтому любой стандартный клиент ZooKeeper может использоваться для взаимодействия с ClickHouse Keeper. Снэпшоты и журналы имеют несовместимый с ZooKeeper формат, однако можно конвертировать данные Zookeeper в снэпшот ClickHouse Keeper с помощью `clickhouse-keeper-converter`. Межсерверный протокол ClickHouse Keeper также несовместим с ZooKeeper, поэтому создание смешанного кластера ZooKeeper / ClickHouse Keeper невозможно.
Система управления доступом (ACL) ClickHouse Keeper реализована так же, как в [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper поддерживает тот же набор разрешений и идентичные схемы: `world`, `auth`, `digest`, `host` и `ip`. Digest для аутентификации использует пару значений `username:password`. Пароль кодируется в Base64.
Система управления доступом (ACL) ClickHouse Keeper реализована так же, как в [ZooKeeper](https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_ZooKeeperAccessControl). ClickHouse Keeper поддерживает тот же набор разрешений и идентичные схемы: `world`, `auth`, `digest`. Digest для аутентификации использует пару значений `username:password`. Пароль кодируется в Base64.
:::info "Примечание"
:::note
Внешние интеграции не поддерживаются.
:::
## Конфигурация {#configuration}
@ -27,34 +25,36 @@ ClickHouse Keeper может использоваться как равноце
- `tcp_port` — порт для подключения клиента (по умолчанию для ZooKeeper: `2181`).
- `tcp_port_secure` — зашифрованный порт для SSL-соединения между клиентом и сервером сервиса.
- `server_id` — уникальный идентификатор сервера, каждый участник кластера должен иметь уникальный номер&nbsp;(1,&nbsp;2,&nbsp;3&nbsp;и&nbsp;т.&nbsp;д.).
- `log_storage_path` — путь к журналам координации, лучше хранить их на незанятом устройстве (актуально и для ZooKeeper).
- `server_id` — уникальный идентификатор сервера, каждый участник кластера должен иметь уникальный номер (1, 2, 3 и т.д.).
- `log_storage_path` — путь к журналам координации, лучше хранить их на не нагруженном устройстве (актуально и для ZooKeeper).
- `snapshot_storage_path` — путь к снэпшотам координации.
Другие общие параметры наследуются из конфигурации сервера ClickHouse (`listen_host`, `logger`, и т. д.).
Настройки внутренней координации находятся в `<keeper_server>.<coordination_settings>`:
- `operation_timeout_ms` — максимальное время ожидания для одной клиентской операции в миллисекундах (по умолчанию: 10000).
- `session_timeout_ms` — максимальное время ожидания для клиентской сессии в миллисекундах (по умолчанию: 30000).
- `dead_session_check_period_ms` — частота, с которой ClickHouse Keeper проверяет мертвые сессии и удаляет их, в миллисекундах (по умолчанию: 500).
- `heart_beat_interval_ms` — частота, с которой узел-лидер ClickHouse Keeper отправляет хартбиты узлам-последователям, в миллисекундах (по умолчанию: 500).
- `election_timeout_lower_bound_ms` — время, после которого последователь может инициировать выборы лидера, если не получил от него сердцебиения (по умолчанию: 1000).
- `election_timeout_upper_bound_ms` — время, после которого последователь должен инициировать выборы лидера, если не получил от него сердцебиения (по умолчанию: 2000).
- `rotate_log_storage_interval` — количество записей в журнале координации для хранения в одном файле (по умолчанию: 100000).
- `reserved_log_items` — минимальное количество записей в журнале координации которые нужно сохранять после снятия снепшота (по умолчанию: 100000).
- `snapshot_distance` — частота, с которой ClickHouse Keeper делает новые снэпшоты (по количеству записей в журналах), в миллисекундах (по умолчанию: 100000).
- `snapshots_to_keep` — количество снэпшотов для сохранения (по умолчанию: 3).
- `stale_log_gap` — время, после которого лидер считает последователя устаревшим и отправляет ему снэпшот вместо журналов (по умолчанию: 10000).
- `fresh_log_gap` — максимальное отставание от лидера в количестве записей журнала после которого последователь считает себя не отстающим (по умолчанию: 200).
- `max_requests_batch_size` — количество запросов на запись, которые будут сгруппированы в один перед отправкой через RAFT (по умолчанию: 100).
- `force_sync` — вызывать `fsync` при каждой записи в журнал координации (по умолчанию: true).
- `quorum_reads` — выполнять запросы чтения аналогично запросам записи через весь консенсус RAFT с негативным эффектом на производительность и размер журналов (по умолчанию: false).
- `raft_logs_level` — уровень логгирования сообщений в текстовый лог (trace, debug и т. д.) (по умолчанию: information).
- `auto_forwarding` — разрешить пересылку запросов на запись от последователей лидеру (по умолчанию: true).
- `shutdown_timeout` — время ожидания завершения внутренних подключений и выключения, в миллисекундах (по умолчанию: 5000).
- `dead_session_check_period_ms` — частота, с которой ClickHouse Keeper проверяет мертвые сессии и удаляет их, в миллисекундах (по умолчанию: 500).
- `election_timeout_lower_bound_ms` — время, после которого последователь может инициировать перевыбор лидера, если не получил от него контрольный сигнал (по умолчанию: 1000).
- `election_timeout_upper_bound_ms` — время, после которого последователь должен инициировать перевыбор лидера, если не получил от него контрольный сигнал (по умолчанию: 2000).
- `force_sync` — вызывать `fsync` при каждой записи в журнал координации (по умолчанию: true).
- `four_letter_word_white_list` — список разрешенных 4-х буквенных команд (по умолчанию: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchc,wchs,dirs,mntr,isro").
- `fresh_log_gap` — минимальное отставание от лидера в количестве записей журнала после которого последователь считает себя актуальным (по умолчанию: 200).
- `heart_beat_interval_ms` — частота, с которой узел-лидер ClickHouse Keeper отправляет контрольные сигналы узлам-последователям, в миллисекундах (по умолчанию: 500).
- `max_requests_batch_size` — количество запросов на запись, которые будут сгруппированы в один перед отправкой через RAFT (по умолчанию: 100).
- `min_session_timeout_ms` — Min timeout for client session (ms) (default: 10000).
- `operation_timeout_ms` — максимальное время ожидания для одной клиентской операции в миллисекундах (по умолчанию: 10000).
- `quorum_reads` — выполнять запросы чтения аналогично запросам записи через консенсус RAFT (по умолчанию: false).
- `raft_logs_level` — уровень логгирования сообщений в текстовый лог (trace, debug и т. д.) (по умолчанию: default).
- `reserved_log_items` — минимальное количество записей в журнале координации которые нужно сохранять после снятия снепшота (по умолчанию: 100000).
- `rotate_log_storage_interval` — количество записей в журнале координации для хранения в одном файле (по умолчанию: 100000).
- `session_timeout_ms` — максимальное время ожидания для клиентской сессии в миллисекундах (по умолчанию: 30000).
- `shutdown_timeout` — время ожидания завершения внутренних подключений при выключении, в миллисекундах (по умолчанию: 5000).
- `snapshot_distance` — частота, с которой ClickHouse Keeper делает новые снэпшоты (по количеству записей в журналах) (по умолчанию: 100000).
- `snapshots_to_keep` — количество снэпшотов для хранения (по умолчанию: 3).
- `stale_log_gap` — время, после которого лидер считает последователя отставшим и отправляет ему снэпшот вместо журналов (по умолчанию: 10000).
- `startup_timeout` — время отключения сервера, если он не подключается к другим участникам кворума, в миллисекундах (по умолчанию: 30000).
- `four_letter_word_allow_list` — список разрешенных 4-х буквенных команд (по умолчанию: "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro").
Конфигурация кворума находится в `<keeper_server>.<raft_configuration>` и содержит описание серверов.
@ -67,6 +67,10 @@ ClickHouse Keeper может использоваться как равноце
- `port` — порт, на котором серверу доступны соединения для внутренней коммуникации.
:::note
В случае изменения топологии кластера ClickHouse Keeper (например, замены сервера), удостоверьтесь, что вы сохраняете отношение `server_id` - `hostname`, не переиспользуете существующие `server_id` для новых серверов и не перемешиваете идентификаторы. Подобные ошибки могут случаться, если вы используете автоматизацию при разворачивании кластера без логики сохранения идентификаторов.
:::
Примеры конфигурации кворума с тремя узлами можно найти в [интеграционных тестах](https://github.com/ClickHouse/ClickHouse/tree/master/tests/integration) с префиксом `test_keeper_`. Пример конфигурации для сервера №1:
```xml
@ -314,4 +318,31 @@ clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --
4. Скопируйте снэпшот на узлы сервера ClickHouse с настроенным `keeper` или запустите ClickHouse Keeper вместо ZooKeeper. Снэпшот должен сохраняться на всех узлах: в противном случае пустые узлы могут захватить лидерство и сконвертированные данные могут быть отброшены на старте.
## Восстановление после потери кворума
Так как ClickHouse Keeper основан на протоколе Raft, он может оставаться работоспособным при отказе определенного количества нод в зависимости от размера кластера.
Например, для кластера из 3 нод, алгоритм кворума продолжает работать при отказе не более чем одной ноды.
Конфигурация кластера может быть изменена динамически с некоторыми ограничениями.
Переконфигурация также использует Raft, поэтому для добавления новой ноды кластера или исключения старой ноды из него требуется достижение кворума в рамках текущей конфигурации кластера.
Если в вашем кластере произошел отказ большего числа нод, чем допускает Raft для вашей текущей конфигурации и у вас нет возможности восстановить их работоспособность, Raft перестанет работать и не позволит изменить конфигурацию стандартным механизмом.
Тем не менее ClickHouse Keeper имеет возможность запуститься в режиме восстановления, который позволяет переконфигурировать кластер, используя только одну ноду кластера.
Этот механизм может использоваться только как крайняя мера, когда вы не можете восстановить существующие ноды кластера или запустить новый сервер с тем же идентификатором.
Важно:
- Удостоверьтесь, что отказавшие ноды не смогут подключиться к кластеру в будущем.
- Не запускайте новые ноды, пока не завершите процедуру ниже.
После того, как вы выполнили описанные выше действия, выполните следующие шаги.
1. Выберите одну ноду Keeper, которая станет новым лидером. Учтите, что данные этой ноды будут использованы всем кластером, поэтому рекомендуется выбрать ноду с наиболее актуальным состоянием.
2. Перед дальнейшими действиями сделайте резервную копию данных из директорий `log_storage_path` и `snapshot_storage_path`.
3. Измените настройки на всех нодах кластера, которые вы собираетесь использовать.
4. Отправьте команду `rcvr` на ноду, которую вы выбрали или остановите ее и запустите заново с аргументом `--force-recovery`. Это переведет ноду в режим восстановления.
5. Запускайте остальные ноды кластера по одной и проверяйте, что команда `mntr` возвращает `follower` в выводе состояния `zk_server_state` перед тем, как запустить следующую ноду.
6. Пока нода работает в режиме восстановления, лидер будет возвращать ошибку на запрос `mntr`, пока кворум не будет достигнут с помощью новых нод. Любые запросы от клиентов и последователей будут возвращать ошибку.
7. После достижения кворума лидер перейдет в нормальный режим работы и станет обрабатывать все запросы через Raft. Удостоверьтесь, что запрос `mntr` возвращает `leader` в выводе состояния `zk_server_state`.
[Original article](https://clickhouse.com/docs/en/operations/clickhouse-keeper/) <!--hide-->

View File

@ -174,22 +174,24 @@ SELECT test_function_sum(2, 2);
Создание `test_function_sum_json` с именноваными аргументами и форматом [JSONEachRow](../../interfaces/formats.md#jsoneachrow) с использованием конфигурации XML.
Файл test_function.xml.
```xml
<function>
<type>executable</type>
<name>test_function_sum_json</name>
<return_type>UInt64</return_type>
<return_name>result_name</return_name>
<argument>
<type>UInt64</type>
<name>argument_1</name>
</argument>
<argument>
<type>UInt64</type>
<name>argument_2</name>
</argument>
<format>JSONEachRow</format>
<command>test_function_sum_json.py</command>
</function>
<functions>
<function>
<type>executable</type>
<name>test_function_sum_json</name>
<return_type>UInt64</return_type>
<return_name>result_name</return_name>
<argument>
<type>UInt64</type>
<name>argument_1</name>
</argument>
<argument>
<type>UInt64</type>
<name>argument_2</name>
</argument>
<format>JSONEachRow</format>
<command>test_function_sum_json.py</command>
</function>
</functions>
```
Файл скрипта внутри папки `user_scripts` `test_function_sum_json.py`.
@ -224,6 +226,50 @@ SELECT test_function_sum_json(2, 2);
└──────────────────────────────┘
```
Исполняемые пользовательские функции могут принимать константные параметры, их конфигурация является частью настройки `command` (работает только для пользовательских функций с типом `executable`).
Файл test_function_parameter_python.xml.
```xml
<functions>
<function>
<type>executable</type>
<name>test_function_parameter_python</name>
<return_type>String</return_type>
<argument>
<type>UInt64</type>
</argument>
<format>TabSeparated</format>
<command>test_function_parameter_python.py {test_parameter:UInt64}</command>
</function>
</functions>
```
Файл скрипта внутри папки `user_scripts` `test_function_parameter_python.py`.
```python
#!/usr/bin/python3
import sys
if __name__ == "__main__":
for line in sys.stdin:
print("Parameter " + str(sys.argv[1]) + " value " + str(line), end="")
sys.stdout.flush()
```
Query:
``` sql
SELECT test_function_parameter_python(1)(2);
```
Result:
``` text
┌─test_function_parameter_python(1)(2)─┐
│ Parameter 1 value 2 │
└──────────────────────────────────────┘
```
## Обработка ошибок {#obrabotka-oshibok}
Некоторые функции могут кидать исключения в случае ошибочных данных. В этом случае, выполнение запроса прерывается, и текст ошибки выводится клиенту. При распределённой обработке запроса, при возникновении исключения на одном из серверов, на другие серверы пытается отправиться просьба тоже прервать выполнение запроса.

View File

@ -45,7 +45,7 @@ CHECK TABLE test_table;
└───────────┴───────────┴─────────┘
```
Если `check_query_single_value_result` = 0, запрос `CHECK TABLE` возвращает статус таблицы в целом.
Если `check_query_single_value_result` = 1, запрос `CHECK TABLE` возвращает статус таблицы в целом.
```sql
SET check_query_single_value_result = 1;

View File

@ -41,6 +41,10 @@ Yandex**没有**维护下面列出的库,也没有做过任何广泛的测试
- Ruby
- [ClickHouse (Ruby)](https://github.com/shlima/click_house)
- [clickhouse-activerecord](https://github.com/PNixx/clickhouse-activerecord)
- Rust
- [clickhouse.rs](https://github.com/loyd/clickhouse.rs)
- [clickhouse-rs](https://github.com/suharev7/clickhouse-rs)
- [Klickhouse](https://github.com/Protryon/klickhouse)
- R
- [clickhouse-r](https://github.com/hannesmuehleisen/clickhouse-r)
- [RClickHouse](https://github.com/IMSMWU/RClickHouse)

View File

@ -5,7 +5,7 @@
#include <sys/stat.h>
#include <pwd.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <syscall.h>
#include <linux/capability.h>
#endif
@ -789,7 +789,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)
* then attempt to run this file will end up with a cryptic "Operation not permitted" message.
*/
#if defined(__linux__)
#if defined(OS_LINUX)
fmt::print("Setting capabilities for clickhouse binary. This is optional.\n");
std::string command = fmt::format("command -v setcap >/dev/null"
" && command -v capsh >/dev/null"

View File

@ -2,7 +2,7 @@
#include <csetjmp>
#include <unistd.h>
#ifdef __linux__
#ifdef OS_LINUX
#include <sys/mman.h>
#endif
@ -339,7 +339,7 @@ struct Checker
checkRequiredInstructions();
}
} checker
#ifndef __APPLE__
#ifndef OS_DARWIN
__attribute__((init_priority(101))) /// Run before other static initializers.
#endif
;

View File

@ -11,7 +11,7 @@
#include <pcg_random.hpp>
#include <Common/thread_local_rng.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD)
#include <malloc.h>
#endif

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
/*
* Copyright 2012-present Facebook, Inc.

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
/*
* Copyright 2012-present Facebook, Inc.

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <Common/Elf.h>
#include <Common/Exception.h>

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <IO/MMapReadBufferFromFile.h>

View File

@ -630,6 +630,7 @@
M(659, UNKNOWN_STATUS_OF_TRANSACTION) \
M(660, HDFS_ERROR) \
M(661, CANNOT_SEND_SIGNAL) \
M(662, FS_METADATA_ERROR) \
\
M(999, KEEPER_EXCEPTION) \
M(1000, POCO_EXCEPTION) \

View File

@ -218,7 +218,7 @@ static void getNoSpaceLeftInfoMessage(std::filesystem::path path, String & msg)
formatReadableQuantity(fs.f_favail),
mount_point);
#if defined(__linux__)
#if defined(OS_LINUX)
msg += "\nFilesystem: " + getFilesystemName(mount_point);
#endif
}
@ -230,7 +230,7 @@ static void getNoSpaceLeftInfoMessage(std::filesystem::path path, String & msg)
*/
static void getNotEnoughMemoryMessage(std::string & msg)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
static constexpr size_t buf_size = 1024;
@ -261,7 +261,7 @@ static void getNotEnoughMemoryMessage(std::string & msg)
}
}
if (num_maps > max_map_count * 0.99)
if (num_maps > max_map_count * 0.90)
{
msg += fmt::format(
"\nIt looks like that the process is near the limit on number of virtual memory mappings."

View File

@ -30,6 +30,11 @@ namespace
}
}
static bool isQueryInitialized()
{
return CurrentThread::isInitialized() && CurrentThread::get().getQueryContext() && CurrentThread::getQueryId().size != 0;
}
IFileCache::IFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_)
@ -37,6 +42,7 @@ IFileCache::IFileCache(
, max_size(cache_settings_.max_size)
, max_element_size(cache_settings_.max_elements)
, max_file_segment_size(cache_settings_.max_file_segment_size)
, enable_filesystem_query_cache_limit(cache_settings_.enable_filesystem_query_cache_limit)
{
}
@ -59,9 +65,7 @@ String IFileCache::getPathInLocalCache(const Key & key)
bool IFileCache::isReadOnly()
{
return !CurrentThread::isInitialized()
|| !CurrentThread::get().getQueryContext()
|| CurrentThread::getQueryId().size == 0;
return (!isQueryInitialized());
}
void IFileCache::assertInitialized() const
@ -70,6 +74,73 @@ void IFileCache::assertInitialized() const
throw Exception(ErrorCodes::REMOTE_FS_OBJECT_CACHE_ERROR, "Cache not initialized");
}
IFileCache::QueryContextPtr IFileCache::getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock)
{
if (!isQueryInitialized())
return nullptr;
return getQueryContext(CurrentThread::getQueryId().toString(), cache_lock);
}
IFileCache::QueryContextPtr IFileCache::getQueryContext(const String & query_id, std::lock_guard<std::mutex> &)
{
auto query_iter = query_map.find(query_id);
return (query_iter == query_map.end()) ? nullptr : query_iter->second;
}
void IFileCache::removeQueryContext(const String & query_id)
{
std::lock_guard cache_lock(mutex);
auto query_iter = query_map.find(query_id);
if (query_iter == query_map.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to release query context that does not exist");
query_map.erase(query_iter);
}
IFileCache::QueryContextPtr IFileCache::getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> & cache_lock)
{
if (query_id.empty())
return nullptr;
auto context = getQueryContext(query_id, cache_lock);
if (!context)
{
auto query_iter = query_map.insert({query_id, std::make_shared<QueryContext>(settings.max_query_cache_size, settings.skip_download_if_exceeds_query_cache)}).first;
context = query_iter->second;
}
return context;
}
IFileCache::QueryContextHolder IFileCache::getQueryContextHolder(const String & query_id, const ReadSettings & settings)
{
std::lock_guard cache_lock(mutex);
/// If enable_filesystem_query_cache_limit is true and max_query_cache_size is greater than zero,
/// we create a query context for the current query.
if (enable_filesystem_query_cache_limit && settings.max_query_cache_size)
{
auto context = getOrSetQueryContext(query_id, settings, cache_lock);
return QueryContextHolder(query_id, this, context);
}
else
return QueryContextHolder();
}
IFileCache::QueryContextHolder::QueryContextHolder(const String & query_id_, IFileCache * cache_, IFileCache::QueryContextPtr context_)
: query_id(query_id_), cache(cache_), context(context_)
{
}
IFileCache::QueryContextHolder::~QueryContextHolder()
{
/// If only the query_map and the current holder hold the context_query,
/// the query has been completed and the query_context is released.
if (context && context.use_count() == 2)
cache->removeQueryContext(query_id);
}
LRUFileCache::LRUFileCache(const String & cache_base_path_, const FileCacheSettings & cache_settings_)
: IFileCache(cache_base_path_, cache_settings_)
, max_stash_element_size(cache_settings_.max_elements)
@ -480,8 +551,170 @@ FileSegmentsHolder LRUFileCache::setDownloading(const Key & key, size_t offset,
return FileSegmentsHolder(std::move(file_segments));
}
bool LRUFileCache::tryReserve(
const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
bool LRUFileCache::tryReserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
auto query_context = enable_filesystem_query_cache_limit ? getCurrentQueryContext(cache_lock) : nullptr;
/// If the context can be found, subsequent cache replacements are made through the Query context.
if (query_context)
{
auto res = tryReserveForQuery(key, offset, size, query_context, cache_lock);
switch (res)
{
case ReserveResult::FITS_IN_QUERY_LIMIT_AND_RESERVATION_COMPLETED :
{
/// When the maximum cache size of the query is reached, the cache will be
/// evicted from the history cache accessed by the current query.
return true;
}
case ReserveResult::EXCEEDS_QUERY_LIMIT :
{
/// The query currently does not have enough space to reserve.
/// It returns false and reads data directly from the remote fs.
return false;
}
case ReserveResult::FITS_IN_QUERY_LIMIT_NEED_RESERVE_FROM_MAIN_LIST :
{
/// When the maximum cache capacity of the request is not reached, the cache
/// block is evicted from the main LRU queue.
return tryReserveForMainList(key, offset, size, query_context, cache_lock);
}
}
__builtin_unreachable();
}
else
{
return tryReserveForMainList(key, offset, size, query_context, cache_lock);
}
}
LRUFileCache::ReserveResult LRUFileCache::tryReserveForQuery(const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard<std::mutex> & cache_lock)
{
/// The maximum cache capacity of the request is not reached, thus the
/// cache block is evicted from the main LRU queue by tryReserveForMainList().
if (query_context->getCacheSize() + size <= query_context->getMaxCacheSize())
{
return ReserveResult::FITS_IN_QUERY_LIMIT_NEED_RESERVE_FROM_MAIN_LIST;
}
/// When skip_download_if_exceeds_query_cache is true, there is no need
/// to evict old data, skip the cache and read directly from remote fs.
else if (query_context->isSkipDownloadIfExceed())
{
return ReserveResult::EXCEEDS_QUERY_LIMIT;
}
/// The maximum cache size of the query is reached, the cache will be
/// evicted from the history cache accessed by the current query.
else
{
size_t removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
auto * cell_for_reserve = getCell(key, offset, cache_lock);
std::vector<IFileCache::LRUQueue::Iterator> ghost;
std::vector<FileSegmentCell *> trash;
std::vector<FileSegmentCell *> to_evict;
auto is_overflow = [&]
{
return (max_size != 0 && queue.getTotalWeight(cache_lock) + size - removed_size > max_size)
|| (max_element_size != 0 && queue_size > max_element_size)
|| (query_context->getCacheSize() + size - removed_size > query_context->getMaxCacheSize());
};
/// Select the cache from the LRU queue held by query for expulsion.
for (auto iter = query_context->queue().begin(); iter != query_context->queue().end(); iter++)
{
if (!is_overflow())
break;
auto * cell = getCell(iter->key, iter->offset, cache_lock);
if (!cell)
{
/// The cache corresponding to this record may be swapped out by
/// other queries, so it has become invalid.
ghost.push_back(iter);
removed_size += iter->size;
}
else
{
size_t cell_size = cell->size();
assert(iter->size == cell_size);
if (cell->releasable())
{
auto & file_segment = cell->file_segment;
std::lock_guard segment_lock(file_segment->mutex);
switch (file_segment->download_state)
{
case FileSegment::State::DOWNLOADED:
{
to_evict.push_back(cell);
break;
}
default:
{
trash.push_back(cell);
break;
}
}
removed_size += cell_size;
--queue_size;
}
}
}
assert(trash.empty());
for (auto & cell : trash)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
query_context->remove(file_segment->key(), file_segment->offset(), cell->size(), cache_lock);
std::lock_guard segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
}
for (auto & iter : ghost)
query_context->remove(iter->key, iter->offset, iter->size, cache_lock);
if (is_overflow())
{
return ReserveResult::EXCEEDS_QUERY_LIMIT;
}
if (cell_for_reserve)
{
auto queue_iterator = cell_for_reserve->queue_iterator;
if (queue_iterator)
queue.incrementSize(*queue_iterator, size, cache_lock);
else
cell_for_reserve->queue_iterator = queue.add(key, offset, size, cache_lock);
}
for (auto & cell : to_evict)
{
auto file_segment = cell->file_segment;
if (file_segment)
{
query_context->remove(file_segment->key(), file_segment->offset(), cell->size(), cache_lock);
std::lock_guard<std::mutex> segment_lock(file_segment->mutex);
remove(file_segment->key(), file_segment->offset(), cache_lock, segment_lock);
}
}
query_context->reserve(key, offset, size, cache_lock);
return ReserveResult::FITS_IN_QUERY_LIMIT_NEED_RESERVE_FROM_MAIN_LIST;
}
}
bool LRUFileCache::tryReserveForMainList(
const Key & key, size_t offset, size_t size, QueryContextPtr query_context, std::lock_guard<std::mutex> & cache_lock)
{
auto removed_size = 0;
size_t queue_size = queue.getElementsNum(cache_lock);
@ -595,6 +828,9 @@ bool LRUFileCache::tryReserve(
if (queue.getTotalWeight(cache_lock) > (1ull << 63))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cache became inconsistent. There must be a bug");
if (query_context)
query_context->reserve(key, offset, size, cache_lock);
return true;
}
@ -616,13 +852,18 @@ void LRUFileCache::remove(const Key & key)
for (auto & [offset, cell] : offsets)
to_remove.push_back(&cell);
bool some_cells_were_skipped = false;
for (auto & cell : to_remove)
{
/// In the ordinary case we remove data from the cache when it's not used by anyone.
/// But if we have multiple replicated zero-copy tables on the same server,
/// it becomes possible to start removing something from the cache while it is used
/// by other "zero-copy" tables. That is why it's not an error.
if (!cell->releasable())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cannot remove file from cache because someone reads from it. File segment info: {}",
cell->file_segment->getInfoForLog());
{
some_cells_were_skipped = true;
continue;
}
auto file_segment = cell->file_segment;
if (file_segment)
@ -634,10 +875,13 @@ void LRUFileCache::remove(const Key & key)
auto key_path = getPathInLocalCache(key);
files.erase(key);
if (!some_cells_were_skipped)
{
files.erase(key);
if (fs::exists(key_path))
fs::remove(key_path);
if (fs::exists(key_path))
fs::remove(key_path);
}
}
void LRUFileCache::remove()
@ -844,7 +1088,6 @@ FileSegments LRUFileCache::getSnapshot() const
for (const auto & [offset, cell] : cells_by_offset)
file_segments.push_back(FileSegment::getSnapshot(cell.file_segment, cache_lock));
}
return file_segments;
}
@ -930,7 +1173,7 @@ LRUFileCache::FileSegmentCell::FileSegmentCell(
}
}
LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
IFileCache::LRUQueue::Iterator IFileCache::LRUQueue::add(
const IFileCache::Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & /* cache_lock */)
{
#ifndef NDEBUG
@ -948,30 +1191,30 @@ LRUFileCache::LRUQueue::Iterator LRUFileCache::LRUQueue::add(
return queue.insert(queue.end(), FileKeyAndOffset(key, offset, size));
}
void LRUFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::remove(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size -= queue_it->size;
queue.erase(queue_it);
}
void LRUFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::removeAll(std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.clear();
cache_size = 0;
}
void LRUFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::moveToEnd(Iterator queue_it, std::lock_guard<std::mutex> & /* cache_lock */)
{
queue.splice(queue.end(), queue, queue_it);
}
void LRUFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
void IFileCache::LRUQueue::incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & /* cache_lock */)
{
cache_size += size_increment;
queue_it->size += size_increment;
}
bool LRUFileCache::LRUQueue::contains(
bool IFileCache::LRUQueue::contains(
const IFileCache::Key & key, size_t offset, std::lock_guard<std::mutex> & /* cache_lock */) const
{
/// This method is used for assertions in debug mode.
@ -984,31 +1227,7 @@ bool LRUFileCache::LRUQueue::contains(
return false;
}
void LRUFileCache::LRUQueue::assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = cache->getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == cache_size);
assert(cache_size <= cache->max_size);
assert(queue.size() <= cache->max_element_size);
}
String LRUFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
String IFileCache::LRUQueue::toString(std::lock_guard<std::mutex> & /* cache_lock */) const
{
String result;
for (const auto & [key, offset, size, _] : queue)
@ -1057,14 +1276,38 @@ void LRUFileCache::assertCacheCellsCorrectness(
void LRUFileCache::assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock)
{
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
for (const auto & [key, cells_by_offset] : files)
assertCacheCellsCorrectness(files[key], cache_lock);
queue.assertCorrectness(this, cache_lock);
assertQueueCorrectness(cache_lock);
}
void LRUFileCache::assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock)
{
[[maybe_unused]] size_t total_size = 0;
for (auto it = queue.begin(); it != queue.end();)
{
auto & [key, offset, size, _] = *it++;
auto * cell = getCell(key, offset, cache_lock);
if (!cell)
{
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Cache is in inconsistent state: LRU queue contains entries with no cache cell (assertCorrectness())");
}
assert(cell->size() == size);
total_size += size;
}
assert(total_size == queue.getTotalWeight(cache_lock));
assert(queue.getTotalWeight(cache_lock) <= max_size);
assert(queue.getElementsNum(cache_lock) <= max_element_size);
}
}

View File

@ -12,6 +12,7 @@
#include <map>
#include "FileCache_fwd.h"
#include <IO/ReadSettings.h>
#include <Common/logger_useful.h>
#include <Common/FileSegment.h>
#include <Core/Types.h>
@ -20,6 +21,14 @@
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
class IFileCache;
using FileCachePtr = std::shared_ptr<IFileCache>;
/**
* Local cache for remote filesystem files, represented as a set of non-overlapping non-empty file segments.
*/
@ -106,58 +115,6 @@ protected:
mutable std::mutex mutex;
virtual bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) = 0;
virtual void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
virtual bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
/// If file segment was partially downloaded and then space reservation fails (because of no
/// space left), then update corresponding cache cell metadata (file segment size).
virtual void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
void assertInitialized() const;
};
using FileCachePtr = std::shared_ptr<IFileCache>;
class LRUFileCache final : public IFileCache
{
public:
LRUFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override;
FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override;
FileSegments getSnapshot() const override;
void initialize() override;
void remove(const Key & key) override;
void remove() override;
std::vector<String> tryGetCachePaths(const Key & key) override;
size_t getUsedCacheSize() const override;
size_t getFileSegmentsNum() const override;
private:
class LRUQueue
{
public:
@ -186,8 +143,6 @@ private:
/// Space reservation for a file segment is incremental, so we need to be able to increment the size of the queue entry.
void incrementSize(Iterator queue_it, size_t size_increment, std::lock_guard<std::mutex> & cache_lock);
void assertCorrectness(LRUFileCache * cache, std::lock_guard<std::mutex> & cache_lock);
String toString(std::lock_guard<std::mutex> & cache_lock) const;
bool contains(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock) const;
@ -203,6 +158,171 @@ private:
size_t cache_size = 0;
};
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
/// Used to track and control the cache accesses of each query.
/// Through it, the cache layer can apply different policies to different queries.
struct QueryContext
{
LRUQueue lru_queue;
AccessRecord records;
size_t cache_size = 0;
size_t max_cache_size;
bool skip_download_if_exceeds_query_cache;
QueryContext(size_t max_cache_size_, bool skip_download_if_exceeds_query_cache_)
: max_cache_size(max_cache_size_)
, skip_download_if_exceeds_query_cache(skip_download_if_exceeds_query_cache_) {}
void remove(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size < size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deleted cache size exceeds existing cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
{
lru_queue.remove(record->second, cache_lock);
records.erase({key, offset});
}
}
cache_size -= size;
}
void reserve(const Key & key, size_t offset, size_t size, std::lock_guard<std::mutex> & cache_lock)
{
if (cache_size + size > max_cache_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Reserved cache size exceeds the remaining cache size");
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record == records.end())
{
auto queue_iter = lru_queue.add(key, offset, 0, cache_lock);
record = records.insert({{key, offset}, queue_iter}).first;
}
record->second->size += size;
}
cache_size += size;
}
void use(const Key & key, size_t offset, std::lock_guard<std::mutex> & cache_lock)
{
if (!skip_download_if_exceeds_query_cache)
{
auto record = records.find({key, offset});
if (record != records.end())
lru_queue.moveToEnd(record->second, cache_lock);
}
}
size_t getMaxCacheSize() { return max_cache_size; }
size_t getCacheSize() { return cache_size; }
LRUQueue & queue() { return lru_queue; }
bool isSkipDownloadIfExceed() { return skip_download_if_exceeds_query_cache; }
};
using QueryContextPtr = std::shared_ptr<QueryContext>;
using QueryContextMap = std::unordered_map<String, QueryContextPtr>;
QueryContextMap query_map;
bool enable_filesystem_query_cache_limit;
QueryContextPtr getCurrentQueryContext(std::lock_guard<std::mutex> & cache_lock);
QueryContextPtr getQueryContext(const String & query_id, std::lock_guard<std::mutex> & cache_lock);
void removeQueryContext(const String & query_id);
QueryContextPtr getOrSetQueryContext(const String & query_id, const ReadSettings & settings, std::lock_guard<std::mutex> &);
virtual bool tryReserve(
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) = 0;
virtual void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
virtual bool isLastFileSegmentHolder(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
/// If file segment was partially downloaded and then space reservation fails (because of no
/// space left), then update corresponding cache cell metadata (file segment size).
virtual void reduceSizeToDownloaded(
const Key & key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
std::lock_guard<std::mutex> & segment_lock) = 0;
void assertInitialized() const;
public:
/// Saves query context information and lets the cache layer adopt
/// different cache policies for different queries.
struct QueryContextHolder : private boost::noncopyable
{
explicit QueryContextHolder(const String & query_id_, IFileCache * cache_, QueryContextPtr context_);
QueryContextHolder() = default;
~QueryContextHolder();
String query_id {};
IFileCache * cache = nullptr;
QueryContextPtr context = nullptr;
};
QueryContextHolder getQueryContextHolder(const String & query_id, const ReadSettings & settings);
};
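For orientation, a minimal standalone sketch (not ClickHouse code; all names below are simplified stand-ins) of the RAII pattern these declarations suggest: the holder registers a per-query context when the query starts and drops the bookkeeping when the holder is destroyed.

#include <memory>
#include <string>
#include <unordered_map>

struct QueryContext { size_t max_cache_size = 0; };
using QueryContextPtr = std::shared_ptr<QueryContext>;

class Cache
{
public:
    struct QueryContextHolder
    {
        QueryContextHolder(std::string query_id_, Cache * cache_, QueryContextPtr context_)
            : query_id(std::move(query_id_)), cache(cache_), context(std::move(context_)) {}

        ~QueryContextHolder()
        {
            /// Drop the per-query bookkeeping once the query is finished.
            if (cache && context)
                cache->query_map.erase(query_id);
        }

        std::string query_id;
        Cache * cache = nullptr;
        QueryContextPtr context;
    };

    QueryContextHolder getQueryContextHolder(const std::string & query_id, size_t max_cache_size)
    {
        auto context = std::make_shared<QueryContext>(QueryContext{max_cache_size});
        query_map.emplace(query_id, context);
        return {query_id, this, context};
    }

private:
    std::unordered_map<std::string, QueryContextPtr> query_map;
};

int main()
{
    Cache cache;
    auto holder = cache.getQueryContextHolder("query_id", 128 * 1024 * 1024);
    /// Reservations made while the holder is alive would be charged to this query.
}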
class LRUFileCache final : public IFileCache
{
public:
LRUFileCache(
const String & cache_base_path_,
const FileCacheSettings & cache_settings_);
FileSegmentsHolder getOrSet(const Key & key, size_t offset, size_t size) override;
FileSegmentsHolder get(const Key & key, size_t offset, size_t size) override;
FileSegments getSnapshot() const override;
void initialize() override;
void remove(const Key & key) override;
void remove() override;
std::vector<String> tryGetCachePaths(const Key & key) override;
size_t getUsedCacheSize() const override;
size_t getFileSegmentsNum() const override;
private:
struct FileSegmentCell : private boost::noncopyable
{
FileSegmentPtr file_segment;
@ -227,26 +347,22 @@ private:
using FileSegmentsByOffset = std::map<size_t, FileSegmentCell>;
using CachedFiles = std::unordered_map<Key, FileSegmentsByOffset>;
using AccessKeyAndOffset = std::pair<Key, size_t>;
struct KeyAndOffsetHash
{
std::size_t operator()(const AccessKeyAndOffset & key) const
{
return std::hash<UInt128>()(key.first) ^ std::hash<UInt64>()(key.second);
}
};
using AccessRecord = std::unordered_map<AccessKeyAndOffset, LRUQueue::Iterator, KeyAndOffsetHash>;
CachedFiles files;
LRUQueue queue;
LRUQueue stash_queue;
AccessRecord records;
size_t max_stash_element_size;
size_t enable_cache_hits_threshold;
enum class ReserveResult
{
FITS_IN_QUERY_LIMIT_AND_RESERVATION_COMPLETED,
EXCEEDS_QUERY_LIMIT,
FITS_IN_QUERY_LIMIT_NEED_RESERVE_FROM_MAIN_LIST,
};
Poco::Logger * log;
FileSegments getImpl(
@ -266,6 +382,17 @@ private:
const Key & key, size_t offset, size_t size,
std::lock_guard<std::mutex> & cache_lock) override;
bool tryReserveForMainList(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
/// Limits the maximum cache size for the current query.
LRUFileCache::ReserveResult tryReserveForQuery(
const Key & key, size_t offset, size_t size,
QueryContextPtr query_context,
std::lock_guard<std::mutex> & cache_lock);
void remove(
Key key, size_t offset,
std::lock_guard<std::mutex> & cache_lock,
@ -309,6 +436,8 @@ public:
void assertCacheCorrectness(const Key & key, std::lock_guard<std::mutex> & cache_lock);
void assertCacheCorrectness(std::lock_guard<std::mutex> & cache_lock);
void assertQueueCorrectness(std::lock_guard<std::mutex> & cache_lock);
};
}

View File

@ -11,6 +11,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
max_elements = config.getUInt64(config_prefix + ".data_cache_max_elements", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS);
max_file_segment_size = config.getUInt64(config_prefix + ".max_file_segment_size", REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE);
cache_on_write_operations = config.getUInt64(config_prefix + ".cache_on_write_operations", false);
enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);
}

View File

@ -13,6 +13,7 @@ struct FileCacheSettings
size_t max_elements = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_ELEMENTS;
size_t max_file_segment_size = REMOTE_FS_OBJECTS_CACHE_DEFAULT_MAX_FILE_SEGMENT_SIZE;
bool cache_on_write_operations = false;
bool enable_filesystem_query_cache_limit = false;
size_t enable_cache_hits_threshold = REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD;

View File

@ -200,6 +200,7 @@ private:
const Range segment_range;
State download_state;
String downloader_id;
RemoteFileReaderPtr remote_file_reader;

View File

@ -16,7 +16,7 @@ namespace ErrorCodes
extern const int CANNOT_UNBLOCK_SIGNAL;
}
#ifdef __APPLE__
#ifdef OS_DARWIN
// We only need to support timeout = {0, 0} at this moment
static int sigtimedwait(const sigset_t *set, siginfo_t *info, const struct timespec * /*timeout*/)
{

View File

@ -9,13 +9,13 @@ namespace ErrorCodes
extern const int SYNTAX_ERROR;
}
Int32 IntervalKind::toAvgSeconds() const
Float64 IntervalKind::toAvgSeconds() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond: return 0; /// fractional parts of seconds have 0 seconds
case IntervalKind::Nanosecond: return 0.000000001;
case IntervalKind::Microsecond: return 0.000001;
case IntervalKind::Millisecond: return 0.001;
case IntervalKind::Second: return 1;
case IntervalKind::Minute: return 60;
case IntervalKind::Hour: return 3600;
@ -28,6 +28,25 @@ Int32 IntervalKind::toAvgSeconds() const
__builtin_unreachable();
}
bool IntervalKind::isFixedLength() const
{
switch (kind)
{
case IntervalKind::Nanosecond:
case IntervalKind::Microsecond:
case IntervalKind::Millisecond:
case IntervalKind::Second:
case IntervalKind::Minute:
case IntervalKind::Hour:
case IntervalKind::Day:
case IntervalKind::Week: return true;
case IntervalKind::Month:
case IntervalKind::Quarter:
case IntervalKind::Year: return false;
}
__builtin_unreachable();
}
IntervalKind IntervalKind::fromAvgSeconds(Int64 num_seconds)
{
if (num_seconds)

View File

@ -31,12 +31,15 @@ struct IntervalKind
/// Returns number of seconds in one interval.
/// For `Month`, `Quarter` and `Year` the function returns an average number of seconds.
Int32 toAvgSeconds() const;
Float64 toAvgSeconds() const;
/// Chooses an interval kind based on number of seconds.
/// For example, `IntervalKind::fromAvgSeconds(3600)` returns `IntervalKind::Hour`.
static IntervalKind fromAvgSeconds(Int64 num_seconds);
/// Returns whether the IntervalKind has a fixed number of seconds (e.g. Day) or a non-fixed one (e.g. Month).
bool isFixedLength() const;
/// Returns an uppercased version of what `toString()` returns.
const char * toKeyword() const;
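For context, a self-contained sketch of why the return type moves from Int32 to Float64: with an integer return value the new sub-second kinds would truncate to zero. The enum below is a simplified stand-in, not the real IntervalKind; the values are the ones visible in the hunk above.

#include <cassert>

/// Stand-in for the real IntervalKind; only the kinds visible above are modelled.
enum class Kind { Millisecond, Second, Minute, Hour };

/// With the old Int32 return type the sub-second kinds truncated to 0;
/// Float64 keeps their fractional average length.
double toAvgSeconds(Kind kind)
{
    switch (kind)
    {
        case Kind::Millisecond: return 0.001;
        case Kind::Second: return 1;
        case Kind::Minute: return 60;
        case Kind::Hour: return 3600;
    }
    return 0;
}

int main()
{
    assert(toAvgSeconds(Kind::Millisecond) > 0); /// no longer truncated to zero
}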

View File

@ -27,7 +27,7 @@ void LazyPipeFDs::open()
if (fd >= 0)
throw Exception("Pipe is already opened", ErrorCodes::LOGICAL_ERROR);
#ifndef __APPLE__
#ifndef OS_DARWIN
if (0 != pipe2(fds_rw, O_CLOEXEC))
throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_PIPE);
#else

View File

@ -1,6 +1,6 @@
#include "ProcfsMetricsProvider.h"
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/Exception.h>
#include <IO/ReadBufferFromMemory.h>

View File

@ -4,7 +4,7 @@
#include <boost/noncopyable.hpp>
#if defined(__linux__)
#if defined(OS_LINUX)
struct taskstats;
namespace DB
@ -19,7 +19,7 @@ public:
/// Updates only a part of taskstats struct's fields:
/// - cpu_run_virtual_total, cpu_delay_total (when /proc/thread-self/schedstat is available)
/// - blkio_delay_total (when /proc/thread-self/stat is available)
/// - rchar, wchar, read_bytes, write_bytes (when /prod/thread-self/io is available)
/// - rchar, wchar, read_bytes, write_bytes (when /proc/thread-self/io is available)
/// See: man procfs
void getTaskStats(::taskstats & out_stats) const;

View File

@ -2,7 +2,7 @@
#include <string.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD)
#include <malloc.h>
#endif
#include <algorithm>

View File

@ -33,7 +33,7 @@ std::string signalToErrorMessage(int sig, const siginfo_t & info, [[maybe_unused
else
error << "Address: " << info.si_addr;
#if defined(__x86_64__) && !defined(__FreeBSD__) && !defined(__APPLE__) && !defined(__arm__) && !defined(__powerpc__)
#if defined(__x86_64__) && !defined(OS_FREEBSD) && !defined(OS_DARWIN) && !defined(__arm__) && !defined(__powerpc__)
auto err_mask = context.uc_mcontext.gregs[REG_ERR];
if ((err_mask & 0x02))
error << " Access: write.";
@ -173,18 +173,18 @@ static void * getCallerAddress(const ucontext_t & context)
{
#if defined(__x86_64__)
/// Get the address at the time the signal was raised from the RIP (x86-64)
# if defined(__FreeBSD__)
# if defined(OS_FREEBSD)
return reinterpret_cast<void *>(context.uc_mcontext.mc_rip);
# elif defined(__APPLE__)
# elif defined(OS_DARWIN)
return reinterpret_cast<void *>(context.uc_mcontext->__ss.__rip);
# else
return reinterpret_cast<void *>(context.uc_mcontext.gregs[REG_RIP]);
# endif
#elif defined(__APPLE__) && defined(__aarch64__)
#elif defined(OS_DARWIN) && defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext->__ss.__pc);
#elif defined(__FreeBSD__) && defined(__aarch64__)
#elif defined(OS_FREEBSD) && defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext.mc_gpregs.gp_elr);
#elif defined(__aarch64__)
return reinterpret_cast<void *>(context.uc_mcontext.pc);
@ -201,7 +201,7 @@ void StackTrace::symbolize(
const StackTrace::FramePointers & frame_pointers, [[maybe_unused]] size_t offset,
size_t size, StackTrace::Frames & frames)
{
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
auto symbol_index_ptr = DB::SymbolIndex::instance();
const DB::SymbolIndex & symbol_index = *symbol_index_ptr;
@ -332,7 +332,7 @@ static void toStringEveryLineImpl(
if (size == 0)
return callback("<Empty trace>");
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
auto symbol_index_ptr = DB::SymbolIndex::instance();
const DB::SymbolIndex & symbol_index = *symbol_index_ptr;
std::unordered_map<std::string, DB::Dwarf> dwarfs;

View File

@ -9,7 +9,7 @@
#include <functional>
#include <signal.h>
#ifdef __APPLE__
#ifdef OS_DARWIN
// ucontext is not available without _XOPEN_SOURCE
# ifdef __clang__
# pragma clang diagnostic ignored "-Wreserved-id-macro"

View File

@ -1,4 +1,4 @@
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <Common/SymbolIndex.h>
#include <Common/hex.h>

View File

@ -1,6 +1,6 @@
#pragma once
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
#include <vector>
#include <string>

View File

@ -1,6 +1,6 @@
#include "ThreadProfileEvents.h"
#if defined(__linux__)
#if defined(OS_LINUX)
#include "TaskStatsInfoGetter.h"
#include "ProcfsMetricsProvider.h"
@ -177,7 +177,7 @@ void TasksStatsCounters::incrementProfileEvents(const ::taskstats & prev, const
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
namespace DB
{

View File

@ -8,7 +8,7 @@
#include <Common/logger_useful.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <linux/taskstats.h>
#else
struct taskstats {};
@ -66,7 +66,7 @@ struct RUsageCounters
static RUsageCounters current()
{
::rusage rusage {};
#if !defined(__APPLE__)
#if !defined(OS_DARWIN)
#if defined(OS_SUNOS)
::getrusage(RUSAGE_LWP, &rusage);
#else
@ -102,7 +102,7 @@ private:
}
};
#if defined(__linux__)
#if defined(OS_LINUX)
struct PerfEventInfo
{
@ -171,7 +171,7 @@ extern PerfEventsCounters current_thread_counters;
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
class TasksStatsCounters
{

View File

@ -841,6 +841,21 @@ bool ZooKeeper::waitForDisappear(const std::string & path, const WaitCondition &
return false;
}
void ZooKeeper::waitForEphemeralToDisappearIfAny(const std::string & path)
{
zkutil::EventPtr eph_node_disappeared = std::make_shared<Poco::Event>();
String content;
if (!tryGet(path, content, nullptr, eph_node_disappeared))
return;
int32_t timeout_ms = 2 * session_timeout_ms;
if (!eph_node_disappeared->tryWait(timeout_ms))
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR,
"Ephemeral node {} still exists after {}s, probably it's owned by someone else. "
"Either session_timeout_ms in client's config is different from server's config or it's a bug. "
"Node data: '{}'", path, timeout_ms / 1000, content);
}
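As a rough illustration of the waiting pattern used by waitForEphemeralToDisappearIfAny (arm a watch, wait up to twice the session timeout, treat a surviving node as a logical error), here is a self-contained sketch built on standard-library primitives rather than the ZooKeeper client; all names and timings are stand-ins.

#include <chrono>
#include <condition_variable>
#include <mutex>
#include <stdexcept>
#include <thread>

int main()
{
    std::mutex mutex;
    std::condition_variable node_disappeared;
    bool disappeared = false;

    const auto session_timeout = std::chrono::milliseconds(100);

    /// Stand-in for the ZooKeeper watch firing when the ephemeral node goes away.
    std::thread watcher([&]
    {
        std::this_thread::sleep_for(session_timeout);
        std::lock_guard<std::mutex> lock(mutex);
        disappeared = true;
        node_disappeared.notify_one();
    });

    bool gone = false;
    {
        std::unique_lock<std::mutex> lock(mutex);
        /// Wait up to twice the session timeout, like the method above does.
        gone = node_disappeared.wait_for(lock, 2 * session_timeout, [&] { return disappeared; });
    }

    watcher.join();

    if (!gone)
        throw std::runtime_error("Ephemeral node still exists, probably owned by someone else");
}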
ZooKeeperPtr ZooKeeper::startNewSession() const
{
return std::make_shared<ZooKeeper>(hosts, identity, session_timeout_ms, operation_timeout_ms, chroot, implementation, zk_log, get_priority_load_balancing);

View File

@ -240,6 +240,10 @@ public:
/// The function returns true if waited and false if waiting was interrupted by condition.
bool waitForDisappear(const std::string & path, const WaitCondition & condition = {});
/// Wait for the ephemeral node created in the previous session to disappear.
/// Throws LOGICAL_ERROR if node still exists after 2x session_timeout.
void waitForEphemeralToDisappearIfAny(const std::string & path);
/// Async interface (a small subset of operations is implemented).
///
/// Usage:

View File

@ -9,7 +9,7 @@ void write(size_t x, WriteBuffer & out)
writeBinary(x, out);
}
#ifdef __APPLE__
#ifdef OS_DARWIN
void write(uint64_t x, WriteBuffer & out)
{
x = __builtin_bswap64(x);
@ -71,7 +71,7 @@ void write(const Error & x, WriteBuffer & out)
write(static_cast<int32_t>(x), out);
}
#ifdef __APPLE__
#ifdef OS_DARWIN
void read(uint64_t & x, ReadBuffer & in)
{
readBinary(x, in);

View File

@ -16,7 +16,7 @@ using namespace DB;
void write(size_t x, WriteBuffer & out);
/// uint64_t != size_t on darwin
#ifdef __APPLE__
#ifdef OS_DARWIN
void write(uint64_t x, WriteBuffer & out);
#endif
@ -45,7 +45,7 @@ void write(const std::vector<T> & arr, WriteBuffer & out)
}
void read(size_t & x, ReadBuffer & in);
#ifdef __APPLE__
#ifdef OS_DARWIN
void read(uint64_t & x, ReadBuffer & in);
#endif
void read(int64_t & x, ReadBuffer & in);

View File

@ -21,7 +21,7 @@ namespace ErrorCodes
}
#if defined(__linux__)
#if defined(OS_LINUX)
#include <unistd.h>
#include <fcntl.h>
@ -101,7 +101,7 @@ bool supportsAtomicRename()
}
#elif defined(__APPLE__)
#elif defined(OS_DARWIN)
// Includes
#include <dlfcn.h> // For dlsym

View File

@ -5,7 +5,7 @@
#include <pthread.h>
#include <cstdint>
#if defined(__FreeBSD__)
#if defined(OS_FREEBSD)
# include <pthread_np.h>
#endif
@ -48,7 +48,7 @@ size_t getStackSize(void ** out_address)
address = reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(pthread_get_stackaddr_np(thread)) - size);
#else
pthread_attr_t attr;
# if defined(__FreeBSD__) || defined(OS_SUNOS)
# if defined(OS_FREEBSD) || defined(OS_SUNOS)
pthread_attr_init(&attr);
if (0 != pthread_attr_get_np(pthread_self(), &attr))
throwFromErrno("Cannot pthread_attr_get_np", ErrorCodes::CANNOT_PTHREAD_ATTR);

View File

@ -16,7 +16,7 @@
static void setAffinity()
{
#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__sun)
#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) && !defined(__sun)
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(0, &mask);
@ -283,7 +283,7 @@ int main(int argc, char ** argv)
if (!method || method == 1) test<identity> (n, data.data(), "0: identity");
if (!method || method == 2) test<intHash32> (n, data.data(), "1: intHash32");
#if !defined(__APPLE__) /// The difference in size_t: unsigned long on Linux, unsigned long long on Mac OS.
#if !defined(OS_DARWIN) /// The difference in size_t: unsigned long on Linux, unsigned long long on Mac OS.
if (!method || method == 3) test<intHash64> (n, data.data(), "2: intHash64");
#endif
if (!method || method == 4) test<hash3> (n, data.data(), "3: two rounds");

View File

@ -1,4 +1,4 @@
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/ProcfsMetricsProvider.h>
#include <iostream>
@ -6,7 +6,7 @@
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
int main(int argc, char ** argv)
{
using namespace DB;

View File

@ -16,7 +16,7 @@ static NO_INLINE const void * getAddress()
int main(int argc, char ** argv)
{
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
using namespace DB;
if (argc < 2)

View File

@ -1,6 +1,6 @@
#include "filesystemHelpers.h"
#if defined(__linux__)
#if defined(OS_LINUX)
# include <cstdio>
# include <mntent.h>
# include <sys/sysmacros.h>
@ -62,12 +62,12 @@ std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path)
return std::make_unique<TemporaryFile>(path);
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path)
{
#if defined(__linux__)
#if defined(OS_LINUX)
struct stat sb;
if (lstat(path.c_str(), &sb))
throwFromErrnoWithPath("Cannot lstat " + path, path, ErrorCodes::CANNOT_STAT);
@ -79,12 +79,12 @@ String getBlockDeviceId([[maybe_unused]] const String & path)
#endif
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/rotational");
@ -101,12 +101,12 @@ BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id)
#endif
}
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id)
{
#if defined(__linux__)
#if defined(OS_LINUX)
try
{
ReadBufferFromFile in("/sys/dev/block/" + device_id + "/queue/read_ahead_kb");
@ -155,12 +155,12 @@ std::filesystem::path getMountPoint(std::filesystem::path absolute_path)
}
/// Returns name of filesystem mounted to mount_point
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getFilesystemName([[maybe_unused]] const String & mount_point)
{
#if defined(__linux__)
#if defined(OS_LINUX)
FILE * mounted_filesystems = setmntent("/etc/mtab", "r");
if (!mounted_filesystems)
throw DB::Exception("Cannot open /etc/mtab to get name of filesystem", ErrorCodes::SYSTEM_ERROR);

View File

@ -19,7 +19,7 @@ bool enoughSpaceInDirectory(const std::string & path, size_t data_size);
std::unique_ptr<TemporaryFile> createTemporaryFile(const std::string & path);
// Determine what block device is responsible for specified path
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getBlockDeviceId([[maybe_unused]] const String & path);
@ -32,13 +32,13 @@ enum class BlockDeviceType
};
// Try to determine block device type
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
BlockDeviceType getBlockDeviceType([[maybe_unused]] const String & device_id);
// Get size of read-ahead in bytes for specified block device
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id);
@ -47,7 +47,7 @@ UInt64 getBlockDeviceReadAheadBytes([[maybe_unused]] const String & device_id);
std::filesystem::path getMountPoint(std::filesystem::path absolute_path);
/// Returns name of filesystem mounted to mount_point
#if !defined(__linux__)
#if !defined(OS_LINUX)
[[noreturn]]
#endif
String getFilesystemName([[maybe_unused]] const String & mount_point);

View File

@ -11,7 +11,7 @@ int getCurrentProcessFDCount()
{
namespace fs = std::filesystem;
int result = -1;
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
using namespace DB;
Int32 pid = getpid();

View File

@ -1,6 +1,6 @@
#include <Common/getHashOfLoadedBinary.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <link.h>
#include <array>

View File

@ -1,7 +1,7 @@
#include "getMappedArea.h"
#include <Common/Exception.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <Common/StringUtils/StringUtils.h>
#include <Common/hex.h>

View File

@ -9,7 +9,7 @@ int getMaxFileDescriptorCount()
{
namespace fs = std::filesystem;
int result = -1;
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
using namespace DB;
if (fs::exists("/proc/sys/fs/file-max"))

View File

@ -1,4 +1,4 @@
#if defined(__linux__)
#if defined(OS_LINUX)
#include "hasLinuxCapability.h"

View File

@ -1,5 +1,5 @@
#pragma once
#if defined(__linux__)
#if defined(OS_LINUX)
#include <linux/capability.h>

View File

@ -7,6 +7,9 @@
#include <base/getThreadId.h>
#include <base/types.h>
#if defined(__linux__)
#include <sys/utsname.h>
#endif
namespace DB
{
@ -29,6 +32,15 @@ DB::UInt64 randomSeed()
hash.update(times.tv_nsec);
hash.update(times.tv_sec);
hash.update(getThreadId());
hash.update(&times);
/// It makes sense to add something like the hostname to avoid seed collisions when multiple servers start simultaneously.
/// But randomSeed() must be signal-safe, and gethostname and similar functions are not.
/// Let's try to get utsname.nodename using the uname syscall (it's signal-safe).
#if defined(__linux__)
struct utsname sysinfo;
if (uname(&sysinfo) == 0)
hash.update(sysinfo);
#endif
return hash.get64();
}
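A standalone sketch of the seeding idea described in the comment above: mix the signal-safe uname() nodename into the seed so that servers started at the same instant diverge. The real code hashes the whole struct with SipHash; FNV-1a and the clock stand-in are used here only to keep the sketch self-contained.

#include <sys/utsname.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <ctime>
#include <iostream>

/// Simple FNV-1a over raw bytes; a stand-in for the SipHash used in the real code.
static uint64_t fnv1a(const void * data, size_t size, uint64_t hash = 1469598103934665603ULL)
{
    const auto * bytes = static_cast<const unsigned char *>(data);
    for (size_t i = 0; i < size; ++i)
    {
        hash ^= bytes[i];
        hash *= 1099511628211ULL;
    }
    return hash;
}

int main()
{
    /// Stand-in for the clock/pid/thread-id entropy gathered above.
    timespec times{};
    clock_gettime(CLOCK_MONOTONIC, &times);
    uint64_t seed = fnv1a(&times, sizeof(times));

#if defined(__linux__)
    /// uname() is signal-safe, unlike gethostname(), so it can be mixed in here.
    struct utsname sysinfo;
    if (uname(&sysinfo) == 0)
        seed = fnv1a(sysinfo.nodename, strlen(sysinfo.nodename), seed);
#endif

    std::cout << seed << '\n';
}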

View File

@ -1,6 +1,6 @@
#include "remapExecutable.h"
#if defined(__linux__) && defined(__amd64__) && defined(__SSE2__) && !defined(SANITIZER) && defined(NDEBUG) && !defined(SPLIT_SHARED_LIBRARIES)
#if defined(OS_LINUX) && defined(__amd64__) && defined(__SSE2__) && !defined(SANITIZER) && defined(NDEBUG) && !defined(SPLIT_SHARED_LIBRARIES)
#include <sys/mman.h>
#include <unistd.h>

View File

@ -1,7 +1,7 @@
#include <pthread.h>
#if defined(__APPLE__) || defined(OS_SUNOS)
#elif defined(__FreeBSD__)
#if defined(OS_DARWIN) || defined(OS_SUNOS)
#elif defined(OS_FREEBSD)
#include <pthread_np.h>
#else
#include <sys/prctl.h>
@ -55,10 +55,10 @@ const char * getThreadName()
if (thread_name[0])
return thread_name;
#if defined(__APPLE__) || defined(OS_SUNOS)
#if defined(OS_DARWIN) || defined(OS_SUNOS)
if (pthread_getname_np(pthread_self(), thread_name, THREAD_NAME_SIZE))
throw DB::Exception("Cannot get thread name with pthread_getname_np()", DB::ErrorCodes::PTHREAD_ERROR);
#elif defined(__FreeBSD__)
#elif defined(OS_FREEBSD)
// TODO: make test. freebsd will have this function soon https://freshbsd.org/commit/freebsd/r337983
// if (pthread_get_name_np(pthread_self(), thread_name, THREAD_NAME_SIZE))
// throw DB::Exception("Cannot get thread name with pthread_get_name_np()", DB::ErrorCodes::PTHREAD_ERROR);

View File

@ -98,9 +98,10 @@ TEST(LRUFileCache, get)
DB::ThreadStatus thread_status;
/// To work with the cache we need a query_id and a query context.
std::string query_id = "query_id";
auto query_context = DB::Context::createCopy(getContext().context);
query_context->makeQueryContext();
query_context->setCurrentQueryId("query_id");
query_context->setCurrentQueryId(query_id);
DB::CurrentThread::QueryScope query_scope_holder(query_context);
DB::FileCacheSettings settings;
@ -513,4 +514,5 @@ TEST(LRUFileCache, get)
assertRange(49, segments1[1], DB::FileSegment::Range(10, 19), DB::FileSegment::State::EMPTY);
assertRange(50, segments1[2], DB::FileSegment::Range(20, 24), DB::FileSegment::State::EMPTY);
}
}

View File

@ -405,7 +405,7 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
if (last_log_read_result->last_read_index == 0 || last_log_read_result->error) /// If it's broken log then remove it
{
LOG_INFO(log, "Removing log {} because it's empty or read finished with error", description.path);
LOG_INFO(log, "Removing chagelog {} because it's empty or read finished with error", description.path);
std::filesystem::remove(description.path);
existing_changelogs.erase(last_log_read_result->log_start_index);
std::erase_if(logs, [last_log_read_result] (const auto & item) { return item.first >= last_log_read_result->log_start_index; });

View File

@ -236,7 +236,7 @@ String MonitorCommand::run()
print(ret, "key_arena_size", state_machine.getKeyArenaSize());
print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotBufSize());
#if defined(__linux__) || defined(__APPLE__)
#if defined(OS_LINUX) || defined(OS_DARWIN)
print(ret, "open_file_descriptor_count", getCurrentProcessFDCount());
print(ret, "max_file_descriptor_count", getMaxFileDescriptorCount());
#endif

View File

@ -12,6 +12,7 @@
#include <Coordination/pathUtils.h>
#include <filesystem>
#include <memory>
#include <Common/logger_useful.h>
namespace DB
{
@ -20,6 +21,7 @@ namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int UNKNOWN_SNAPSHOT;
extern const int LOGICAL_ERROR;
}
namespace
@ -296,6 +298,25 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
}
}
for (const auto & itr : storage.container)
{
if (itr.key != "/")
{
if (itr.value.stat.numChildren != static_cast<int32_t>(itr.value.getChildren().size()))
{
#ifdef NDEBUG
/// TODO (alesapin) remove this, it should always be CORRUPTED_DATA.
LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key);
#endif
}
}
}
size_t active_sessions_size;
readBinary(active_sessions_size, in);

View File

@ -13,7 +13,7 @@
#include <iomanip>
#include <mutex>
#include <functional>
#include <Common/logger_useful.h>
#include <base/defines.h>
namespace DB
{
@ -349,7 +349,9 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
container.updateValue(parent_path, [child_path, zxid, &prev_parent_zxid,
parent_cversion, &prev_parent_cversion] (KeeperStorage::Node & parent)
{
++parent.stat.numChildren;
parent.addChild(child_path);
prev_parent_cversion = parent.stat.cversion;
prev_parent_zxid = parent.stat.pzxid;
@ -363,7 +365,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
if (zxid > parent.stat.pzxid)
parent.stat.pzxid = zxid;
++parent.stat.numChildren;
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.path_created = path_created;
@ -385,6 +387,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
undo_parent.stat.cversion = prev_parent_cversion;
undo_parent.stat.pzxid = prev_parent_zxid;
undo_parent.removeChild(child_path);
chassert(undo_parent.stat.numChildren == static_cast<int32_t>(undo_parent.getChildren().size()));
});
storage.container.erase(path_created);
@ -494,7 +497,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
{
response.error = Coordination::Error::ZBADVERSION;
}
else if (it->value.stat.numChildren)
else if (!it->value.getChildren().empty())
{
response.error = Coordination::Error::ZNOTEMPTY;
}
@ -519,6 +522,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
--parent.stat.numChildren;
++parent.stat.cversion;
parent.removeChild(child_basename);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
response.error = Coordination::Error::ZOK;
@ -540,6 +544,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr
++parent.stat.numChildren;
--parent.stat.cversion;
parent.addChild(child_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
};
}
@ -1110,6 +1115,7 @@ KeeperStorage::ResponsesForSessions KeeperStorage::processRequest(const Coordina
++parent.stat.cversion;
auto base_name = getBaseName(ephemeral_path);
parent.removeChild(base_name);
chassert(parent.stat.numChildren == static_cast<int32_t>(parent.getChildren().size()));
});
container.erase(ephemeral_path);

View File

@ -574,6 +574,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, enable_filesystem_cache_on_write_operations, false, "Write into cache on write operations. To actually work this setting requires be added to disk config too", 0) \
M(Bool, enable_filesystem_cache_log, false, "Allows to record the filesystem caching log for each query", 0) \
M(Bool, read_from_filesystem_cache_if_exists_otherwise_bypass_cache, false, "", 0) \
M(Bool, skip_download_if_exceeds_query_cache, true, "Skip download from remote filesystem if it exceeds the query cache size", 0) \
M(UInt64, max_query_cache_size, (128UL * 1024 * 1024 * 1024), "Max remote filesystem cache size that can be used by a single query", 0) \
\
M(Bool, use_structure_from_insertion_table_in_table_functions, false, "Use structure from insertion table instead of schema inference from data", 0) \
\

View File

@ -6,11 +6,24 @@
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <Core/callOnTypeIndex.h>
#include <Core/SortDescription.h>
#include <Core/Block.h>
#include <Core/ColumnNumbers.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeDateTime64.h>
#include <DataTypes/DataTypeEnum.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/IColumn.h>
#include <Columns/ColumnDecimal.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnFixedString.h>
#include "config_core.h"
@ -250,6 +263,36 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
}
};
template <typename ColumnType>
struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>
{
using SortCursorHelper<SpecializedSingleColumnSortCursor>::SortCursorHelper;
bool ALWAYS_INLINE greaterAt(const SortCursorHelper<SpecializedSingleColumnSortCursor> & rhs, size_t lhs_pos, size_t rhs_pos) const
{
auto & this_impl = this->impl;
auto & lhs_columns = this_impl->sort_columns;
auto & rhs_columns = rhs.impl->sort_columns;
assert(lhs_columns.size() == 1);
assert(rhs_columns.size() == 1);
const auto & lhs_column = assert_cast<const ColumnType &>(*lhs_columns[0]);
const auto & rhs_column = assert_cast<const ColumnType &>(*rhs_columns[0]);
const auto & desc = this->impl->desc[0];
int res = desc.direction * lhs_column.compareAt(lhs_pos, rhs_pos, rhs_column, desc.nulls_direction);
if (res > 0)
return true;
if (res < 0)
return false;
return this_impl->order > rhs.impl->order;
}
};
/// Separate comparator for locale-sensitive string comparisons
struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
@ -411,6 +454,124 @@ private:
}
};
/** SortQueueVariants allows specializing the sorting queue for concrete types and a sort description.
* To access the queue, the callOnVariant method must be used.
*/
class SortQueueVariants
{
public:
SortQueueVariants() = default;
SortQueueVariants(const DataTypes & sort_description_types, const SortDescription & sort_description)
{
bool has_collation = false;
for (const auto & column_description : sort_description)
{
if (column_description.collator)
{
has_collation = true;
break;
}
}
if (has_collation)
{
queue_variants = SortingHeap<SortCursorWithCollation>();
return;
}
else if (sort_description.size() == 1)
{
TypeIndex column_type_index = sort_description_types[0]->getTypeId();
bool result = callOnIndexAndDataType<void>(
column_type_index,
[&](const auto & types)
{
using Types = std::decay_t<decltype(types)>;
using ColumnDataType = typename Types::LeftType;
using ColumnType = typename ColumnDataType::ColumnType;
queue_variants = SortingHeap<SpecializedSingleColumnSortCursor<ColumnType>>();
return true;
});
if (!result)
queue_variants = SortingHeap<SimpleSortCursor>();
}
else
{
queue_variants = SortingHeap<SortCursor>();
}
}
SortQueueVariants(const Block & header, const SortDescription & sort_description)
: SortQueueVariants(extractSortDescriptionTypesFromHeader(header, sort_description), sort_description)
{
}
template <typename Func>
decltype(auto) callOnVariant(Func && func)
{
return std::visit(func, queue_variants);
}
bool variantSupportJITCompilation() const
{
return std::holds_alternative<SortingHeap<SimpleSortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursor>>(queue_variants)
|| std::holds_alternative<SortingHeap<SortCursorWithCollation>>(queue_variants);
}
private:
static DataTypes extractSortDescriptionTypesFromHeader(const Block & header, const SortDescription & sort_description)
{
size_t sort_description_size = sort_description.size();
DataTypes data_types(sort_description_size);
for (size_t i = 0; i < sort_description_size; ++i)
{
const auto & column_sort_description = sort_description[i];
data_types[i] = header.getByName(column_sort_description.column_name).type;
}
return data_types;
}
std::variant<
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UInt256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int8>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int16>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Int256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<Float64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal32>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal128>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<Decimal256>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnDecimal<DateTime64>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnVector<UUID>>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnString>>,
SortingHeap<SpecializedSingleColumnSortCursor<ColumnFixedString>>,
SortingHeap<SimpleSortCursor>,
SortingHeap<SortCursor>,
SortingHeap<SortCursorWithCollation>>
queue_variants;
};
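To illustrate the dispatch technique behind SortQueueVariants and callOnVariant, here is a minimal self-contained sketch (toy queue types, not the real SortingHeap): the concrete queue type is chosen once, stored in a std::variant, and every caller dispatches through std::visit.

#include <iostream>
#include <utility>
#include <variant>

struct IntQueue   { void push(int v)   { std::cout << "int "   << v << '\n'; } };
struct FloatQueue { void push(float v) { std::cout << "float " << v << '\n'; } };

using QueueVariant = std::variant<IntQueue, FloatQueue>;

template <typename Func>
decltype(auto) callOnVariant(QueueVariant & queue, Func && func)
{
    /// The visitor must compile for every alternative, exactly like the real callOnVariant.
    return std::visit(std::forward<Func>(func), queue);
}

int main()
{
    /// The concrete queue type is chosen once, e.g. from the sort description.
    QueueVariant queue = IntQueue{};
    callOnVariant(queue, [](auto & q) { q.push(42); });
}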
template <typename TLeftColumns, typename TRightColumns>
bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescriptionWithPositions & descr)
{

View File

@ -10,7 +10,7 @@
#include <sys/time.h>
#include <sys/wait.h>
#include <sys/resource.h>
#if defined(__linux__)
#if defined(OS_LINUX)
#include <sys/prctl.h>
#endif
#include <cerrno>
@ -858,7 +858,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
signal_listener = std::make_unique<SignalListener>(*this);
signal_listener_thread.start(*signal_listener);
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
String build_id_hex = DB::SymbolIndex::instance()->getBuildIDHex();
if (build_id_hex.empty())
build_id_info = "no build id";
@ -868,7 +868,7 @@ void BaseDaemon::initializeTerminationAndSignalProcessing()
build_id_info = "no build id";
#endif
#if defined(__linux__)
#if defined(OS_LINUX)
std::string executable_path = getExecutablePath();
if (!executable_path.empty())
@ -986,7 +986,7 @@ void BaseDaemon::setupWatchdog()
if (0 == pid)
{
logger().information("Forked a child process to watch");
#if defined(__linux__)
#if defined(OS_LINUX)
if (0 != prctl(PR_SET_PDEATHSIG, SIGKILL))
logger().warning("Cannot do prctl to ask termination with parent.");
#endif

View File

@ -149,7 +149,7 @@ void SentryWriter::onFault(int sig, const std::string & error_message, const Sta
sentry_set_tag("signal", strsignal(sig));
sentry_set_extra("signal_number", sentry_value_new_int32(sig));
#if defined(__ELF__) && !defined(__FreeBSD__)
#if defined(__ELF__) && !defined(OS_FREEBSD)
const String & build_id_hex = DB::SymbolIndex::instance()->getBuildIDHex();
sentry_set_tag("build_id", build_id_hex.c_str());
#endif

View File

@ -84,6 +84,19 @@ std::string ExternalQueryBuilder::composeLoadAllQuery() const
}
else
{
/** In case UPDATE_FIELD is specified in {condition} for a dictionary that must load all data,
* replace {condition} with true_condition for the initial dictionary load.
* For subsequent dictionary loads {condition} will be updated with UPDATE_FIELD.
*/
static constexpr auto true_condition = "(1 = 1)";
auto condition_position = query.find(CONDITION_PLACEHOLDER_TO_REPLACE_VALUE);
if (condition_position != std::string::npos)
{
auto query_copy = query;
query_copy.replace(condition_position, CONDITION_PLACEHOLDER_TO_REPLACE_VALUE.size(), true_condition);
return query_copy;
}
return query;
}
}
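A self-contained sketch of the substitution added above, assuming the placeholder constant is the literal "{condition}" (the real constant lives in ExternalQueryBuilder; the helper name here is illustrative): the placeholder is replaced with a trivially true condition for the initial full load.

#include <cassert>
#include <string>

/// Name assumed for illustration; the real constant lives in ExternalQueryBuilder.
static const std::string CONDITION_PLACEHOLDER = "{condition}";

std::string composeInitialLoadQuery(std::string query)
{
    static constexpr auto true_condition = "(1 = 1)";
    auto condition_position = query.find(CONDITION_PLACEHOLDER);
    if (condition_position != std::string::npos)
        query.replace(condition_position, CONDITION_PLACEHOLDER.size(), true_condition);
    return query;
}

int main()
{
    assert(composeInitialLoadQuery("SELECT id, value FROM t WHERE {condition}")
           == "SELECT id, value FROM t WHERE (1 = 1)");
}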

Some files were not shown because too many files have changed in this diff.