Merge branch 'ClickHouse:master' into group_sorted_array_function

2024-11-24 00:22:29 +00:00 · 2023-09-20 19:35:47 +03:00 · 2023-09-20 19:35:47 +03:00 · 5f92103fa4
commit 5f92103fa4
parent 01722c7099 2c91e52da1
142 changed files with 2305 additions and 561 deletions
--- a/.github/workflows/libfuzzer.yml
+++ b/.github/workflows/libfuzzer.yml
@ -0,0 +1,95 @@
+name: libFuzzer
+
+env:
+  # Force the stdout and stderr streams to be unbuffered
+  PYTHONUNBUFFERED: 1
+
+on: # yamllint disable-line rule:truthy
+  #  schedule:
+  #    - cron: '0 0 2 31 1' # never for now
+  workflow_call:
+jobs:
+  BuilderFuzzers:
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=fuzzers
+          EOF
+      - name: Download changed images
+        # even if artifact does not exist, e.g. on `do not test` label or failed Docker job
+        continue-on-error: true
+        uses: actions/download-artifact@v3
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Check out repository code
+        uses: ClickHouse/checkout@v1
+        with:
+          clear-repository: true
+          submodules: true
+          ref: ${{github.ref}}
+      - name: Build
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  libFuzzerTest:
+    needs: [BuilderFuzzers]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/libfuzzer
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=libFuzzer tests
+          REPO_COPY=${{runner.temp}}/libfuzzer/ClickHouse
+          KILL_TIMEOUT=10800
+          EOF
+      - name: Download changed images
+        # even if artifact does not exist, e.g. on `do not test` label or failed Docker job
+        continue-on-error: true
+        uses: actions/download-artifact@v3
+        with:
+          name: changed_images
+          path: ${{ env.TEMP_PATH }}
+      - name: Download json reports
+        uses: actions/download-artifact@v3
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Check out repository code
+        uses: ClickHouse/checkout@v1
+        with:
+          clear-repository: true
+      - name: libFuzzer test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 libfuzzer_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          docker ps --quiet | xargs --no-run-if-empty docker kill ||:
+          docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:
+          sudo rm -fr "$TEMP_PATH"
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -5187,9 +5187,16 @@ jobs:
          cd "$GITHUB_WORKSPACE/tests/ci"
          python3 finish_check.py
          python3 merge_pr.py --check-approved
-##############################################################################################
-########################### SQLLOGIC TEST ###################################################
-##############################################################################################
+#############################################################################################
+####################################### libFuzzer ###########################################
+#############################################################################################
+  libFuzzer:
+    if: contains(github.event.pull_request.labels.*.name, 'libFuzzer')
+    needs: [DockerHubPush, StyleCheck]
+    uses: ./.github/workflows/libfuzzer.yml
+  ##############################################################################################
+  ############################ SQLLOGIC TEST ###################################################
+  ##############################################################################################
  SQLLogicTestRelease:
    needs: [BuilderDebRelease]
    runs-on: [self-hosted, func-tester]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -19,6 +19,7 @@ include (cmake/tools.cmake)
 include (cmake/ccache.cmake)
 include (cmake/clang_tidy.cmake)
 include (cmake/git.cmake)
+include (cmake/utils.cmake)

 # Ignore export() since we don't use it,
 # but it gets broken with a global targets via link_libraries()
@ -562,22 +563,6 @@ add_subdirectory (programs)
 add_subdirectory (tests)
 add_subdirectory (utils)

-# Function get_all_targets collects all targets recursively
-function(get_all_targets var)
-    macro(get_all_targets_recursive targets dir)
-        get_property(subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES)
-        foreach(subdir ${subdirectories})
-            get_all_targets_recursive(${targets} ${subdir})
-        endforeach()
-        get_property(current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS)
-        list(APPEND ${targets} ${current_targets})
-    endmacro()
-
-    set(targets)
-    get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR})
-    set(${var} ${targets} PARENT_SCOPE)
-endfunction()
-
 if (FUZZER)
    # Bundle fuzzers target
    add_custom_target(fuzzers)
@ -592,14 +577,18 @@ if (FUZZER)
            # clickhouse fuzzer isn't working correctly
            # initial PR https://github.com/ClickHouse/ClickHouse/pull/27526
            #if (target MATCHES ".+_fuzzer" OR target STREQUAL "clickhouse")
-            if (target MATCHES ".+_fuzzer")
+            if (target_type STREQUAL "EXECUTABLE" AND target MATCHES ".+_fuzzer")
                message(STATUS "${target} instrumented with fuzzer")
                target_link_libraries(${target} PUBLIC ch_contrib::fuzzer)
                # Add to fuzzers bundle
                add_dependencies(fuzzers ${target})
+                get_target_filename(${target} target_bin_name)
+                get_target_property(target_bin_dir ${target} BINARY_DIR)
+                add_custom_command(TARGET fuzzers POST_BUILD COMMAND mv "${target_bin_dir}/${target_bin_name}" "${CMAKE_CURRENT_BINARY_DIR}/programs/" VERBATIM)
            endif()
        endif()
    endforeach()
+    add_custom_command(TARGET fuzzers POST_BUILD COMMAND SRC=${CMAKE_SOURCE_DIR} BIN=${CMAKE_BINARY_DIR} OUT=${CMAKE_BINARY_DIR}/programs ${CMAKE_SOURCE_DIR}/tests/fuzz/build.sh VERBATIM)
 endif()

 include (cmake/sanitize_targets.cmake)
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@ -0,0 +1,120 @@
+# Useful stuff
+
+# Function get_all_targets collects all targets recursively
+#
+# Arguments:
+#   outvar - name of the variable in the caller's scope that receives the list of targets
+#
+# The walk starts at CMAKE_CURRENT_SOURCE_DIR and follows the SUBDIRECTORIES directory
+# property; for every directory visited the BUILDSYSTEM_TARGETS property is appended.
+function(get_all_targets outvar)
+    # A macro does not open a new variable scope, so list(APPEND) below writes straight
+    # into the `targets` variable of this function.
+    macro(get_all_targets_recursive targets dir)
+        get_property(subdirectories DIRECTORY ${dir} PROPERTY SUBDIRECTORIES)
+        # Subdirectories are processed before the targets of `dir` itself are appended.
+        foreach(subdir ${subdirectories})
+            get_all_targets_recursive(${targets} ${subdir})
+        endforeach()
+        get_property(current_targets DIRECTORY ${dir} PROPERTY BUILDSYSTEM_TARGETS)
+        list(APPEND ${targets} ${current_targets})
+    endmacro()
+
+    set(targets)
+    get_all_targets_recursive(targets ${CMAKE_CURRENT_SOURCE_DIR})
+    # Hand the accumulated list back to the caller.
+    set(${outvar} ${targets} PARENT_SCOPE)
+endfunction()
+
+
+# Function get_target_filename calculates target's output file name
+#
+# Arguments:
+#   target - name of an existing target
+#   outvar - name of the variable in the caller's scope that receives the file name
+#            (file name only, no directory part)
+#
+# The base name is, in increasing priority: the target name, OUTPUT_NAME, and the
+# type-specific ARCHIVE_/LIBRARY_/RUNTIME_OUTPUT_NAME. When CMAKE_BUILD_TYPE is non-empty,
+# the per-configuration variant of each property (e.g. OUTPUT_NAME_DEBUG) replaces the
+# generic one. Only the global CMAKE_*_PREFIX / CMAKE_*_SUFFIX variables are applied;
+# per-target PREFIX/SUFFIX properties are not consulted.
+#
+# Framework targets yield the bare base name. Any other target that is not a
+# STATIC_LIBRARY, MODULE_LIBRARY, SHARED_LIBRARY or EXECUTABLE is a FATAL_ERROR.
+function(get_target_filename target outvar)
+    get_target_property(prop_type "${target}" TYPE)
+    get_target_property(prop_is_framework "${target}" FRAMEWORK)
+    get_target_property(prop_outname "${target}" OUTPUT_NAME)
+    get_target_property(prop_archive_outname "${target}" ARCHIVE_OUTPUT_NAME)
+    get_target_property(prop_library_outname "${target}" LIBRARY_OUTPUT_NAME)
+    get_target_property(prop_runtime_outname "${target}" RUNTIME_OUTPUT_NAME)
+
+    # get_target_property() stores "<var>-NOTFOUND" when a property is not set, hence the
+    # explicit string comparisons below (an output name could legitimately look "false").
+    if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
+        # Per-configuration properties are spelled <PROPERTY>_<CONFIG> with an upper-case
+        # configuration name, e.g. OUTPUT_NAME_RELWITHDEBINFO.
+        string(TOUPPER "${CMAKE_BUILD_TYPE}" config_upper)
+        get_target_property(prop_cfg_outname "${target}" "OUTPUT_NAME_${config_upper}")
+        get_target_property(prop_archive_cfg_outname "${target}" "ARCHIVE_OUTPUT_NAME_${config_upper}")
+        get_target_property(prop_library_cfg_outname "${target}" "LIBRARY_OUTPUT_NAME_${config_upper}")
+        get_target_property(prop_runtime_cfg_outname "${target}" "RUNTIME_OUTPUT_NAME_${config_upper}")
+        if(NOT ("${prop_cfg_outname}" STREQUAL "prop_cfg_outname-NOTFOUND"))
+            set(prop_outname "${prop_cfg_outname}")
+        endif()
+        if(NOT ("${prop_archive_cfg_outname}" STREQUAL "prop_archive_cfg_outname-NOTFOUND"))
+            set(prop_archive_outname "${prop_archive_cfg_outname}")
+        endif()
+        if(NOT ("${prop_library_cfg_outname}" STREQUAL "prop_library_cfg_outname-NOTFOUND"))
+            set(prop_library_outname "${prop_library_cfg_outname}")
+        endif()
+        if(NOT ("${prop_runtime_cfg_outname}" STREQUAL "prop_runtime_cfg_outname-NOTFOUND"))
+            set(prop_runtime_outname "${prop_runtime_cfg_outname}")
+        endif()
+    endif()
+
+    # Default base name is the target name; OUTPUT_NAME overrides it.
+    set(outname "${target}")
+    if(NOT ("${prop_outname}" STREQUAL "prop_outname-NOTFOUND"))
+        set(outname "${prop_outname}")
+    endif()
+
+    if("${prop_is_framework}")
+        set(filename "${outname}")
+    elseif(prop_type STREQUAL "STATIC_LIBRARY")
+        if(NOT ("${prop_archive_outname}" STREQUAL "prop_archive_outname-NOTFOUND"))
+            set(outname "${prop_archive_outname}")
+        endif()
+        set(filename "${CMAKE_STATIC_LIBRARY_PREFIX}${outname}${CMAKE_STATIC_LIBRARY_SUFFIX}")
+    elseif(prop_type STREQUAL "MODULE_LIBRARY")
+        if(NOT ("${prop_library_outname}" STREQUAL "prop_library_outname-NOTFOUND"))
+            set(outname "${prop_library_outname}")
+        endif()
+        set(filename "${CMAKE_SHARED_MODULE_PREFIX}${outname}${CMAKE_SHARED_MODULE_SUFFIX}")
+    elseif(prop_type STREQUAL "SHARED_LIBRARY")
+        # On Windows the name is taken from RUNTIME_OUTPUT_NAME, elsewhere from LIBRARY_OUTPUT_NAME.
+        if(WIN32)
+            if(NOT ("${prop_runtime_outname}" STREQUAL "prop_runtime_outname-NOTFOUND"))
+                set(outname "${prop_runtime_outname}")
+            endif()
+        else()
+            if(NOT ("${prop_library_outname}" STREQUAL "prop_library_outname-NOTFOUND"))
+                set(outname "${prop_library_outname}")
+            endif()
+        endif()
+        set(filename "${CMAKE_SHARED_LIBRARY_PREFIX}${outname}${CMAKE_SHARED_LIBRARY_SUFFIX}")
+    elseif(prop_type STREQUAL "EXECUTABLE")
+        if(NOT ("${prop_runtime_outname}" STREQUAL "prop_runtime_outname-NOTFOUND"))
+            set(outname "${prop_runtime_outname}")
+        endif()
+        set(filename "${CMAKE_EXECUTABLE_PREFIX}${outname}${CMAKE_EXECUTABLE_SUFFIX}")
+    else()
+        message(FATAL_ERROR "target \"${target}\" is not of type STATIC_LIBRARY, MODULE_LIBRARY, SHARED_LIBRARY, or EXECUTABLE.")
+    endif()
+    set("${outvar}" "${filename}" PARENT_SCOPE)
+endfunction()
+
+
+# Function get_cmake_properties returns list of all properties that cmake supports
+#
+# Arguments:
+#   outvar - name of the variable in the caller's scope that receives the list
+#
+# The list is built from the output of `cmake --help-property-list`, one element per
+# output line, with duplicates removed.
+function(get_cmake_properties outvar)
+    # Ask the CMake executable that is running this script rather than whichever `cmake`
+    # happens to be first on PATH; strip the trailing newline so it does not become an
+    # empty list element.
+    execute_process(COMMAND "${CMAKE_COMMAND}" --help-property-list
+                    OUTPUT_VARIABLE cmake_properties
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # Convert command output into a CMake list
+    # (escape literal semicolons first so they do not split elements)
+    string(REGEX REPLACE ";" "\\\\;" cmake_properties "${cmake_properties}")
+    string(REGEX REPLACE "\n" ";" cmake_properties "${cmake_properties}")
+    list(REMOVE_DUPLICATES cmake_properties)
+    set("${outvar}" "${cmake_properties}" PARENT_SCOPE)
+endfunction()
+
+# Function get_target_property_list returns list of all properties set for target
+#
+# Arguments:
+#   target - name of an existing target
+#   outvar - name of the variable in the caller's scope that receives the list;
+#            each element has the form "<property> = <value>"
+#
+# If no property is set on the target, the caller's variable ends up unset (empty).
+function(get_target_property_list target outvar)
+    get_cmake_properties(cmake_property_list)
+
+    # Per-configuration properties use the upper-case configuration name.
+    string(TOUPPER "${CMAKE_BUILD_TYPE}" config_upper)
+
+    # Accumulate into a local list; `outvar` only holds the *name* of the result variable.
+    set(result)
+    foreach(property ${cmake_property_list})
+        string(REPLACE "<CONFIG>" "${config_upper}" property "${property}")
+
+        # https://stackoverflow.com/questions/32197663/how-can-i-remove-the-the-location-property-may-not-be-read-from-target-error-i
+        if(property STREQUAL "LOCATION" OR property MATCHES "^LOCATION_" OR property MATCHES "_LOCATION$")
+            continue()
+        endif()
+
+        get_property(was_set TARGET ${target} PROPERTY ${property} SET)
+        if(was_set)
+            get_target_property(value ${target} ${property})
+            # Escape semicolons so a list-valued property stays inside a single element.
+            string(REGEX REPLACE ";" "\\\\\\\\;" value "${value}")
+            list(APPEND result "${property} = ${value}")
+        endif()
+    endforeach()
+    set(${outvar} ${result} PARENT_SCOPE)
+endfunction()
--- a/docker/images.json
+++ b/docker/images.json
@ -21,6 +21,10 @@
        "name": "clickhouse/fuzzer",
        "dependent": []
    },
+    "docker/test/libfuzzer": {
+        "name": "clickhouse/libfuzzer",
+        "dependent": []
+    },
    "docker/test/performance-comparison": {
        "name": "clickhouse/performance-comparison",
        "dependent": []
@ -121,6 +125,7 @@
         "name": "clickhouse/test-base",
         "dependent": [
            "docker/test/fuzzer",
+            "docker/test/libfuzzer",
            "docker/test/integration/base",
            "docker/test/keeper-jepsen",
            "docker/test/server-jepsen",
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@ -78,6 +78,7 @@ RUN add-apt-repository ppa:ubuntu-toolchain-r/test --yes \
        python3-boto3 \
        yasm \
        zstd \
+        zip \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists

--- a/docker/packager/binary/build.sh
+++ b/docker/packager/binary/build.sh
@ -97,11 +97,10 @@ if [ -n "$MAKE_DEB" ]; then
  bash -x /build/packages/build
 fi

-if [ "$BUILD_TARGET" != "fuzzers" ]; then
-  mv ./programs/clickhouse* /output
-  [ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output
-  mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds
-fi
+mv ./programs/clickhouse* /output || mv ./programs/*_fuzzer /output
+[ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output
+mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds
+mv ./programs/*.dict ./programs/*.options ./programs/*_seed_corpus.zip /output ||: # libFuzzer oss-fuzz compatible infrastructure

 prepare_combined_output () {
    local OUTPUT
--- a/docker/test/libfuzzer/Dockerfile
+++ b/docker/test/libfuzzer/Dockerfile
@ -0,0 +1,43 @@
+ARG FROM_TAG=latest
+FROM clickhouse/test-base:$FROM_TAG
+
+# ARG for quick switch to a given ubuntu mirror
+ARG apt_archive="http://archive.ubuntu.com"
+RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list
+
+ENV LANG=C.UTF-8
+ENV TZ=Europe/Amsterdam
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
+            ca-certificates \
+            libc6-dbg \
+            moreutils \
+            ncdu \
+            p7zip-full \
+            parallel \
+            psmisc \
+            python3 \
+            python3-pip \
+            rsync \
+            tree \
+            tzdata \
+            vim \
+            wget \
+    && apt-get autoremove --yes \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install Jinja2
+
+COPY * /
+
+ENV FUZZER_ARGS="-max_total_time=60"
+
+SHELL ["/bin/bash", "-c"]
+CMD set -o pipefail \
+    && timeout -s 9 1h /run_libfuzzer.py 2>&1 | ts "$(printf '%%Y-%%m-%%d %%H:%%M:%%S\t')" | tee main.log
+
+# docker run --network=host --volume <workspace>:/workspace -e PR_TO_TEST=<> -e SHA_TO_TEST=<> clickhouse/libfuzzer
+
--- a/docker/test/libfuzzer/run_libfuzzer.py
+++ b/docker/test/libfuzzer/run_libfuzzer.py
@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+
+import configparser
+import logging
+import os
+from pathlib import Path
+import subprocess
+
+DEBUGGER = os.getenv("DEBUGGER", "")
+FUZZER_ARGS = os.getenv("FUZZER_ARGS", "")
+
+
+def run_fuzzer(fuzzer: str):
+    logging.info(f"Running fuzzer {fuzzer}...")
+
+    corpus_dir = f"{fuzzer}.in"
+    with Path(corpus_dir) as path:
+        if not path.exists() or not path.is_dir():
+            corpus_dir = ""
+
+    options_file = f"{fuzzer}.options"
+    custom_libfuzzer_options = ""
+
+    with Path(options_file) as path:
+        if path.exists() and path.is_file():
+            parser = configparser.ConfigParser()
+            parser.read(path)
+
+            if parser.has_section("asan"):
+                os.environ[
+                    "ASAN_OPTIONS"
+                ] = f"{os.environ['ASAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['asan'].items())}"
+
+            if parser.has_section("msan"):
+                os.environ[
+                    "MSAN_OPTIONS"
+                ] = f"{os.environ['MSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['msan'].items())}"
+
+            if parser.has_section("ubsan"):
+                os.environ[
+                    "UBSAN_OPTIONS"
+                ] = f"{os.environ['UBSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['ubsan'].items())}"
+
+            if parser.has_section("libfuzzer"):
+                custom_libfuzzer_options = " ".join(
+                    "-%s=%s" % (key, value)
+                    for key, value in parser["libfuzzer"].items()
+                )
+
+    cmd_line = f"{DEBUGGER} ./{fuzzer} {FUZZER_ARGS} {corpus_dir}"
+    if custom_libfuzzer_options:
+        cmd_line += f" {custom_libfuzzer_options}"
+
+    if not "-dict=" in cmd_line and Path(f"{fuzzer}.dict").exists():
+        cmd_line += f" -dict={fuzzer}.dict"
+
+    cmd_line += " < /dev/null"
+
+    logging.info(f"...will execute: {cmd_line}")
+    subprocess.check_call(cmd_line, shell=True)
+
+
+def main():
+    logging.basicConfig(level=logging.INFO)
+
+    subprocess.check_call("ls -al", shell=True)
+
+    with Path() as current:
+        for fuzzer in current.iterdir():
+            if (current / fuzzer).is_file() and os.access(current / fuzzer, os.X_OK):
+                run_fuzzer(fuzzer)
+
+    exit(0)
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
+++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md
@ -8,7 +8,7 @@ sidebar_label: EmbeddedRocksDB

 This engine allows integrating ClickHouse with [rocksdb](http://rocksdb.org/).

-## Creating a Table {#table_engine-EmbeddedRocksDB-creating-a-table}
+## Creating a Table {#creating-a-table}

 ``` sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -85,7 +85,7 @@ You can also change any [rocksdb options](https://github.com/facebook/rocksdb/wi
 </rocksdb>
 ```

-## Supported operations {#table_engine-EmbeddedRocksDB-supported-operations}
+## Supported operations {#supported-operations}

 ### Inserts

--- a/docs/en/engines/table-engines/integrations/kafka.md
+++ b/docs/en/engines/table-engines/integrations/kafka.md
@ -14,7 +14,7 @@ Kafka lets you:
 - Organize fault-tolerant storage.
 - Process streams as they become available.

-## Creating a Table {#table_engine-kafka-creating-a-table}
+## Creating a Table {#creating-a-table}

 ``` sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
--- a/docs/en/engines/table-engines/integrations/nats.md
+++ b/docs/en/engines/table-engines/integrations/nats.md
@ -13,7 +13,7 @@ This engine allows integrating ClickHouse with [NATS](https://nats.io/).
 - Publish or subscribe to message subjects.
 - Process new messages as they become available.

-## Creating a Table {#table_engine-redisstreams-creating-a-table}
+## Creating a Table {#creating-a-table}

 ``` sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
--- a/docs/en/engines/table-engines/integrations/rabbitmq.md
+++ b/docs/en/engines/table-engines/integrations/rabbitmq.md
@ -13,7 +13,7 @@ This engine allows integrating ClickHouse with [RabbitMQ](https://www.rabbitmq.c
 - Publish or subscribe to data flows.
 - Process streams as they become available.

-## Creating a Table {#table_engine-rabbitmq-creating-a-table}
+## Creating a Table {#creating-a-table}

 ``` sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
--- a/docs/en/engines/table-engines/integrations/s3queue.md
+++ b/docs/en/engines/table-engines/integrations/s3queue.md
@ -63,7 +63,7 @@ SETTINGS
    mode = 'ordered';
 ```

-## Settings {#s3queue-settings}
+## Settings {#settings}

 ### mode {#mode}

@ -93,7 +93,7 @@ Possible values:

 Default value: `/`.

-### s3queue_loading_retries {#s3queue_loading_retries}
+### s3queue_loading_retries {#loading_retries}

 Retry file loading up to specified number of times. By default, there are no retries.
 Possible values:
@ -102,7 +102,7 @@ Possible values:

 Default value: `0`.

-### s3queue_polling_min_timeout_ms {#s3queue_polling_min_timeout_ms}
+### s3queue_polling_min_timeout_ms {#polling_min_timeout_ms}

 Minimal timeout before next polling (in milliseconds).

@ -112,7 +112,7 @@ Possible values:

 Default value: `1000`.

-### s3queue_polling_max_timeout_ms {#s3queue_polling_max_timeout_ms}
+### s3queue_polling_max_timeout_ms {#polling_max_timeout_ms}

 Maximum timeout before next polling (in milliseconds).

@ -122,7 +122,7 @@ Possible values:

 Default value: `10000`.

-### s3queue_polling_backoff_ms {#s3queue_polling_backoff_ms}
+### s3queue_polling_backoff_ms {#polling_backoff_ms}

 Polling backoff (in milliseconds).

@ -132,7 +132,7 @@ Possible values:

 Default value: `0`.

-### s3queue_tracked_files_limit {#s3queue_tracked_files_limit}
+### s3queue_tracked_files_limit {#tracked_files_limit}

 Allows to limit the number of Zookeeper nodes if the 'unordered' mode is used, does nothing for 'ordered' mode.
 If limit reached the oldest processed files will be deleted from ZooKeeper node and processed again.
@ -143,7 +143,7 @@ Possible values:

 Default value: `1000`.

-### s3queue_tracked_file_ttl_sec {#s3queue_tracked_file_ttl_sec}
+### s3queue_tracked_file_ttl_sec {#tracked_file_ttl_sec}

 Maximum number of seconds to store processed files in ZooKeeper node (store forever by default) for 'unordered' mode, does nothing for 'ordered' mode.
 After the specified number of seconds, the file will be re-imported.
@ -154,7 +154,7 @@ Possible values:

 Default value: `0`.

-### s3queue_polling_size {#s3queue_polling_size}
+### s3queue_polling_size {#polling_size}

 Maximum files to fetch from S3 with SELECT or in background task.
 Engine takes files for processing from S3 in batches.
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@ -203,9 +203,10 @@ Parameter `NumTrees` is the number of trees which the algorithm creates (default
 more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.

 :::note
-Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays **must** have same length. Use
-[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints) to avoid errors. For example, `CONSTRAINT constraint_name_1
-CHECK length(vectors) = 256`.
+Indexes over columns of type `Array` will generally work faster than indexes on `Tuple` columns. All arrays must have the same length. To avoid
+errors, you can use a [CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT
+constraint_name_1 CHECK length(vectors) = 256`. Also, unspecified `Array` values in INSERT statements (i.e. default values) are not
+supported.
 :::

 Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
@ -223,6 +224,7 @@ SETTINGS annoy_index_search_k_nodes=100;
 The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
 [here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
 :::
+
 ## USearch {#usearch}

 This type of ANN index is based on the [the USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -1354,3 +1354,4 @@ In this sample configuration:
 - `_part_uuid` — Unique part identifier (if enabled MergeTree setting `assign_part_uuids`).
 - `_partition_value` — Values (a tuple) of a `partition by` expression.
 - `_sample_factor` — Sample factor (from the query).
+- `_block_number` — Block number of the row. It is persisted on merges when `allow_experimental_block_number_column` is set to true.
--- a/docs/en/engines/table-engines/special/keepermap.md
+++ b/docs/en/engines/table-engines/special/keepermap.md
@ -20,7 +20,7 @@ For example:

 where path can be any other valid ZooKeeper path.

-## Creating a Table {#table_engine-KeeperMap-creating-a-table}
+## Creating a Table {#creating-a-table}

 ``` sql
 CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
@ -74,7 +74,7 @@ If multiple tables are created on the same ZooKeeper path, the values are persis
 As a result, it is possible to use `ON CLUSTER` clause when creating the table and sharing the data from multiple ClickHouse instances.  
 Of course, it's possible to manually run `CREATE TABLE` with same path on unrelated ClickHouse instances to have same data sharing effect.

-## Supported operations {#table_engine-KeeperMap-supported-operations}
+## Supported operations {#supported-operations}

 ### Inserts

--- a/docs/en/operations/query-cache.md
+++ b/docs/en/operations/query-cache.md
@ -43,6 +43,12 @@ SETTINGS use_query_cache = true;
 will store the query result in the query cache. Subsequent executions of the same query (also with parameter `use_query_cache = true`) will
 read the computed result from the cache and return it immediately.

+:::note
+Setting `use_query_cache` and all other query-cache-related settings take effect only on stand-alone `SELECT` statements. In particular,
+the results of `SELECT`s to views created by `CREATE VIEW AS SELECT [...] SETTINGS use_query_cache = true` are not cached unless the `SELECT`
+statement runs with `SETTINGS use_query_cache = true`.
+:::
+
 The way the cache is utilized can be configured in more detail using settings [enable_writes_to_query_cache](settings/settings.md#enable-writes-to-query-cache)
 and [enable_reads_from_query_cache](settings/settings.md#enable-reads-from-query-cache) (both `true` by default). The former setting
 controls whether query results are stored in the cache, whereas the latter setting determines if the database should try to retrieve query
@ -84,7 +90,7 @@ It is also possible to limit the cache usage of individual users using [settings
 constraints](settings/constraints-on-settings.md). More specifically, you can restrict the maximum amount of memory (in bytes) a user may
 allocate in the query cache and the the maximum number of stored query results. For that, first provide configurations
 [query_cache_max_size_in_bytes](settings/settings.md#query-cache-max-size-in-bytes) and
-[query_cache_max_entries](settings/settings.md#query-cache-size-max-items) in a user profile in `users.xml`, then make both settings
+[query_cache_max_entries](settings/settings.md#query-cache-size-max-entries) in a user profile in `users.xml`, then make both settings
 readonly:

 ``` xml
@ -134,10 +140,26 @@ block granularity when query results are later served from the query cache.

 As a result, the query cache stores for each query multiple (partial)
 result blocks. While this behavior is a good default, it can be suppressed using setting
-[query_cache_squash_partial_query_results](settings/settings.md#query-cache-squash-partial-query-results).
+[query_cache_squash_partial_results](settings/settings.md#query-cache-squash-partial-results).

-Also, results of queries with non-deterministic functions such as `rand()` and `now()` are not cached. This can be overruled using
-setting [query_cache_store_results_of_queries_with_nondeterministic_functions](settings/settings.md#query-cache-store-results-of-queries-with-nondeterministic-functions).
+Also, results of queries with non-deterministic functions are not cached by default. Such functions include
+- functions for accessing dictionaries: [`dictGet()`](../sql-reference/functions/ext-dict-functions.md#dictGet) etc.
+- [user-defined functions](../sql-reference/statements/create/function.md),
+- functions which return the current date or time: [`now()`](../sql-reference/functions/date-time-functions.md#now),
+  [`today()`](../sql-reference/functions/date-time-functions.md#today),
+  [`yesterday()`](../sql-reference/functions/date-time-functions.md#yesterday) etc.,
+- functions which return random values: [`randomString()`](../sql-reference/functions/random-functions.md#randomString),
+  [`fuzzBits()`](../sql-reference/functions/random-functions.md#fuzzBits) etc.,
+- functions whose result depends on the size and order of the internal chunks used for query processing:
+  [`nowInBlock()`](../sql-reference/functions/date-time-functions.md#nowInBlock),
+  [`rowNumberInBlock()`](../sql-reference/functions/other-functions.md#rowNumberInBlock),
+  [`runningDifference()`](../sql-reference/functions/other-functions.md#runningDifference),
+  [`blockSize()`](../sql-reference/functions/other-functions.md#blockSize) etc.,
+- functions which depend on the environment: [`currentUser()`](../sql-reference/functions/other-functions.md#currentUser),
+  [`queryID()`](../sql-reference/functions/other-functions.md#queryID),
+  [`getMacro()`](../sql-reference/functions/other-functions.md#getMacro) etc.
+To force caching of results of queries with non-deterministic functions regardless, use setting
+[query_cache_store_results_of_queries_with_nondeterministic_functions](settings/settings.md#query-cache-store-results-of-queries-with-nondeterministic-functions).

 Finally, entries in the query cache are not shared between users due to security reasons. For example, user A must not be able to bypass a
 row policy on a table by running the same query as another user B for whom no such policy exists. However, if necessary, cache entries can
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -835,7 +835,7 @@ List of prefixes for [custom settings](../../operations/settings/index.md#custom

 - [Custom settings](../../operations/settings/index.md#custom_settings)

-## core_dump {#server_configuration_parameters-core_dump}
+## core_dump {#core_dump}

 Configures soft limit for core dump file size.

@ -924,7 +924,7 @@ The path to the table in ZooKeeper.
 <default_replica_name>{replica}</default_replica_name>
 ```

-## dictionaries_config {#server_configuration_parameters-dictionaries_config}
+## dictionaries_config {#dictionaries_config}

 The path to the config file for dictionaries.

@ -941,7 +941,7 @@ See also “[Dictionaries](../../sql-reference/dictionaries/index.md)”.
 <dictionaries_config>*_dictionary.xml</dictionaries_config>
 ```

-## user_defined_executable_functions_config {#server_configuration_parameters-user_defined_executable_functions_config}
+## user_defined_executable_functions_config {#user_defined_executable_functions_config}

 The path to the config file for executable user defined functions.

@ -958,7 +958,7 @@ See also “[Executable User Defined Functions](../../sql-reference/functions/in
 <user_defined_executable_functions_config>*_function.xml</user_defined_executable_functions_config>
 ```

-## dictionaries_lazy_load {#server_configuration_parameters-dictionaries_lazy_load}
+## dictionaries_lazy_load {#dictionaries_lazy_load}

 Lazy loading of dictionaries.

@ -974,7 +974,7 @@ The default is `true`.
 <dictionaries_lazy_load>true</dictionaries_lazy_load>
 ```

-## format_schema_path {#server_configuration_parameters-format_schema_path}
+## format_schema_path {#format_schema_path}

 The path to the directory with the schemes for the input data, such as schemas for the [CapnProto](../../interfaces/formats.md#capnproto) format.

@ -985,7 +985,7 @@ The path to the directory with the schemes for the input data, such as schemas f
  <format_schema_path>format_schemas/</format_schema_path>
 ```

-## graphite {#server_configuration_parameters-graphite}
+## graphite {#graphite}

 Sending data to [Graphite](https://github.com/graphite-project).

@ -1019,7 +1019,7 @@ You can configure multiple `<graphite>` clauses. For instance, you can use this
 </graphite>
 ```

-## graphite_rollup {#server_configuration_parameters-graphite-rollup}
+## graphite_rollup {#graphite-rollup}

 Settings for thinning data for Graphite.

@ -1051,7 +1051,7 @@ For more details, see [GraphiteMergeTree](../../engines/table-engines/mergetree-

 The port for connecting to the server over HTTP(s).

-If `https_port` is specified, [openSSL](#server_configuration_parameters-openssl) must be configured.
+If `https_port` is specified, [openSSL](#openssl) must be configured.

 If `http_port` is specified, the OpenSSL configuration is ignored even if it is set.

@ -1061,7 +1061,7 @@ If `http_port` is specified, the OpenSSL configuration is ignored even if it is
 <https_port>9999</https_port>
 ```

-## http_server_default_response {#server_configuration_parameters-http_server_default_response}
+## http_server_default_response {#http_server_default_response}

 The page that is shown by default when you access the ClickHouse HTTP(s) server.
 The default value is “Ok.” (with a line feed at the end)
@ -1086,7 +1086,7 @@ Expired time for HSTS in seconds. The default value is 0 means clickhouse disabl
 <hsts_max_age>600000</hsts_max_age>
 ```

-## include_from {#server_configuration_parameters-include_from}
+## include_from {#include_from}

 The path to the file with substitutions.

@ -1222,7 +1222,7 @@ The number of seconds that ClickHouse waits for incoming requests before closing
 <keep_alive_timeout>10</keep_alive_timeout>
 ```

-## listen_host {#server_configuration_parameters-listen_host}
+## listen_host {#listen_host}

 Restriction on hosts that requests can come from. If you want the server to answer all of them, specify `::`.

@ -1233,7 +1233,7 @@ Examples:
 <listen_host>127.0.0.1</listen_host>
 ```

-## listen_backlog {#server_configuration_parameters-listen_backlog}
+## listen_backlog {#listen_backlog}

 Backlog (queue size of pending connections) of the listen socket.

@ -1253,7 +1253,7 @@ Examples:
 <listen_backlog>4096</listen_backlog>
 ```

-## logger {#server_configuration_parameters-logger}
+## logger {#logger}

 Logging settings.

@ -1357,7 +1357,7 @@ Keys for syslog:
    Default value: `LOG_USER` if `address` is specified, `LOG_DAEMON` otherwise.
 - format – Message format. Possible values: `bsd` and `syslog.`

-## send_crash_reports {#server_configuration_parameters-send_crash_reports}
+## send_crash_reports {#send_crash_reports}

 Settings for opt-in sending crash reports to the ClickHouse core developers team via [Sentry](https://sentry.io).
 Enabling it, especially in pre-production environments, is highly appreciated.
@ -1629,7 +1629,7 @@ Default value: `0.5`.



-## merge_tree {#server_configuration_parameters-merge_tree}
+## merge_tree {#merge_tree}

 Fine tuning for tables in the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

@ -1676,7 +1676,7 @@ To disable `metric_log` setting, you should create the following file `/etc/clic
 </clickhouse>
 ```

-## replicated_merge_tree {#server_configuration_parameters-replicated_merge_tree}
+## replicated_merge_tree {#replicated_merge_tree}

 Fine tuning for tables in the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

@ -1692,7 +1692,7 @@ For more information, see the MergeTreeSettings.h header file.
 </replicated_merge_tree>
 ```

-## openSSL {#server_configuration_parameters-openssl}
+## openSSL {#openssl}

 SSL client/server configuration.

@ -1751,7 +1751,7 @@ Keys for server/client settings:
 </openSSL>
 ```

-## part_log {#server_configuration_parameters-part-log}
+## part_log {#part-log}

 Logging events that are associated with [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). For instance, adding or merging data. You can use the log to simulate merge algorithms and compare their characteristics. You can visualize the merge process.

@ -1791,7 +1791,7 @@ Default: false.
 </part_log>
 ```

-## path {#server_configuration_parameters-path}
+## path {#path}

 The path to the directory containing data.

@ -1805,7 +1805,7 @@ The trailing slash is mandatory.
 <path>/var/lib/clickhouse/</path>
 ```

-## Prometheus {#server_configuration_parameters-prometheus}
+## Prometheus {#prometheus}

 Exposing metrics data for scraping from [Prometheus](https://prometheus.io).

@ -1841,7 +1841,7 @@ Check (replace `127.0.0.1` with the IP addr or hostname of your ClickHouse serve
 curl 127.0.0.1:9363/metrics
 ```

-## query_log {#server_configuration_parameters-query-log}
+## query_log {#query-log}

 Setting for logging queries received with the [log_queries=1](../../operations/settings/settings.md) setting.

@ -1911,7 +1911,7 @@ Data for the query cache is allocated in DRAM. If memory is scarce, make sure to
 </query_cache>
 ```

-## query_thread_log {#server_configuration_parameters-query_thread_log}
+## query_thread_log {#query_thread_log}

 Setting for logging threads of queries received with the [log_query_threads=1](../../operations/settings/settings.md#settings-log-query-threads) setting.

@ -1953,7 +1953,7 @@ If the table does not exist, ClickHouse will create it. If the structure of the
 </query_thread_log>
 ```

-## query_views_log {#server_configuration_parameters-query_views_log}
+## query_views_log {#query_views_log}

 Setting for logging views (live, materialized etc) dependant of queries received with the [log_query_views=1](../../operations/settings/settings.md#settings-log-query-views) setting.

@ -1995,7 +1995,7 @@ If the table does not exist, ClickHouse will create it. If the structure of the
 </query_views_log>
 ```

-## text_log {#server_configuration_parameters-text_log}
+## text_log {#text_log}

 Settings for the [text_log](../../operations/system-tables/text_log.md#system_tables-text_log) system table for logging text messages.

@ -2037,7 +2037,7 @@ Default: false.
 </clickhouse>
 ```

-## trace_log {#server_configuration_parameters-trace_log}
+## trace_log {#trace_log}

 Settings for the [trace_log](../../operations/system-tables/trace_log.md#system_tables-trace_log) system table operation.

@ -2073,7 +2073,7 @@ The default server configuration file `config.xml` contains the following settin
 </trace_log>
 ```

-## asynchronous_insert_log {#server_configuration_parameters-asynchronous_insert_log}
+## asynchronous_insert_log {#asynchronous_insert_log}

 Settings for the [asynchronous_insert_log](../../operations/system-tables/asynchronous_insert_log.md#system_tables-asynchronous_insert_log) system table for logging async inserts.

@ -2112,7 +2112,7 @@ Default: false.
 </clickhouse>
 ```

-## crash_log {#server_configuration_parameters-crash_log}
+## crash_log {#crash_log}

 Settings for the [crash_log](../../operations/system-tables/crash-log.md) system table operation.

@ -2150,7 +2150,7 @@ The default server configuration file `config.xml` contains the following settin
 </crash_log>
 ```

-## backup_log {#server_configuration_parameters-backup_log}
+## backup_log {#backup_log}

 Settings for the [backup_log](../../operations/system-tables/backup_log.md) system table for logging `BACKUP` and `RESTORE` operations.

@ -2239,7 +2239,7 @@ For the value of the `incl` attribute, see the section “[Configuration files](
 - [Cluster Discovery](../../operations/cluster-discovery.md)
 - [Replicated database engine](../../engines/database-engines/replicated.md)

-## timezone {#server_configuration_parameters-timezone}
+## timezone {#timezone}

 The server’s time zone.

@ -2257,7 +2257,7 @@ The time zone is necessary for conversions between String and DateTime formats w

 - [session_timezone](../settings/settings.md#session_timezone)

-## tcp_port {#server_configuration_parameters-tcp_port}
+## tcp_port {#tcp_port}

 Port for communicating with clients over the TCP protocol.

@ -2267,9 +2267,9 @@ Port for communicating with clients over the TCP protocol.
 <tcp_port>9000</tcp_port>
 ```

-## tcp_port_secure {#server_configuration_parameters-tcp_port_secure}
+## tcp_port_secure {#tcp_port_secure}

-TCP port for secure communication with clients. Use it with [OpenSSL](#server_configuration_parameters-openssl) settings.
+TCP port for secure communication with clients. Use it with [OpenSSL](#openssl) settings.

 **Possible values**

@ -2281,7 +2281,7 @@ Positive integer.
 <tcp_port_secure>9440</tcp_port_secure>
 ```

-## mysql_port {#server_configuration_parameters-mysql_port}
+## mysql_port {#mysql_port}

 Port for communicating with clients over MySQL protocol.

@ -2295,7 +2295,7 @@ Example
 <mysql_port>9004</mysql_port>
 ```

-## postgresql_port {#server_configuration_parameters-postgresql_port}
+## postgresql_port {#postgresql_port}

 Port for communicating with clients over PostgreSQL protocol.

@ -2326,7 +2326,7 @@ Path on the local filesystem to store temporary data for processing large querie
 ```


-## user_files_path {#server_configuration_parameters-user_files_path}
+## user_files_path {#user_files_path}

 The directory with user files. Used in the table function [file()](../../sql-reference/table-functions/file.md).

@ -2336,7 +2336,7 @@ The directory with user files. Used in the table function [file()](../../sql-ref
 <user_files_path>/var/lib/clickhouse/user_files/</user_files_path>
 ```

-## user_scripts_path {#server_configuration_parameters-user_scripts_path}
+## user_scripts_path {#user_scripts_path}

 The directory with user scripts files. Used for Executable user defined functions [Executable User Defined Functions](../../sql-reference/functions/index.md#executable-user-defined-functions).

@ -2346,7 +2346,7 @@ The directory with user scripts files. Used for Executable user defined function
 <user_scripts_path>/var/lib/clickhouse/user_scripts/</user_scripts_path>
 ```

-## user_defined_path {#server_configuration_parameters-user_defined_path}
+## user_defined_path {#user_defined_path}

 The directory with user defined files. Used for SQL user defined functions [SQL User Defined Functions](../../sql-reference/functions/index.md#user-defined-functions).

@ -2442,7 +2442,7 @@ Storage method for data part headers in ZooKeeper.

 This setting only applies to the `MergeTree` family. It can be specified:

- Globally in the [merge_tree](#server_configuration_parameters-merge_tree) section of the `config.xml` file.
+- Globally in the [merge_tree](#merge_tree) section of the `config.xml` file.

    ClickHouse uses the setting for all the tables on the server. You can change the setting at any time. Existing tables change their behaviour when the setting changes.

--- a/docs/en/operations/settings/merge-tree-settings.md
+++ b/docs/en/operations/settings/merge-tree-settings.md
@ -854,3 +854,9 @@ Possible values:
 - `Always` or `Never`.

 Default value: `Never`
+
+## allow_experimental_block_number_column
+
+Persists virtual column `_block_number` on merges.
+
+Default value: false.
--- a/docs/en/operations/settings/permissions-for-queries.md
+++ b/docs/en/operations/settings/permissions-for-queries.md
@ -48,7 +48,7 @@ Setting `readonly = 1` prohibits the user from changing settings. There is a way
 :::


-## allow_ddl {#settings_allow_ddl}
+## allow_ddl {#allow_ddl}

 Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries.

--- a/docs/en/operations/settings/query-complexity.md
+++ b/docs/en/operations/settings/query-complexity.md
@ -154,6 +154,13 @@ Result:
 Maximum query execution time in seconds.
 At this time, it is not checked for one of the sorting stages, or when merging and finalizing aggregate functions.

+The `max_execution_time` parameter can be a bit tricky to understand. 
+It operates based on interpolation relative to the current query execution speed (this behaviour is controlled by [timeout_before_checking_execution_speed](#timeout-before-checking-execution-speed)). 
+ClickHouse will interrupt a query if the projected execution time exceeds the specified `max_execution_time`.
+By default, `timeout_before_checking_execution_speed` is set to 10 seconds. This means that after 10 seconds of query execution, ClickHouse will begin estimating the total execution time.
+If, for example, `max_execution_time` is set to 3600 seconds (1 hour), ClickHouse will terminate the query if the estimated time exceeds this 3600-second limit.
+If you set `timeout_before_checking_execution_speed` to 0, ClickHouse will use clock time as the basis for `max_execution_time`.
+
 ## timeout_overflow_mode {#timeout-overflow-mode}

 What to do if the query is run longer than ‘max_execution_time’: ‘throw’ or ‘break’. By default, throw.
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -177,7 +177,7 @@ If `enable_optimize_predicate_expression = 1`, then the execution time of these

 If `enable_optimize_predicate_expression = 0`, then the execution time of the second query is much longer because the `WHERE` clause applies to all the data after the subquery finishes.

-## fallback_to_stale_replicas_for_distributed_queries {#settings-fallback_to_stale_replicas_for_distributed_queries}
+## fallback_to_stale_replicas_for_distributed_queries {#fallback_to_stale_replicas_for_distributed_queries}

 Forces a query to an out-of-date replica if updated data is not available. See [Replication](../../engines/table-engines/mergetree-family/replication.md).

@ -187,7 +187,7 @@ Used when performing `SELECT` from a distributed table that points to replicated

 By default, 1 (enabled).

-## force_index_by_date {#settings-force_index_by_date}
+## force_index_by_date {#force_index_by_date}

 Disables query execution if the index can’t be used by date.

@ -203,7 +203,7 @@ Works with tables in the MergeTree family.

 If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

-## use_skip_indexes {#settings-use_skip_indexes}
+## use_skip_indexes {#use_skip_indexes}

 Use data skipping indexes during query execution.

@ -214,7 +214,7 @@ Possible values:

 Default value: 1.

-## force_data_skipping_indices {#settings-force_data_skipping_indices}
+## force_data_skipping_indices {#force_data_skipping_indices}

 Disables query execution if passed data skipping indices wasn't used.

@ -241,7 +241,7 @@ SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_
 SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok.
 ```

-## ignore_data_skipping_indices {#settings-ignore_data_skipping_indices}
+## ignore_data_skipping_indices {#ignore_data_skipping_indices}

 Ignores the skipping indexes specified if used by the query.

@ -401,7 +401,7 @@ Enables or disables [fsync](http://pubs.opengroup.org/onlinepubs/9699919799/func

 It makes sense to disable it if the server has millions of tiny tables that are constantly being created and destroyed.

-## function_range_max_elements_in_block {#settings-function_range_max_elements_in_block}
+## function_range_max_elements_in_block {#function_range_max_elements_in_block}

 Sets the safety threshold for data volume generated by function [range](../../sql-reference/functions/array-functions.md/#range). Defines the maximum number of values generated by function per block of data (sum of array sizes for every row in a block).

@ -416,7 +416,7 @@ Default value: `500,000,000`.
 - [max_block_size](#setting-max_block_size)
 - [min_insert_block_size_rows](#min-insert-block-size-rows)

-## enable_http_compression {#settings-enable_http_compression}
+## enable_http_compression {#enable_http_compression}

 Enables or disables data compression in the response to an HTTP request.

@ -429,15 +429,15 @@ Possible values:

 Default value: 0.

-## http_zlib_compression_level {#settings-http_zlib_compression_level}
+## http_zlib_compression_level {#http_zlib_compression_level}

-Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#settings-enable_http_compression).
+Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#enable_http_compression).

 Possible values: Numbers from 1 to 9.

 Default value: 3.

-## http_native_compression_disable_checksumming_on_decompress {#settings-http_native_compression_disable_checksumming_on_decompress}
+## http_native_compression_disable_checksumming_on_decompress {#http_native_compression_disable_checksumming_on_decompress}

 Enables or disables checksum verification when decompressing the HTTP POST data from the client. Used only for ClickHouse native compression format (not used with `gzip` or `deflate`).

@ -480,7 +480,7 @@ Possible values:

 Default value: `1000`.

-## send_progress_in_http_headers {#settings-send_progress_in_http_headers}
+## send_progress_in_http_headers {#send_progress_in_http_headers}

 Enables or disables `X-ClickHouse-Progress` HTTP response headers in `clickhouse-server` responses.

@ -518,7 +518,7 @@ Possible values:

 Default value: `1`.

-## join_default_strictness {#settings-join_default_strictness}
+## join_default_strictness {#join_default_strictness}

 Sets default strictness for [JOIN clauses](../../sql-reference/statements/select/join.md/#select-join).

@ -531,7 +531,7 @@ Possible values:

 Default value: `ALL`.

-## join_algorithm {#settings-join_algorithm}
+## join_algorithm {#join_algorithm}

 Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used.

@ -547,7 +547,7 @@ Possible values:

 [Grace hash join](https://en.wikipedia.org/wiki/Hash_join#Grace_hash_join) is used.  Grace hash provides an algorithm option that provides performant complex joins while limiting memory use.

- The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way to ensure that each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join)), the number of buckets is increased and the assigned bucket for each row. Any rows which don’t belong to the current bucket are flushed and reassigned.
+ The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way to ensure that each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#max_bytes_in_join)), the number of buckets is increased and the assigned bucket for each row. Any rows which don’t belong to the current bucket are flushed and reassigned.

 Supports `INNER/LEFT/RIGHT/FULL ALL/ANY JOIN`.

@ -588,7 +588,7 @@ Possible values:
 ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.


-## join_any_take_last_row {#settings-join_any_take_last_row}
+## join_any_take_last_row {#join_any_take_last_row}

 Changes the behaviour of join operations with `ANY` strictness.

@ -607,7 +607,7 @@ See also:

 - [JOIN clause](../../sql-reference/statements/select/join.md/#select-join)
 - [Join table engine](../../engines/table-engines/special/join.md)
- [join_default_strictness](#settings-join_default_strictness)
+- [join_default_strictness](#join_default_strictness)

 ## join_use_nulls {#join_use_nulls}

@ -879,7 +879,7 @@ Possible values:

 Default value: 2013265920.

-## min_bytes_to_use_direct_io {#settings-min-bytes-to-use-direct-io}
+## min_bytes_to_use_direct_io {#min-bytes-to-use-direct-io}

 The minimum data volume required for using direct I/O access to the storage disk.

@ -917,7 +917,7 @@ Possible values:

 Default value: `1`.

-## log_queries {#settings-log-queries}
+## log_queries {#log-queries}

 Setting up query logging.

@ -929,7 +929,7 @@ Example:
 log_queries=1
 ```

-## log_queries_min_query_duration_ms {#settings-log-queries-min-query-duration-ms}
+## log_queries_min_query_duration_ms {#log-queries-min-query-duration-ms}

 If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:

@ -944,7 +944,7 @@ Only the queries with the following type will get to the log:
 - Type: milliseconds
 - Default value: 0 (any query)

-## log_queries_min_type {#settings-log-queries-min-type}
+## log_queries_min_type {#log-queries-min-type}

 `query_log` minimal type to log.

@ -962,11 +962,11 @@ Can be used to limit which entities will go to `query_log`, say you are interest
 log_queries_min_type='EXCEPTION_WHILE_PROCESSING'
 ```

-## log_query_threads {#settings-log-query-threads}
+## log_query_threads {#log-query-threads}

 Setting up query threads logging.

-Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
+Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#query_thread_log) server configuration parameter.

 Possible values:

@ -981,7 +981,7 @@ Default value: `1`.
 log_query_threads=1
 ```

-## log_query_views {#settings-log-query-views}
+## log_query_views {#log-query-views}

 Setting up query views logging.

@ -993,7 +993,7 @@ Example:
 log_query_views=1
 ```

-## log_formatted_queries {#settings-log-formatted-queries}
+## log_formatted_queries {#log-formatted-queries}

 Allows to log formatted queries to the [system.query_log](../../operations/system-tables/query_log.md) system table (populates `formatted_query` column in the [system.query_log](../../operations/system-tables/query_log.md)).

@ -1004,7 +1004,7 @@ Possible values:

 Default value: `0`.

-## log_comment {#settings-log-comment}
+## log_comment {#log-comment}

 Specifies the value for the `log_comment` field of the [system.query_log](../system-tables/query_log.md) table and comment text for the server log.

@ -1012,7 +1012,7 @@ It can be used to improve the readability of server logs. Additionally, it helps

 Possible values:

- Any string no longer than [max_query_size](#settings-max_query_size). If the max_query_size is exceeded, the server throws an exception.
+- Any string no longer than [max_query_size](#max_query_size). If the max_query_size is exceeded, the server throws an exception.

 Default value: empty string.

@ -1036,7 +1036,7 @@ Result:
 └─────────────┴───────────┘
 ```

-## log_processors_profiles {#settings-log_processors_profiles}
+## log_processors_profiles {#log_processors_profiles}

 Write time that processor spent during execution/waiting for data to `system.processors_profile_log` table.

@ -1045,7 +1045,7 @@ See also:
 - [`system.processors_profile_log`](../../operations/system-tables/processors_profile_log.md)
 - [`EXPLAIN PIPELINE`](../../sql-reference/statements/explain.md#explain-pipeline)

-## max_insert_block_size {#settings-max_insert_block_size}
+## max_insert_block_size {#max_insert_block_size}

 The size of blocks (in a count of rows) to form for insertion into a table.
 This setting only applies in cases when the server forms the blocks.
@ -1079,7 +1079,7 @@ Possible values:

 Default value: 268435456.

-## max_replica_delay_for_distributed_queries {#settings-max_replica_delay_for_distributed_queries}
+## max_replica_delay_for_distributed_queries {#max_replica_delay_for_distributed_queries}

 Disables lagging replicas for distributed queries. See [Replication](../../engines/table-engines/mergetree-family/replication.md).

@ -1096,7 +1096,7 @@ Default value: 300.

 Used when performing `SELECT` from a distributed table that points to replicated tables.

-## max_threads {#settings-max_threads}
+## max_threads {#max_threads}

 The maximum number of query processing threads, excluding threads for retrieving data from remote servers (see the ‘max_distributed_connections’ parameter).

@ -1109,7 +1109,7 @@ For queries that are completed quickly because of a LIMIT, you can set a lower

 The smaller the `max_threads` value, the less memory is consumed.

-## max_insert_threads {#settings-max-insert-threads}
+## max_insert_threads {#max-insert-threads}

 The maximum number of threads to execute the `INSERT SELECT` query.

@ -1120,7 +1120,7 @@ Possible values:

 Default value: 0.

-Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#settings-max_threads) setting.
+Parallel `INSERT SELECT` has effect only if the `SELECT` part is executed in parallel, see [max_threads](#max_threads) setting.
 Higher values will lead to higher memory usage.

 ## max_compress_block_size {#max-compress-block-size}
@ -1149,7 +1149,7 @@ We are writing a URL column with the String type (average size of 60 bytes per v
 This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse.
 :::

-## max_query_size {#settings-max_query_size}
+## max_query_size {#max_query_size}

 The maximum number of bytes of a query string parsed by the SQL parser.
 Data in the VALUES clause of INSERT queries is processed by a separate stream parser (that consumes O(1) RAM) and not affected by this restriction.
@ -1393,7 +1393,7 @@ Default value: 5000.

 ## stream_flush_interval_ms {#stream-flush-interval-ms}

-Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#settings-max_insert_block_size) rows.
+Works for tables with streaming in the case of a timeout, or when a thread generates [max_insert_block_size](#max_insert_block_size) rows.

 The default value is 7500.

@ -1405,7 +1405,7 @@ Timeout for polling data from/to streaming storages.

 Default value: 500.

-## load_balancing {#settings-load_balancing}
+## load_balancing {#load_balancing}

 Specifies the algorithm of replicas selection that is used for distributed query processing.

@ -1419,7 +1419,7 @@ ClickHouse supports the following algorithms of choosing replicas:

 See also:

- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors)
+- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors)

 ### Random (by Default) {#load_balancing-random}

@ -1473,20 +1473,20 @@ load_balancing = round_robin

 This algorithm uses a round-robin policy across replicas with the same number of errors (only the queries with `round_robin` policy is accounted).

-## prefer_localhost_replica {#settings-prefer-localhost-replica}
+## prefer_localhost_replica {#prefer-localhost-replica}

 Enables/disables preferable using the localhost replica when processing distributed queries.

 Possible values:

 - 1 — ClickHouse always sends a query to the localhost replica if it exists.
- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#settings-load_balancing) setting.
+- 0 — ClickHouse uses the balancing strategy specified by the [load_balancing](#load_balancing) setting.

 Default value: 1.

 :::note
-Disable this setting if you use [max_parallel_replicas](#settings-max_parallel_replicas) without [parallel_replicas_custom_key](#settings-parallel_replicas_custom_key).
-If [parallel_replicas_custom_key](#settings-parallel_replicas_custom_key) is set, disable this setting only if it's used on a cluster with multiple shards containing multiple replicas.
+Disable this setting if you use [max_parallel_replicas](#max_parallel_replicas) without [parallel_replicas_custom_key](#parallel_replicas_custom_key).
+If [parallel_replicas_custom_key](#parallel_replicas_custom_key) is set, disable this setting only if it's used on a cluster with multiple shards containing multiple replicas.
 If it's used on a cluster with a single shard and multiple replicas, disabling this setting will have negative effects.
 :::

@ -1500,7 +1500,7 @@ See the section “WITH TOTALS modifier”.
 The threshold for `totals_mode = 'auto'`.
 See the section “WITH TOTALS modifier”.

-## max_parallel_replicas {#settings-max_parallel_replicas}
+## max_parallel_replicas {#max_parallel_replicas}

 The maximum number of replicas for each shard when executing a query.

@ -1527,23 +1527,23 @@ A query may be processed faster if it is executed on several servers in parallel
 - The sampling key is an expression that is expensive to calculate.
 - The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency.

-### Parallel processing using [parallel_replicas_custom_key](#settings-parallel_replicas_custom_key)
+### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key)

 This setting is useful for any replicated table.

-## parallel_replicas_custom_key {#settings-parallel_replicas_custom_key}
+## parallel_replicas_custom_key {#parallel_replicas_custom_key}

 An arbitrary integer expression that can be used to split work between replicas for a specific table.
 The value can be any integer expression.
-A query may be processed faster if it is executed on several servers in parallel but it depends on the used [parallel_replicas_custom_key](#settings-parallel_replicas_custom_key)
-and [parallel_replicas_custom_key_filter_type](#settings-parallel_replicas_custom_key_filter_type).
+A query may be processed faster if it is executed on several servers in parallel but it depends on the used [parallel_replicas_custom_key](#parallel_replicas_custom_key)
+and [parallel_replicas_custom_key_filter_type](#parallel_replicas_custom_key_filter_type).

 Simple expressions using primary keys are preferred.

 If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards.
 Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard.

-## parallel_replicas_custom_key_filter_type {#settings-parallel_replicas_custom_key_filter_type}
+## parallel_replicas_custom_key_filter_type {#parallel_replicas_custom_key_filter_type}

 How to use `parallel_replicas_custom_key` expression for splitting work between replicas.

@ -1637,7 +1637,7 @@ Possible values:

 Default value: `1`.

-## query_cache_store_results_of_queries_with_nondeterministic_functions {#query--store-results-of-queries-with-nondeterministic-functions}
+## query_cache_store_results_of_queries_with_nondeterministic_functions {#query-cache-store-results-of-queries-with-nondeterministic-functions}

 If turned on, then results of `SELECT` queries with non-deterministic functions (e.g. `rand()`, `now()`) can be cached in the [query cache](../query-cache.md).

@ -1732,7 +1732,7 @@ Possible values:

 Default value: 0 (no restriction).

-## insert_quorum {#settings-insert_quorum}
+## insert_quorum {#insert_quorum}

 Enables the quorum writes.

@ -1746,7 +1746,7 @@ Quorum writes

 `INSERT` succeeds only when ClickHouse manages to correctly write data to the `insert_quorum` of replicas during the `insert_quorum_timeout`. If for any reason the number of replicas with successful writes does not reach the `insert_quorum`, the write is considered failed and ClickHouse will delete the inserted block from all the replicas where data has already been written.

-When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#settings-select_sequential_consistency).
+When `insert_quorum_parallel` is disabled, all replicas in the quorum are consistent, i.e. they contain data from all previous `INSERT` queries (the `INSERT` sequence is linearized). When reading data written using `insert_quorum` and `insert_quorum_parallel` is disabled, you can turn on sequential consistency for `SELECT` queries using [select_sequential_consistency](#select_sequential_consistency).

 ClickHouse generates an exception:

@ -1755,11 +1755,11 @@ ClickHouse generates an exception:

 See also:

- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
+- [insert_quorum_timeout](#insert_quorum_timeout)
+- [insert_quorum_parallel](#insert_quorum_parallel)
+- [select_sequential_consistency](#select_sequential_consistency)

-## insert_quorum_timeout {#settings-insert_quorum_timeout}
+## insert_quorum_timeout {#insert_quorum_timeout}

 Write to a quorum timeout in milliseconds. If the timeout has passed and no write has taken place yet, ClickHouse will generate an exception and the client must repeat the query to write the same block to the same or any other replica.

@ -1767,11 +1767,11 @@ Default value: 600 000 milliseconds (ten minutes).

 See also:

- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
- [select_sequential_consistency](#settings-select_sequential_consistency)
+- [insert_quorum](#insert_quorum)
+- [insert_quorum_parallel](#insert_quorum_parallel)
+- [select_sequential_consistency](#select_sequential_consistency)

-## insert_quorum_parallel {#settings-insert_quorum_parallel}
+## insert_quorum_parallel {#insert_quorum_parallel}

 Enables or disables parallelism for quorum `INSERT` queries. If enabled, additional `INSERT` queries can be sent while previous queries have not yet finished. If disabled, additional writes to the same table will be rejected.

@ -1784,11 +1784,11 @@ Default value: 1.

 See also:

- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [select_sequential_consistency](#settings-select_sequential_consistency)
+- [insert_quorum](#insert_quorum)
+- [insert_quorum_timeout](#insert_quorum_timeout)
+- [select_sequential_consistency](#select_sequential_consistency)

-## select_sequential_consistency {#settings-select_sequential_consistency}
+## select_sequential_consistency {#select_sequential_consistency}

 Enables or disables sequential consistency for `SELECT` queries. Requires `insert_quorum_parallel` to be disabled (enabled by default).

@ -1807,11 +1807,11 @@ When `insert_quorum_parallel` is enabled (the default), then `select_sequential_

 See also:

- [insert_quorum](#settings-insert_quorum)
- [insert_quorum_timeout](#settings-insert_quorum_timeout)
- [insert_quorum_parallel](#settings-insert_quorum_parallel)
+- [insert_quorum](#insert_quorum)
+- [insert_quorum_timeout](#insert_quorum_timeout)
+- [insert_quorum_parallel](#insert_quorum_parallel)

-## insert_deduplicate {#settings-insert-deduplicate}
+## insert_deduplicate {#insert-deduplicate}

 Enables or disables block deduplication of `INSERT` (for Replicated\* tables).

@ -1938,7 +1938,7 @@ For the replicated tables, by default, only 10000 of the most recent inserts for
 We recommend enabling the [async_block_ids_cache](merge-tree-settings.md/#use-async-block-ids-cache) to increase the efficiency of deduplication.
 This function does not work for non-replicated tables.

-## deduplicate_blocks_in_dependent_materialized_views {#settings-deduplicate-blocks-in-dependent-materialized-views}
+## deduplicate_blocks_in_dependent_materialized_views {#deduplicate-blocks-in-dependent-materialized-views}

 Enables or disables the deduplication check for materialized views that receive data from Replicated\* tables.

@ -2048,7 +2048,7 @@ Possible values:

 Default value: 10000

-## max_network_bytes {#settings-max-network-bytes}
+## max_network_bytes {#max-network-bytes}

 Limits the data volume (in bytes) that is received or transmitted over the network when executing a query. This setting applies to every individual query.

@ -2059,7 +2059,7 @@ Possible values:

 Default value: 0.

-## max_network_bandwidth {#settings-max-network-bandwidth}
+## max_network_bandwidth {#max-network-bandwidth}

 Limits the speed of the data exchange over the network in bytes per second. This setting applies to every query.

@ -2070,7 +2070,7 @@ Possible values:

 Default value: 0.

-## max_network_bandwidth_for_user {#settings-max-network-bandwidth-for-user}
+## max_network_bandwidth_for_user {#max-network-bandwidth-for-user}

 Limits the speed of the data exchange over the network in bytes per second. This setting applies to all concurrently running queries performed by a single user.

@ -2081,7 +2081,7 @@ Possible values:

 Default value: 0.

-## max_network_bandwidth_for_all_users {#settings-max-network-bandwidth-for-all-users}
+## max_network_bandwidth_for_all_users {#max-network-bandwidth-for-all-users}

 Limits the speed that data is exchanged at over the network in bytes per second. This setting applies to all concurrently running queries on the server.

@ -2092,7 +2092,7 @@ Possible values:

 Default value: 0.

-## count_distinct_implementation {#settings-count_distinct_implementation}
+## count_distinct_implementation {#count_distinct_implementation}

 Specifies which of the `uniq*` functions should be used to perform the [COUNT(DISTINCT …)](../../sql-reference/aggregate-functions/reference/count.md/#agg_function-count) construction.

@ -2106,7 +2106,7 @@ Possible values:

 Default value: `uniqExact`.

-## skip_unavailable_shards {#settings-skip_unavailable_shards}
+## skip_unavailable_shards {#skip_unavailable_shards}

 Enables or disables silently skipping of unavailable shards.

@ -2270,7 +2270,7 @@ Possible values:

 Default value: 0

-## force_optimize_skip_unused_shards_nesting {#settings-force_optimize_skip_unused_shards_nesting}
+## force_optimize_skip_unused_shards_nesting {#force_optimize_skip_unused_shards_nesting}

 Controls [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards) (hence still requires [`force_optimize_skip_unused_shards`](#force-optimize-skip-unused-shards)) depends on the nesting level of the distributed query (case when you have `Distributed` table that look into another `Distributed` table).

@ -2400,7 +2400,7 @@ Enables caching of rows number during count from files in table functions `file`

 Enabled by default.

-## distributed_replica_error_half_life {#settings-distributed_replica_error_half_life}
+## distributed_replica_error_half_life {#distributed_replica_error_half_life}

 - Type: seconds
 - Default value: 60 seconds
@ -2411,10 +2411,10 @@ See also:

 - [load_balancing](#load_balancing-round_robin)
 - [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed_replica_error_cap](#settings-distributed_replica_error_cap)
- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors)
+- [distributed_replica_error_cap](#distributed_replica_error_cap)
+- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors)

-## distributed_replica_error_cap {#settings-distributed_replica_error_cap}
+## distributed_replica_error_cap {#distributed_replica_error_cap}

 - Type: unsigned int
 - Default value: 1000
@ -2425,10 +2425,10 @@ See also:

 - [load_balancing](#load_balancing-round_robin)
 - [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life)
- [distributed_replica_max_ignored_errors](#settings-distributed_replica_max_ignored_errors)
+- [distributed_replica_error_half_life](#distributed_replica_error_half_life)
+- [distributed_replica_max_ignored_errors](#distributed_replica_max_ignored_errors)

-## distributed_replica_max_ignored_errors {#settings-distributed_replica_max_ignored_errors}
+## distributed_replica_max_ignored_errors {#distributed_replica_max_ignored_errors}

 - Type: unsigned int
 - Default value: 0
@ -2439,7 +2439,7 @@ See also:

 - [load_balancing](#load_balancing-round_robin)
 - [Table engine Distributed](../../engines/table-engines/special/distributed.md)
- [distributed_replica_error_cap](#settings-distributed_replica_error_cap)
+- [distributed_replica_error_cap](#distributed_replica_error_cap)
- [distributed_replica_error_half_life](#settings-distributed_replica_error_half_life)
+- [distributed_replica_error_half_life](#distributed_replica_error_half_life)

 ## distributed_directory_monitor_sleep_time_ms {#distributed_directory_monitor_sleep_time_ms}
@ -2595,7 +2595,7 @@ Possible values:

 Default value: 0.

-## allow_introspection_functions {#settings-allow_introspection_functions}
+## allow_introspection_functions {#allow_introspection_functions}

 Enables or disables [introspection functions](../../sql-reference/functions/introspection.md) for query profiling.

@ -3136,7 +3136,7 @@ Do not enable this feature in version `<= 21.8`. It's not properly implemented a
 ## aggregate_functions_null_for_empty {#aggregate_functions_null_for_empty}

 Enables or disables rewriting all aggregate functions in a query, adding [-OrNull](../../sql-reference/aggregate-functions/combinators.md/#agg-functions-combinator-ornull) suffix to them. Enable it for SQL standard compatibility.
-It is implemented via query rewrite (similar to [count_distinct_implementation](#settings-count_distinct_implementation) setting) to get consistent results for distributed queries.
+It is implemented via query rewrite (similar to [count_distinct_implementation](#count_distinct_implementation) setting) to get consistent results for distributed queries.

 Possible values:

@ -4609,7 +4609,7 @@ Default: 0

 ## rewrite_count_distinct_if_with_count_distinct_implementation

-Allows you to rewrite `countDistcintIf` with [count_distinct_implementation](#settings-count_distinct_implementation) setting.
+Allows you to rewrite `countDistinctIf` with [count_distinct_implementation](#count_distinct_implementation) setting.

 Possible values:

--- a/docs/en/sql-reference/data-types/array.md
+++ b/docs/en/sql-reference/data-types/array.md
@ -4,7 +4,7 @@ sidebar_position: 52
 sidebar_label: Array(T)
 ---

-# Array(t)
+# Array(T)

 An array of `T`-type items, with the starting array index as 1. `T` can be any data type, including an array.

--- a/docs/en/sql-reference/dictionaries/index.md
+++ b/docs/en/sql-reference/dictionaries/index.md
@ -123,7 +123,7 @@ LAYOUT(...) -- Memory layout configuration
 LIFETIME(...) -- Lifetime of dictionary in memory
 ```

-## Storing Dictionaries in Memory {#storig-dictionaries-in-memory}
+## Storing Dictionaries in Memory {#storing-dictionaries-in-memory}

 There are a variety of ways to store dictionaries in memory.

--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -657,7 +657,7 @@ SELECT arraySlice([1, 2, NULL, 4, 5], 2, 3) AS res;

 Array elements set to `NULL` are handled as normal values.

-## arraySort(\[func,\] arr, …) {#array_functions-sort}
+## arraySort(\[func,\] arr, …) {#sort}

 Sorts the elements of the `arr` array in ascending order. If the `func` function is specified, sorting order is determined by the result of the `func` function applied to the elements of the array. If `func` accepts multiple arguments, the `arraySort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arraySort` description.

@ -716,7 +716,7 @@ SELECT arraySort((x) -> -x, [1, 2, 3]) as res;
 └─────────┘
 ```

-For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#array_functions-reverse-sort) in a sorting.
+For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#reverse-sort) in a sorting.

 The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example:

@ -762,7 +762,7 @@ To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia.

 Same as `arraySort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order.

-## arrayReverseSort(\[func,\] arr, …) {#array_functions-reverse-sort}
+## arrayReverseSort(\[func,\] arr, …) {#reverse-sort}

 Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description.

--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@ -239,7 +239,7 @@ int32samoa: 1546300800

 **See Also**

- [formatDateTime](#date_time_functions-formatDateTime) - supports non-constant timezone.
+- [formatDateTime](#formatDateTime) - supports non-constant timezone.
 - [toString](type-conversion-functions.md#tostring) - supports non-constant timezone.

 ## timeZoneOf
@ -1274,7 +1274,7 @@ Alias: `SUBDATE`
 **See Also**
 - [date_sub](#date_sub)

-## now
+## now {#now}

 Returns the current date and time at the moment of query analysis. The function is a constant expression.

@ -1361,7 +1361,7 @@ Result:
 └─────────────────────────┴───────────────────────────────┘
 ```

-## nowInBlock
+## nowInBlock {#nowInBlock}

 Returns the current date and time at the moment of processing of each block of data. In contrast to the function [now](#now), it is not a constant expression, and the returned value will be different in different blocks for long-running queries.

@ -1405,14 +1405,14 @@ Result:
 └─────────────────────┴─────────────────────┴──────────┘
 ```

-## today
+## today {#today}

 Accepts zero arguments and returns the current date at one of the moments of query analysis.
 The same as ‘toDate(now())’.

 Aliases: `curdate`, `current_date`.

-## yesterday
+## yesterday {#yesterday}

 Accepts zero arguments and returns yesterday’s date at one of the moments of query analysis.
 The same as ‘today() - 1’.
@ -1628,7 +1628,7 @@ SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64
 └───────────────────────────────────────────────────────────────────────────────────────────────────────────┘
 ```

-## formatDateTime {#date_time_functions-formatDateTime}
+## formatDateTime {#formatDateTime}

 Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column.

@ -1753,7 +1753,7 @@ LIMIT 10
- [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax)
+- [formatDateTimeInJodaSyntax](#formatDateTimeInJodaSyntax)


-## formatDateTimeInJodaSyntax {#date_time_functions-formatDateTimeInJodaSyntax}
+## formatDateTimeInJodaSyntax {#formatDateTimeInJodaSyntax}

 Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html.

--- a/docs/en/sql-reference/functions/ext-dict-functions.md
+++ b/docs/en/sql-reference/functions/ext-dict-functions.md
@ -12,7 +12,7 @@ For dictionaries created with [DDL queries](../../sql-reference/statements/creat

 For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/index.md).

-## dictGet, dictGetOrDefault, dictGetOrNull
+## dictGet, dictGetOrDefault, dictGetOrNull {#dictGet}

 Retrieves values from a dictionary.

--- a/docs/en/sql-reference/functions/hash-functions.md
+++ b/docs/en/sql-reference/functions/hash-functions.md
@ -19,7 +19,7 @@ halfMD5(par1, ...)
 ```

 The function is relatively slow (5 million short strings per second per processor core).
-Consider using the [sipHash64](#hash_functions-siphash64) function instead.
+Consider using the [sipHash64](#siphash64) function instead.

 **Arguments**

@ -45,13 +45,13 @@ SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00')

 Calculates the MD4 from a string and returns the resulting set of bytes as FixedString(16).

-## MD5 {#hash_functions-md5}
+## MD5 {#md5}

 Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16).
 If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead.
 If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))).

-## sipHash64 {#hash_functions-siphash64}
+## sipHash64 {#siphash64}

 Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value.

@ -59,7 +59,7 @@ Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value.
 sipHash64(par1,...)
 ```

-This is a cryptographic hash function. It works at least three times faster than the [MD5](#hash_functions-md5) hash function.
+This is a cryptographic hash function. It works at least three times faster than the [MD5](#md5) hash function.

 The function [interprets](/docs/en/sql-reference/functions/type-conversion-functions.md/#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. It then combines the hashes by the following algorithm:

@ -91,7 +91,7 @@ SELECT sipHash64(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00

 ## sipHash64Keyed

-Same as [sipHash64](#hash_functions-siphash64) but additionally takes an explicit key argument instead of using a fixed key.
+Same as [sipHash64](#siphash64) but additionally takes an explicit key argument instead of using a fixed key.

 **Syntax**

@ -101,7 +101,7 @@ sipHash64Keyed((k0, k1), par1,...)

 **Arguments**

-Same as [sipHash64](#hash_functions-siphash64), but the first argument is a tuple of two UInt64 values representing the key.
+Same as [sipHash64](#siphash64), but the first argument is a tuple of two UInt64 values representing the key.

 **Returned value**

@ -123,12 +123,12 @@ SELECT sipHash64Keyed((506097522914230528, 1084818905618843912), array('e','x','

 ## sipHash128

-Like [sipHash64](#hash_functions-siphash64) but produces a 128-bit hash value, i.e. the final xor-folding state is done up to 128 bits.
+Like [sipHash64](#siphash64) but produces a 128-bit hash value, i.e. the final xor-folding state is done up to 128 bits.

 :::note
 This 128-bit variant differs from the reference implementation and it's weaker.
 This version exists because, when it was written, there was no official 128-bit extension for SipHash.
-New projects should probably use [sipHash128Reference](#hash_functions-siphash128reference).
+New projects should probably use [sipHash128Reference](#siphash128reference).
 :::

 **Syntax**
@ -139,7 +139,7 @@ sipHash128(par1,...)

 **Arguments**

-Same as for [sipHash64](#hash_functions-siphash64).
+Same as for [sipHash64](#siphash64).

 **Returned value**

@ -163,12 +163,12 @@ Result:

 ## sipHash128Keyed

-Same as [sipHash128](#hash_functions-siphash128) but additionally takes an explicit key argument instead of using a fixed key.
+Same as [sipHash128](#siphash128) but additionally takes an explicit key argument instead of using a fixed key.

 :::note
 This 128-bit variant differs from the reference implementation and it's weaker.
 This version exists because, when it was written, there was no official 128-bit extension for SipHash.
-New projects should probably use [sipHash128ReferenceKeyed](#hash_functions-siphash128referencekeyed).
+New projects should probably use [sipHash128ReferenceKeyed](#siphash128referencekeyed).
 :::

 **Syntax**
@ -179,7 +179,7 @@ sipHash128Keyed((k0, k1), par1,...)

 **Arguments**

-Same as [sipHash128](#hash_functions-siphash128), but the first argument is a tuple of two UInt64 values representing the key.
+Same as [sipHash128](#siphash128), but the first argument is a tuple of two UInt64 values representing the key.

 **Returned value**

@ -203,7 +203,7 @@ Result:

 ## sipHash128Reference

-Like [sipHash128](#hash_functions-siphash128) but implements the 128-bit algorithm from the original authors of SipHash.
+Like [sipHash128](#siphash128) but implements the 128-bit algorithm from the original authors of SipHash.

 **Syntax**

@ -213,7 +213,7 @@ sipHash128Reference(par1,...)

 **Arguments**

-Same as for [sipHash128](#hash_functions-siphash128).
+Same as for [sipHash128](#siphash128).

 **Returned value**

@ -237,7 +237,7 @@ Result:

 ## sipHash128ReferenceKeyed

-Same as [sipHash128Reference](#hash_functions-siphash128reference) but additionally takes an explicit key argument instead of using a fixed key.
+Same as [sipHash128Reference](#siphash128reference) but additionally takes an explicit key argument instead of using a fixed key.

 **Syntax**

@ -247,7 +247,7 @@ sipHash128ReferenceKeyed((k0, k1), par1,...)

 **Arguments**

-Same as [sipHash128Reference](#hash_functions-siphash128reference), but the first argument is a tuple of two UInt64 values representing the key.
+Same as [sipHash128Reference](#siphash128reference), but the first argument is a tuple of two UInt64 values representing the key.

 **Returned value**

@ -536,7 +536,7 @@ Calculates `HiveHash` from a string.
 SELECT hiveHash('')
 ```

-This is just [JavaHash](#hash_functions-javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. This hash function is neither fast nor having a good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result.
+This is just [JavaHash](#javahash) with zeroed out sign bit. This function is used in [Apache Hive](https://en.wikipedia.org/wiki/Apache_Hive) for versions before 3.0. This hash function is neither fast nor of good quality. The only reason to use it is when this algorithm is already used in another system and you have to calculate exactly the same result.

 **Returned value**

--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@ -11,7 +11,7 @@ sidebar_label: Other
 Returns the name of the host on which this function was executed. If the function executes on a remote server (distributed processing), the remote server name is returned.
 If the function executes in the context of a distributed table, it generates a normal column with values relevant to each shard. Otherwise it produces a constant value.

-## getMacro
+## getMacro {#getMacro}

 Returns a named value from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration.

@ -186,7 +186,7 @@ Returns the type name of the passed argument.

 If `NULL` is passed, then the function returns type `Nullable(Nothing)`, which corresponds to ClickHouse's internal `NULL` representation.

-## blockSize()
+## blockSize() {#blockSize}

 In ClickHouse, queries are processed in blocks (chunks).
 This function returns the size (row count) of the block the function is called on.
@ -311,7 +311,7 @@ Sleeps ‘seconds’ seconds for each row. The sleep time can be specified as in
 Returns the name of the current database.
 Useful in table engine parameters of `CREATE TABLE` queries where you need to specify the database.

-## currentUser()
+## currentUser() {#currentUser}

 Returns the name of the current user. In case of a distributed query, the name of the user who initiated the query is returned.

@ -771,7 +771,7 @@ If executed in the context of a distributed table, this function generates a nor

 Returns the sequence number of the data block where the row is located.

-## rowNumberInBlock()
+## rowNumberInBlock() {#rowNumberInBlock}

 Returns the ordinal number of the row in the data block. Different data blocks are always recalculated.

@ -896,7 +896,7 @@ Result:
 └────────────┴───────┴───────────┴────────────────┘
 ```

-## runningDifference(x)
+## runningDifference(x) {#runningDifference}

 Calculates the difference between two consecutive row values in the data block.
 Returns 0 for the first row, and for subsequent rows the difference to the previous row.
@ -2274,7 +2274,7 @@ Result:
 └───────────────────────────┘
 ```

-## queryID
+## queryID {#queryID}

 Returns the ID of the current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `query_id`.

--- a/docs/en/sql-reference/functions/random-functions.md
+++ b/docs/en/sql-reference/functions/random-functions.md
@ -478,7 +478,7 @@ Result:
 └─────────────────────┘
 ```

-## randomString
+## randomString {#randomString}

 Generates a string of the specified length filled with random bytes (including zero bytes). Not all characters may be printable.

@ -627,7 +627,7 @@ Result:
 └──────────────────────┘
 ```

-## fuzzBits
+## fuzzBits {#fuzzBits}

 **Syntax**

--- a/docs/en/sql-reference/statements/system.md
+++ b/docs/en/sql-reference/statements/system.md
@ -340,6 +340,15 @@ After running this statement the `[db.]replicated_merge_tree_family_table_name`
 - If a `LIGHTWEIGHT` modifier was specified then the query waits only for `GET_PART`, `ATTACH_PART`, `DROP_RANGE`, `REPLACE_RANGE` and `DROP_PART` entries to be processed.
 - If a `PULL` modifier was specified then the query pulls new replication queue entries from ZooKeeper, but does not wait for anything to be processed.

+### SYNC DATABASE REPLICA
+
+Waits until the specified [replicated database](../../engines/database-engines/replicated.md) applies all schema changes from the DDL queue of that database.
+
+**Syntax**
+```sql
+SYSTEM SYNC DATABASE REPLICA replicated_database_name;
+```
+
 ### RESTART REPLICA

 Provides possibility to reinitialize Zookeeper session's state for `ReplicatedMergeTree` table, will compare current state with Zookeeper as source of truth and add tasks to Zookeeper queue if needed.
--- a/programs/diagnostics/internal/platform/data/file_test.go
+++ b/programs/diagnostics/internal/platform/data/file_test.go
@ -135,7 +135,7 @@ func TestConfigFileFrameCopy(t *testing.T) {
 		sizes := map[string]int64{
 			"users.xml":            int64(2017),
 			"default-password.xml": int64(188),
-			"config.xml":           int64(59506),
+			"config.xml":           int64(59377),
 			"server-include.xml":   int64(168),
 			"user-include.xml":     int64(559),
 		}
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@ -32,6 +32,8 @@
 #include <Common/randomSeed.h>
 #include <Common/ThreadPool.h>
 #include <Loggers/Loggers.h>
+#include <Loggers/OwnFormattingChannel.h>
+#include <Loggers/OwnPatternFormatter.h>
 #include <IO/ReadBufferFromFile.h>
 #include <IO/ReadBufferFromString.h>
 #include <IO/WriteBufferFromFileDescriptor.h>
@ -599,7 +601,9 @@ void LocalServer::processConfig()
    {
        auto poco_logs_level = Poco::Logger::parseLevel(level);
        Poco::Logger::root().setLevel(poco_logs_level);
-        Poco::Logger::root().setChannel(Poco::AutoPtr<Poco::SimpleFileChannel>(new Poco::SimpleFileChannel(server_logs_file)));
+        Poco::AutoPtr<OwnPatternFormatter> pf = new OwnPatternFormatter;
+        Poco::AutoPtr<OwnFormattingChannel> log = new OwnFormattingChannel(pf, new Poco::SimpleFileChannel(server_logs_file));
+        Poco::Logger::root().setChannel(log);
        logging_initialized = true;
    }
    else if (logging || is_interactive)
--- a/src/Backups/BackupIO_S3.cpp
+++ b/src/Backups/BackupIO_S3.cpp
@ -49,6 +49,7 @@ namespace
            settings.auth_settings.region,
            context->getRemoteHostFilter(),
            static_cast<unsigned>(context->getGlobalContext()->getSettingsRef().s3_max_redirects),
+            static_cast<unsigned>(context->getGlobalContext()->getSettingsRef().s3_retry_attempts),
            context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging,
            /* for_disk_s3 = */ false, settings.request_settings.get_request_throttler, settings.request_settings.put_request_throttler,
            s3_uri.uri.getScheme());
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -585,6 +585,8 @@
    M(700, USER_SESSION_LIMIT_EXCEEDED)  \
    M(701, CLUSTER_DOESNT_EXIST) \
    M(702, CLIENT_INFO_DOES_NOT_MATCH) \
+    M(703, INVALID_IDENTIFIER) \
+    M(704, CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Coordination/KeeperSnapshotManagerS3.cpp
+++ b/src/Coordination/KeeperSnapshotManagerS3.cpp
@ -80,6 +80,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo
        auto headers = auth_settings.headers;

        static constexpr size_t s3_max_redirects = 10;
+        static constexpr size_t s3_retry_attempts = 10;
        static constexpr bool enable_s3_requests_logging = false;

        if (!new_uri.key.empty())
@ -90,7 +91,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo

        S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
            auth_settings.region,
-            RemoteHostFilter(), s3_max_redirects,
+            RemoteHostFilter(), s3_max_redirects, s3_retry_attempts,
            enable_s3_requests_logging,
            /* for_disk_s3 = */ false, /* get_request_throttler = */ {}, /* put_request_throttler = */ {},
            new_uri.uri.getScheme());
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -169,7 +169,7 @@ class IColumn;
    M(String, parallel_replicas_custom_key, "", "Custom key assigning work to replicas when parallel replicas are used.", 0) \
    M(ParallelReplicasCustomKeyFilterType, parallel_replicas_custom_key_filter_type, ParallelReplicasCustomKeyFilterType::DEFAULT, "Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key.", 0) \
    \
-    M(String, cluster_for_parallel_replicas, "default", "Cluster for a shard in which current server is located", 0) \
+    M(String, cluster_for_parallel_replicas, "", "Cluster for a shard in which current server is located", 0) \
    M(UInt64, allow_experimental_parallel_reading_from_replicas, 0, "Use all the replicas from a shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure", 0) \
    M(Float, parallel_replicas_single_task_marks_count_multiplier, 2, "A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas.", 0) \
    M(Bool, parallel_replicas_for_non_replicated_merge_tree, false, "If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables", 0) \
@ -796,7 +796,7 @@ class IColumn;
    M(UInt64, grace_hash_join_initial_buckets, 1, "Initial number of grace hash join buckets", 0) \
    M(UInt64, grace_hash_join_max_buckets, 1024, "Limit on the number of grace hash join buckets", 0) \
    M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
-    M(Bool, allow_experimental_undrop_table_query, false, "Allow to use undrop query to restore dropped table in a limited time", 0) \
+    M(Bool, allow_experimental_undrop_table_query, true, "Allow to use undrop query to restore dropped table in a limited time", 0) \
    M(Bool, keeper_map_strict_mode, false, "Enforce additional checks during operations on KeeperMap. E.g. throw an exception on an insert for already existing key", 0) \
    M(UInt64, extract_kvp_max_pairs_per_row, 1000, "Max number pairs that can be produced by extractKeyValuePairs function. Used to safeguard against consuming too much memory.", 0) \
    M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \
--- a/src/DataTypes/Utils.cpp
+++ b/src/DataTypes/Utils.cpp
@ -0,0 +1,231 @@
+#include <DataTypes/Utils.h>
+#include <DataTypes/DataTypesDecimal.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypeLowCardinality.h>
+#include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypeMap.h>
+#include <DataTypes/DataTypeTuple.h>
+
+namespace DB
+{
+
+/// Returns true if a value of from_type can be converted to to_type according to the
+/// rules listed in the switch below, false otherwise.
+///
+/// Nullable and LowCardinality wrappers of to_type are removed before the comparison;
+/// whether to_type was nullable is remembered because it matters for Nullable sources.
+/// Composite types (Array, Map, Tuple) are checked recursively, element by element.
+bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_type)
+{
+    auto from_which_type = WhichDataType(from_type->getTypeId());
+    bool to_type_was_nullable = isNullableOrLowCardinalityNullable(to_type);
+    auto to_type_unwrapped = removeNullable(removeLowCardinality(to_type));
+
+    /// Identical types (after unwrapping the destination) need no further analysis.
+    if (from_type->equals(*to_type_unwrapped))
+        return true;
+
+    auto to_which_type = WhichDataType(to_type_unwrapped->getTypeId());
+
+    switch (from_which_type.idx)
+    {
+        case TypeIndex::UInt8:
+        case TypeIndex::UInt16:
+        case TypeIndex::UInt32:
+        case TypeIndex::UInt64:
+        case TypeIndex::UInt128:
+        case TypeIndex::UInt256:
+        {
+            /// Unsigned -> unsigned is accepted only when the destination is at least as wide.
+            if (to_which_type.isUInt() &&
+                to_type_unwrapped->getSizeOfValueInMemory() >= from_type->getSizeOfValueInMemory())
+                return true;
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Int8:
+        case TypeIndex::Int16:
+        case TypeIndex::Int32:
+        case TypeIndex::Int64:
+        case TypeIndex::Int128:
+        case TypeIndex::Int256:
+        {
+            /// Signed -> signed is accepted only when the destination is at least as wide.
+            if (to_which_type.isInt() &&
+                to_type_unwrapped->getSizeOfValueInMemory() >= from_type->getSizeOfValueInMemory())
+                return true;
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Float32:
+        {
+            if (to_which_type.isFloat64() || to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Float64:
+        case TypeIndex::Date:
+        case TypeIndex::Date32:
+        case TypeIndex::DateTime:
+        case TypeIndex::DateTime64:
+        case TypeIndex::FixedString:
+        case TypeIndex::Enum8:
+        case TypeIndex::Enum16:
+        case TypeIndex::IPv6:
+        {
+            /// For these source types only a conversion to String is accepted.
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Decimal32:
+        case TypeIndex::Decimal64:
+        case TypeIndex::Decimal128:
+        case TypeIndex::Decimal256:
+        {
+            if (to_which_type.isDecimal())
+            {
+                auto from_type_decimal_precision = getDecimalPrecision(*from_type);
+                auto to_type_decimal_precision = getDecimalPrecision(*to_type_unwrapped);
+                if (from_type_decimal_precision > to_type_decimal_precision)
+                    return false;
+
+                auto from_type_decimal_scale = getDecimalScale(*from_type);
+                auto to_type_decimal_scale = getDecimalScale(*to_type_unwrapped);
+                if (from_type_decimal_scale > to_type_decimal_scale)
+                    return false;
+
+                /// Comparing precision and scale independently is not enough: the destination must
+                /// also keep at least as many digits before the decimal point as the source.
+                /// For example Decimal(9, 0) -> Decimal(9, 5) passes both checks above, but the
+                /// destination has only 4 integer digits, so large source values do not fit.
+                /// (Assumes scale <= precision for both types, so the subtractions cannot wrap.)
+                auto from_type_integer_digits = from_type_decimal_precision - from_type_decimal_scale;
+                auto to_type_integer_digits = to_type_decimal_precision - to_type_decimal_scale;
+                if (from_type_integer_digits > to_type_integer_digits)
+                    return false;
+
+                return true;
+            }
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::UUID:
+        {
+            if (to_which_type.isUInt128() || to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::IPv4:
+        {
+            if (to_which_type.isUInt32() || to_which_type.isUInt64() || to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Nullable:
+        {
+            /// Nullable -> Nullable: the decision is delegated to the nested types.
+            if (to_type_was_nullable)
+            {
+                const auto & from_type_nullable = assert_cast<const DataTypeNullable &>(*from_type);
+                return canBeSafelyCasted(from_type_nullable.getNestedType(), to_type_unwrapped);
+            }
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::LowCardinality:
+        {
+            /// LowCardinality is transparent here: compare its dictionary type with the destination.
+            const auto & from_type_low_cardinality = assert_cast<const DataTypeLowCardinality &>(*from_type);
+            return canBeSafelyCasted(from_type_low_cardinality.getDictionaryType(), to_type_unwrapped);
+        }
+        case TypeIndex::Array:
+        {
+            if (to_which_type.isArray())
+            {
+                const auto & from_type_array = assert_cast<const DataTypeArray &>(*from_type);
+                const auto & to_type_array = assert_cast<const DataTypeArray &>(*to_type_unwrapped);
+                return canBeSafelyCasted(from_type_array.getNestedType(), to_type_array.getNestedType());
+            }
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Map:
+        {
+            if (to_which_type.isMap())
+            {
+                const auto & from_type_map = assert_cast<const DataTypeMap &>(*from_type);
+                const auto & to_type_map = assert_cast<const DataTypeMap &>(*to_type_unwrapped);
+                if (!canBeSafelyCasted(from_type_map.getKeyType(), to_type_map.getKeyType()))
+                    return false;
+
+                if (!canBeSafelyCasted(from_type_map.getValueType(), to_type_map.getValueType()))
+                    return false;
+
+                return true;
+            }
+
+            if (to_which_type.isArray())
+            {
+                // Map nested type is Array(Tuple(key_type, value_type))
+                const auto & from_type_map = assert_cast<const DataTypeMap &>(*from_type);
+                const auto & to_type_array = assert_cast<const DataTypeArray &>(*to_type_unwrapped);
+                const auto * to_type_nested_tuple_type = typeid_cast<const DataTypeTuple *>(to_type_array.getNestedType().get());
+                if (!to_type_nested_tuple_type)
+                    return false;
+
+                /// The destination must be an array of exactly (key, value) pairs.
+                const auto & to_type_tuple_elements = to_type_nested_tuple_type->getElements();
+                if (to_type_tuple_elements.size() != 2)
+                    return false;
+
+                if (!canBeSafelyCasted(from_type_map.getKeyType(), to_type_tuple_elements[0]))
+                    return false;
+
+                if (!canBeSafelyCasted(from_type_map.getValueType(), to_type_tuple_elements[1]))
+                    return false;
+
+                return true;
+            }
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::Tuple:
+        {
+            if (to_which_type.isTuple())
+            {
+                const auto & from_type_tuple = assert_cast<const DataTypeTuple &>(*from_type);
+                const auto & to_type_tuple = assert_cast<const DataTypeTuple &>(*to_type_unwrapped);
+
+                const auto & from_tuple_type_elements = from_type_tuple.getElements();
+                const auto & to_tuple_type_elements = to_type_tuple.getElements();
+
+                /// Tuples must have the same arity and every element must be convertible pairwise.
+                size_t lhs_type_elements_size = from_tuple_type_elements.size();
+                if (lhs_type_elements_size != to_tuple_type_elements.size())
+                    return false;
+
+                for (size_t i = 0; i < lhs_type_elements_size; ++i)
+                    if (!canBeSafelyCasted(from_tuple_type_elements[i], to_tuple_type_elements[i]))
+                        return false;
+
+                return true;
+            }
+
+            if (to_which_type.isString())
+                return true;
+
+            return false;
+        }
+        case TypeIndex::String:
+        case TypeIndex::Object:
+        case TypeIndex::Set:
+        case TypeIndex::Interval:
+        case TypeIndex::Function:
+        case TypeIndex::AggregateFunction:
+        case TypeIndex::Nothing:
+            /// Identical types were already accepted above; anything else from these sources is rejected.
+            return false;
+    }
+
+    /// Reached only when the source type id is not one of the cases listed in the switch.
+    return true;
+}
+
+}
--- a/src/DataTypes/Utils.h
+++ b/src/DataTypes/Utils.h
@ -0,0 +1,19 @@
+#pragma once
+
+#include <DataTypes/IDataType.h>
+
+namespace DB
+{
+
+/** Returns true if from_type can be safely casted to to_type.
+  *
+  * Examples:
+  * From type UInt8 to type UInt16 returns true.
+  * From type UInt16 to type UInt8 returns false.
+  * From type String to type LowCardinality(String) returns true.
+  * From type LowCardinality(String) to type String returns true.
+  * From type String to type UInt8 returns false.
+  */
+bool canBeSafelyCasted(const DataTypePtr & from_type, const DataTypePtr & to_type);
+
+}
--- a/src/Disks/ObjectStorages/S3/diskSettings.cpp
+++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp
@ -52,6 +52,7 @@ std::unique_ptr<S3::Client> getClient(
        config.getString(config_prefix + ".region", ""),
        context->getRemoteHostFilter(),
        static_cast<int>(context->getGlobalContext()->getSettingsRef().s3_max_redirects),
+        static_cast<int>(context->getGlobalContext()->getSettingsRef().s3_retry_attempts),
        context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging,
        /* for_disk_s3 = */ true,
        settings.request_settings.get_request_throttler,
--- a/src/Formats/JSONUtils.cpp
+++ b/src/Formats/JSONUtils.cpp
@ -687,10 +687,9 @@ namespace JSONUtils
        return names_and_types;
    }

-    NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header)
+    void validateMetadataByHeader(const NamesAndTypesList & names_and_types_from_metadata, const Block & header)
    {
-        auto names_and_types = JSONUtils::readMetadata(in);
-        for (const auto & [name, type] : names_and_types)
+        for (const auto & [name, type] : names_and_types_from_metadata)
        {
            if (!header.has(name))
                continue;
@ -698,10 +697,16 @@ namespace JSONUtils
            auto header_type = header.getByName(name).type;
            if (!type->equals(*header_type))
                throw Exception(
-                                ErrorCodes::INCORRECT_DATA,
-                                "Type {} of column '{}' from metadata is not the same as type in header {}",
-                                type->getName(), name, header_type->getName());
+                    ErrorCodes::INCORRECT_DATA,
+                    "Type {} of column '{}' from metadata is not the same as type in header {}",
+                    type->getName(), name, header_type->getName());
        }
+    }
+
+    NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header)
+    {
+        auto names_and_types = JSONUtils::readMetadata(in);
+        validateMetadataByHeader(names_and_types, header);
        return names_and_types;
    }

--- a/src/Formats/JSONUtils.h
+++ b/src/Formats/JSONUtils.h
@ -124,6 +124,7 @@ namespace JSONUtils

    NamesAndTypesList readMetadata(ReadBuffer & in);
    NamesAndTypesList readMetadataAndValidateHeader(ReadBuffer & in, const Block & header);
+    void validateMetadataByHeader(const NamesAndTypesList & names_and_types_from_metadata, const Block & header);

    bool skipUntilFieldInObject(ReadBuffer & in, const String & desired_field_name);
    void skipTheRestOfObject(ReadBuffer & in);
--- a/src/Formats/ReadSchemaUtils.cpp
+++ b/src/Formats/ReadSchemaUtils.cpp
@ -5,6 +5,7 @@
 #include <Storages/IStorage.h>
 #include <Common/assert_cast.h>
 #include <IO/WithFileName.h>
+#include <IO/WithFileSize.h>


 namespace DB
@ -86,7 +87,16 @@ try
                buf = read_buffer_iterator.next();
                if (!buf)
                    break;
-                is_eof = buf->eof();
+
+                /// We just want to check for eof, but eof() can be pretty expensive.
+                /// So we use getFileSize() when available, which has better worst case.
+                /// (For remote files, typically eof() would read 1 MB from S3, which may be much
+                ///  more than what the schema reader and even data reader will read).
+                auto size = tryGetFileSizeFromReadBuffer(*buf);
+                if (size.has_value())
+                    is_eof = *size == 0;
+                else
+                    is_eof = buf->eof();
            }
            catch (Exception & e)
            {
--- a/src/Functions/FunctionHelpers.cpp
+++ b/src/Functions/FunctionHelpers.cpp
@ -6,7 +6,6 @@
 #include <Columns/ColumnNullable.h>
 #include <Columns/ColumnLowCardinality.h>
 #include <Common/assert_cast.h>
-#include <DataTypes/DataTypeNullable.h>


 namespace DB
--- a/src/Functions/FunctionSQLJSON.h
+++ b/src/Functions/FunctionSQLJSON.h
@ -199,6 +199,7 @@ public:

            /// Parse JSON for every row
            Impl impl;
+            GeneratorJSONPath<JSONParser> generator_json_path(res);
            for (const auto i : collections::range(0, input_rows_count))
            {
                std::string_view json{
@ -208,7 +209,9 @@ public:
                bool added_to_column = false;
                if (document_ok)
                {
-                    added_to_column = impl.insertResultToColumn(*to, document, res, context);
+                    /// Instead of creating a new generator for each row, we can reuse the same one.
+                    generator_json_path.reinitialize();
+                    added_to_column = impl.insertResultToColumn(*to, document, generator_json_path, context);
                }
                if (!added_to_column)
                {
@ -287,9 +290,8 @@ public:

    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }

-    static bool insertResultToColumn(IColumn & dest, const Element & root, ASTPtr & query_ptr, const ContextPtr &)
+    static bool insertResultToColumn(IColumn & dest, const Element & root, GeneratorJSONPath<JSONParser> & generator_json_path, const ContextPtr &)
    {
-        GeneratorJSONPath<JSONParser> generator_json_path(query_ptr);
        Element current_element = root;
        VisitorStatus status;
        while ((status = generator_json_path.getNextItem(current_element)) != VisitorStatus::Exhausted)
@ -337,9 +339,8 @@ public:

    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }

-    static bool insertResultToColumn(IColumn & dest, const Element & root, ASTPtr & query_ptr, const ContextPtr & context)
+    static bool insertResultToColumn(IColumn & dest, const Element & root, GeneratorJSONPath<JSONParser> & generator_json_path, const ContextPtr & context)
    {
-        GeneratorJSONPath<JSONParser> generator_json_path(query_ptr);
        Element current_element = root;
        VisitorStatus status;

@ -405,11 +406,10 @@ public:

    static size_t getNumberOfIndexArguments(const ColumnsWithTypeAndName & arguments) { return arguments.size() - 1; }

-    static bool insertResultToColumn(IColumn & dest, const Element & root, ASTPtr & query_ptr, const ContextPtr &)
+    static bool insertResultToColumn(IColumn & dest, const Element & root, GeneratorJSONPath<JSONParser> & generator_json_path, const ContextPtr &)
    {
        ColumnString & col_str = assert_cast<ColumnString &>(dest);

-        GeneratorJSONPath<JSONParser> generator_json_path(query_ptr);
        Element current_element = root;
        VisitorStatus status;
        bool success = false;
--- a/src/Functions/FunctionsOpDate.cpp
+++ b/src/Functions/FunctionsOpDate.cpp
@ -53,7 +53,6 @@ public:
    }

    bool useDefaultImplementationForConstants() const override { return true; }
-    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {0, 2}; }

    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
--- a/src/Functions/JSONPath/Generator/GeneratorJSONPath.h
+++ b/src/Functions/JSONPath/Generator/GeneratorJSONPath.h
@ -105,6 +105,16 @@ public:
        }
    }

+    /// Prepares the generator for reuse: calls reinitialize() on every visitor from the
+    /// current position down to the first one, then sets the position back to 0.
+    void reinitialize()
+    {
+        for (; current_visitor >= 0; --current_visitor)
+            visitors[current_visitor]->reinitialize();
+
+        current_visitor = 0;
+    }
+
 private:
    bool updateVisitorsForNextRun()
    {
--- a/src/IO/HTTPCommon.cpp
+++ b/src/IO/HTTPCommon.cpp
@ -321,7 +321,7 @@ namespace
            /// To avoid such a deadlock we unlock `lock` before entering `pool_ptr->second->get`.
            lock.unlock();

-            auto retry_timeout = timeouts.connection_timeout.totalMicroseconds();
+            auto retry_timeout = timeouts.connection_timeout.totalMilliseconds();
            auto session = pool_ptr->second->get(retry_timeout);

            setTimeouts(*session, timeouts);
--- a/src/IO/S3/Client.cpp
+++ b/src/IO/S3/Client.cpp
@ -49,11 +49,12 @@ namespace ErrorCodes
 namespace S3
 {

-Client::RetryStrategy::RetryStrategy(std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy_)
-    : wrapped_strategy(std::move(wrapped_strategy_))
+Client::RetryStrategy::RetryStrategy(uint32_t maxRetries_, uint32_t scaleFactor_, uint32_t maxDelayMs_)
+    : maxRetries(maxRetries_)
+    , scaleFactor(scaleFactor_)
+    , maxDelayMs(maxDelayMs_)
 {
-    if (!wrapped_strategy)
-        wrapped_strategy = Aws::Client::InitRetryStrategy();
+    chassert(maxDelayMs <= uint64_t(scaleFactor) * (1ul << 31l));
 }

 /// NOLINTNEXTLINE(google-runtime-int)
@ -62,39 +63,28 @@ bool Client::RetryStrategy::ShouldRetry(const Aws::Client::AWSError<Aws::Client:
    if (error.GetResponseCode() == Aws::Http::HttpResponseCode::MOVED_PERMANENTLY)
        return false;

-    return wrapped_strategy->ShouldRetry(error, attemptedRetries);
+    if (attemptedRetries >= maxRetries)
+        return false;
+
+    return error.ShouldRetry();
 }

 /// NOLINTNEXTLINE(google-runtime-int)
-long Client::RetryStrategy::CalculateDelayBeforeNextRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const
+long Client::RetryStrategy::CalculateDelayBeforeNextRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>&, long attemptedRetries) const
 {
-    return wrapped_strategy->CalculateDelayBeforeNextRetry(error, attemptedRetries);
+    if (attemptedRetries == 0)
+    {
+        return 0;
+    }
+
+    uint64_t backoffLimitedPow = 1ul << std::min(attemptedRetries, 31l);
+    return std::min<uint64_t>(scaleFactor * backoffLimitedPow, maxDelayMs);
 }

 /// NOLINTNEXTLINE(google-runtime-int)
 long Client::RetryStrategy::GetMaxAttempts() const
 {
-    return wrapped_strategy->GetMaxAttempts();
-}
-
-void Client::RetryStrategy::GetSendToken()
-{
-    return wrapped_strategy->GetSendToken();
-}
-
-bool Client::RetryStrategy::HasSendToken()
-{
-    return wrapped_strategy->HasSendToken();
-}
-
-void Client::RetryStrategy::RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome)
-{
-    return wrapped_strategy->RequestBookkeeping(httpResponseOutcome);
-}
-
-void Client::RetryStrategy::RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome, const Aws::Client::AWSError<Aws::Client::CoreErrors>& lastError)
-{
-    return wrapped_strategy->RequestBookkeeping(httpResponseOutcome, lastError);
+    return maxRetries + 1;
 }

 namespace
@ -569,6 +559,7 @@ Client::doRequestWithRetryNetworkErrors(const RequestType & request, RequestFn r
    {
        chassert(client_configuration.retryStrategy);
        const Int64 max_attempts = client_configuration.retryStrategy->GetMaxAttempts();
+        chassert(max_attempts > 0);
        std::exception_ptr last_exception = nullptr;
        for (Int64 attempt_no = 0; attempt_no < max_attempts; ++attempt_no)
        {
@ -846,7 +837,8 @@ std::unique_ptr<S3::Client> ClientFactory::create( // NOLINT
            std::move(credentials),
            credentials_configuration);

-    client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(std::move(client_configuration.retryStrategy));
+    client_configuration.retryStrategy = std::make_shared<Client::RetryStrategy>(client_configuration.s3_retry_attempts);
+
    return Client::create(
        client_configuration.s3_max_redirects,
        std::move(sse_kms_config),
@ -861,6 +853,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
    const String & force_region,
    const RemoteHostFilter & remote_host_filter,
    unsigned int s3_max_redirects,
+    unsigned int s3_retry_attempts,
    bool enable_s3_requests_logging,
    bool for_disk_s3,
    const ThrottlerPtr & get_request_throttler,
@ -879,6 +872,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT
        force_region,
        remote_host_filter,
        s3_max_redirects,
+        s3_retry_attempts,
        enable_s3_requests_logging,
        for_disk_s3,
        get_request_throttler,
--- a/src/IO/S3/Client.h
+++ b/src/IO/S3/Client.h
@ -152,16 +152,16 @@ public:

    Aws::Auth::AWSCredentials getCredentials() const;

-    /// Decorator for RetryStrategy needed for this client to work correctly.
    /// We want to manually handle permanent moves (status code 301) because:
    /// - redirect location is written in XML format inside the response body something that doesn't exist for HEAD
    ///   requests so we need to manually find the correct location
    /// - we want to cache the new location to decrease number of roundtrips for future requests
-    /// This decorator doesn't retry if 301 is detected and fallbacks to the inner retry strategy otherwise.
+    /// Other retries are processed with an exponential backoff timeout
+    /// which is capped at a maximum delay.
    class RetryStrategy : public Aws::Client::RetryStrategy
    {
    public:
-        explicit RetryStrategy(std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy_);
+        RetryStrategy(uint32_t maxRetries_ = 10, uint32_t scaleFactor_ = 25, uint32_t maxDelayMs_ = 90000);

        /// NOLINTNEXTLINE(google-runtime-int)
        bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors>& error, long attemptedRetries) const override;
@ -172,14 +172,10 @@ public:
        /// NOLINTNEXTLINE(google-runtime-int)
        long GetMaxAttempts() const override;

-        void GetSendToken() override;
-
-        bool HasSendToken() override;
-
-        void RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome) override;
-        void RequestBookkeeping(const Aws::Client::HttpResponseOutcome& httpResponseOutcome, const Aws::Client::AWSError<Aws::Client::CoreErrors>& lastError) override;
    private:
-        std::shared_ptr<Aws::Client::RetryStrategy> wrapped_strategy;
+        uint32_t maxRetries;
+        uint32_t scaleFactor;
+        uint32_t maxDelayMs;
    };

    /// SSE-KMS headers MUST be signed, so they need to be added before the SDK signs the message
@ -311,6 +307,7 @@ public:
        const String & force_region,
        const RemoteHostFilter & remote_host_filter,
        unsigned int s3_max_redirects,
+        unsigned int s3_retry_attempts,
        bool enable_s3_requests_logging,
        bool for_disk_s3,
        const ThrottlerPtr & get_request_throttler,
--- a/src/IO/S3/Credentials.cpp
+++ b/src/IO/S3/Credentials.cpp
@ -623,6 +623,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
                configuration.region,
                configuration.remote_host_filter,
                configuration.s3_max_redirects,
+                configuration.s3_retry_attempts,
                configuration.enable_s3_requests_logging,
                configuration.for_disk_s3,
                configuration.get_request_throttler,
@ -637,6 +638,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
                configuration.region,
                configuration.remote_host_filter,
                configuration.s3_max_redirects,
+                configuration.s3_retry_attempts,
                configuration.enable_s3_requests_logging,
                configuration.for_disk_s3,
                configuration.get_request_throttler,
@ -679,6 +681,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
                configuration.region,
                configuration.remote_host_filter,
                configuration.s3_max_redirects,
+                configuration.s3_retry_attempts,
                configuration.enable_s3_requests_logging,
                configuration.for_disk_s3,
                configuration.get_request_throttler,
--- a/src/IO/S3/PocoHTTPClient.cpp
+++ b/src/IO/S3/PocoHTTPClient.cpp
@ -96,6 +96,7 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
+        unsigned int s3_retry_attempts_,
        bool enable_s3_requests_logging_,
        bool for_disk_s3_,
        const ThrottlerPtr & get_request_throttler_,
@ -105,6 +106,7 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
    , force_region(force_region_)
    , remote_host_filter(remote_host_filter_)
    , s3_max_redirects(s3_max_redirects_)
+    , s3_retry_attempts(s3_retry_attempts_)
    , enable_s3_requests_logging(enable_s3_requests_logging_)
    , for_disk_s3(for_disk_s3_)
    , get_request_throttler(get_request_throttler_)
--- a/src/IO/S3/PocoHTTPClient.h
+++ b/src/IO/S3/PocoHTTPClient.h
@ -41,6 +41,7 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
    String force_region;
    const RemoteHostFilter & remote_host_filter;
    unsigned int s3_max_redirects;
+    unsigned int s3_retry_attempts;
    bool enable_s3_requests_logging;
    bool for_disk_s3;
    ThrottlerPtr get_request_throttler;
@ -64,6 +65,7 @@ private:
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
+        unsigned int s3_retry_attempts,
        bool enable_s3_requests_logging_,
        bool for_disk_s3_,
        const ThrottlerPtr & get_request_throttler_,
--- a/src/IO/S3/tests/gtest_aws_s3_client.cpp
+++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp
@ -40,14 +40,6 @@
 [[maybe_unused]] static Poco::Util::ServerApplication app;


-class NoRetryStrategy : public Aws::Client::StandardRetryStrategy
-{
-    bool ShouldRetry(const Aws::Client::AWSError<Aws::Client::CoreErrors> &, long /* NOLINT */) const override { return false; }
-
-public:
-    ~NoRetryStrategy() override = default;
-};
-
 String getSSEAndSignedHeaders(const Poco::Net::MessageHeader & message_header)
 {
    String content;
@ -123,6 +115,7 @@ void testServerSideEncryption(

    DB::RemoteHostFilter remote_host_filter;
    unsigned int s3_max_redirects = 100;
+    unsigned int s3_retry_attempts = 0;
    DB::S3::URI uri(http.getUrl() + "/IOTestAwsS3ClientAppendExtraHeaders/test.txt");
    String access_key_id = "ACCESS_KEY_ID";
    String secret_access_key = "SECRET_ACCESS_KEY";
@ -132,6 +125,7 @@ void testServerSideEncryption(
        region,
        remote_host_filter,
        s3_max_redirects,
+        s3_retry_attempts,
        enable_s3_requests_logging,
        /* for_disk_s3 = */ false,
        /* get_request_throttler = */ {},
@ -140,7 +134,6 @@ void testServerSideEncryption(
    );

    client_configuration.endpointOverride = uri.endpoint;
-    client_configuration.retryStrategy = std::make_shared<NoRetryStrategy>();

    DB::HTTPHeaderEntries headers;
    bool use_environment_credentials = false;
--- a/src/IO/tests/gtest_writebuffer_s3.cpp
+++ b/src/IO/tests/gtest_writebuffer_s3.cpp
@ -228,6 +228,7 @@ struct Client : DB::S3::Client
            "some-region",
            remote_host_filter,
            /* s3_max_redirects = */ 100,
+            /* s3_retry_attempts = */ 0,
            /* enable_s3_requests_logging = */ true,
            /* for_disk_s3 = */ false,
            /* get_request_throttler = */ {},
--- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp
+++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp
@ -114,7 +114,8 @@ void SelectStreamFactory::createForShard(
    ContextPtr context,
    std::vector<QueryPlanPtr> & local_plans,
    Shards & remote_shards,
-    UInt32 shard_count)
+    UInt32 shard_count,
+    bool parallel_replicas_enabled)
 {
    auto it = objects_by_shard.find(shard_info.shard_num);
    if (it != objects_by_shard.end())
@ -146,7 +147,10 @@ void SelectStreamFactory::createForShard(
        return;
    });

-    if (settings.prefer_localhost_replica && shard_info.isLocal())
+    // prefer_localhost_replica is not effective in case of parallel replicas
+    // (1) prefer_localhost_replica is about choosing one replica on a shard
+    // (2) parallel replica coordinator has own logic to choose replicas to read from
+    if (settings.prefer_localhost_replica && shard_info.isLocal() && !parallel_replicas_enabled)
    {
        StoragePtr main_table_storage;

@ -187,7 +191,7 @@ void SelectStreamFactory::createForShard(
            return;
        }

-        UInt64 max_allowed_delay = settings.max_replica_delay_for_distributed_queries;
+        const UInt64 max_allowed_delay = settings.max_replica_delay_for_distributed_queries;

        if (!max_allowed_delay)
        {
--- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h
+++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h
@ -78,7 +78,8 @@ public:
        ContextPtr context,
        std::vector<QueryPlanPtr> & local_plans,
        Shards & remote_shards,
-        UInt32 shard_count);
+        UInt32 shard_count,
+        bool parallel_replicas_enabled);

    struct ShardPlans
    {
--- a/src/Interpreters/ClusterProxy/executeQuery.cpp
+++ b/src/Interpreters/ClusterProxy/executeQuery.cpp
@ -178,9 +178,12 @@ void executeQuery(
                                                main_table, query_info.additional_filter_ast, log);
    new_context->increaseDistributedDepth();

-    size_t shards = query_info.getCluster()->getShardCount();
-    for (const auto & shard_info : query_info.getCluster()->getShardsInfo())
+    ClusterPtr cluster = query_info.getCluster();
+    const size_t shards = cluster->getShardCount();
+    for (size_t i = 0, s = cluster->getShardsInfo().size(); i < s; ++i)
    {
+        const auto & shard_info = cluster->getShardsInfo()[i];
+
        ASTPtr query_ast_for_shard = query_ast->clone();
        if (sharding_key_expr && query_info.optimized_cluster && settings.optimize_skip_unused_shards_rewrite_in && shards > 1)
        {
@ -210,9 +213,15 @@ void executeQuery(
            }
        }

+        // decide for each shard if parallel reading from replicas should be enabled
+        // according to settings and number of replicas declared per shard
+        const auto & addresses = cluster->getShardsAddresses().at(i);
+        bool parallel_replicas_enabled = addresses.size() > 1 && context->canUseParallelReplicas();
+
        stream_factory.createForShard(shard_info,
            query_ast_for_shard, main_table, table_func_ptr,
-            new_context, plans, remote_shards, static_cast<UInt32>(shards));
+            new_context, plans, remote_shards, static_cast<UInt32>(shards),
+            parallel_replicas_enabled);
    }

    if (!remote_shards.empty())
@ -236,7 +245,7 @@ void executeQuery(
            log,
            shards,
            query_info.storage_limits,
-            query_info.getCluster()->getName());
+            not_optimized_cluster->getName());

        read_from_remote->setStepDescription("Read from remote replica");
        plan->addStep(std::move(read_from_remote));
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@ -4629,18 +4629,20 @@ Context::ParallelReplicasMode Context::getParallelReplicasMode() const
    return SAMPLE_KEY;
 }

-bool Context::canUseParallelReplicasOnInitiator() const
+bool Context::canUseParallelReplicas() const
 {
    const auto & settings_ref = getSettingsRef();
-    return getParallelReplicasMode() == ParallelReplicasMode::READ_TASKS && settings_ref.max_parallel_replicas > 1
-        && !getClientInfo().collaborate_with_initiator;
+    return getParallelReplicasMode() == ParallelReplicasMode::READ_TASKS && settings_ref.max_parallel_replicas > 1;
+}
+
+bool Context::canUseParallelReplicasOnInitiator() const
+{
+    return canUseParallelReplicas() && !getClientInfo().collaborate_with_initiator;
 }

 bool Context::canUseParallelReplicasOnFollower() const
 {
-    const auto & settings_ref = getSettingsRef();
-    return getParallelReplicasMode() == ParallelReplicasMode::READ_TASKS && settings_ref.max_parallel_replicas > 1
-        && getClientInfo().collaborate_with_initiator;
+    return canUseParallelReplicas() && getClientInfo().collaborate_with_initiator;
 }

 void Context::setPreparedSetsCache(const PreparedSetsCachePtr & cache)
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@ -1182,6 +1182,7 @@ public:
    WriteSettings getWriteSettings() const;

    /** There are multiple conditions that have to be met to be able to use parallel replicas */
+    bool canUseParallelReplicas() const;
    bool canUseParallelReplicasOnInitiator() const;
    bool canUseParallelReplicasOnFollower() const;

--- a/src/Interpreters/DatabaseAndTableWithAlias.cpp
+++ b/src/Interpreters/DatabaseAndTableWithAlias.cpp
@ -16,6 +16,7 @@ namespace DB
 namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
+    extern const int INVALID_IDENTIFIER;
 }

 DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableIdentifier & identifier, const String & current_database)
@ -37,7 +38,7 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & ident
    else if (identifier.name_parts.size() == 1)
        table = identifier.name_parts[0];
    else
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: invalid identifier");
+        throw Exception(ErrorCodes::INVALID_IDENTIFIER, "Invalid identifier");

    if (database.empty())
        database = current_database;
@ -50,7 +51,7 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTPtr & node, const
    else if (const auto * identifier = node->as<ASTIdentifier>())
        *this = DatabaseAndTableWithAlias(*identifier, current_database);
    else
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: identifier or table identifier expected");
+        throw Exception(ErrorCodes::INVALID_IDENTIFIER, "Identifier or table identifier expected");
 }

 DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database)
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@ -31,6 +31,7 @@
 #include <Storages/StorageInMemoryMetadata.h>
 #include <Storages/WindowView/StorageWindowView.h>
 #include <Storages/StorageReplicatedMergeTree.h>
+#include <Storages/BlockNumberColumn.h>

 #include <Interpreters/Context.h>
 #include <Interpreters/executeDDLQueryOnCluster.h>
@ -833,6 +834,13 @@ void InterpreterCreateQuery::validateTableStructure(const ASTCreateQuery & creat
                            "Cannot create table with column '{}' for *MergeTree engines because it "
                            "is reserved for lightweight delete feature",
                            LightweightDeleteDescription::FILTER_COLUMN.name);
+
+        auto search_block_number = all_columns.find(BlockNumberColumn::name);
+        if (search_block_number != all_columns.end())
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                            "Cannot create table with column '{}' for *MergeTree engines because it "
+                            "is reserved for storing block number",
+                            BlockNumberColumn::name);
    }

    const auto & settings = getContext()->getSettingsRef();
--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@ -7,6 +7,7 @@
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Storages/MergeTree/StorageFromMergeTreeDataPart.h>
 #include <Storages/StorageMergeTree.h>
+#include <Storages/BlockNumberColumn.h>
 #include <Processors/Transforms/FilterTransform.h>
 #include <Processors/Transforms/ExpressionTransform.h>
 #include <Processors/Transforms/CreatingSetsTransform.h>
@ -40,7 +41,6 @@
 #include <Parsers/makeASTForLogicalFunction.h>
 #include <Common/logger_useful.h>

-
 namespace DB
 {

@ -56,6 +56,7 @@ namespace ErrorCodes
    extern const int THERE_IS_NO_COLUMN;
 }

+
 namespace
 {

@ -416,6 +417,12 @@ static void validateUpdateColumns(
            found = true;
        }

+        /// Don't allow overriding the value of the block number virtual column.
+        if (!found && column_name == BlockNumberColumn::name)
+        {
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Update is not supported for virtual column {}", backQuote(column_name));
+        }
+
        if (!found)
        {
            for (const auto & col : metadata_snapshot->getColumns().getMaterialized())
@ -511,7 +518,8 @@ void MutationsInterpreter::prepare(bool dry_run)

        for (const auto & [name, _] : command.column_to_update_expression)
        {
-            if (!available_columns_set.contains(name) && name != LightweightDeleteDescription::FILTER_COLUMN.name)
+            if (!available_columns_set.contains(name) && name != LightweightDeleteDescription::FILTER_COLUMN.name
+                && name != BlockNumberColumn::name)
                throw Exception(ErrorCodes::THERE_IS_NO_COLUMN,
                    "Column {} is updated but not requested to read", name);

@ -613,6 +621,8 @@ void MutationsInterpreter::prepare(bool dry_run)
                    type = physical_column->type;
                else if (column == LightweightDeleteDescription::FILTER_COLUMN.name)
                    type = LightweightDeleteDescription::FILTER_COLUMN.type;
+                else if (column == BlockNumberColumn::name)
+                    type = BlockNumberColumn::type;
                else
                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown column {}", column);

@ -1087,6 +1097,18 @@ struct VirtualColumns

                virtuals.emplace_back(ColumnAndPosition{.column = std::move(column), .position = i});
            }
+            else if (columns_to_read[i] == BlockNumberColumn::name)
+            {
+                if (!part->getColumns().contains(BlockNumberColumn::name))
+                {
+                    ColumnWithTypeAndName block_number_column;
+                    block_number_column.type = BlockNumberColumn::type;
+                    block_number_column.column = block_number_column.type->createColumnConst(0, part->info.min_block);
+                    block_number_column.name = std::move(columns_to_read[i]);
+
+                    virtuals.emplace_back(ColumnAndPosition{.column = std::move(block_number_column), .position = i});
+                }
+            }
        }

        if (!virtuals.empty())
--- a/src/Interpreters/PreparedSets.cpp
+++ b/src/Interpreters/PreparedSets.cpp
@ -48,7 +48,7 @@ static bool equals(const DataTypes & lhs, const DataTypes & rhs)

 FutureSetFromStorage::FutureSetFromStorage(SetPtr set_) : set(std::move(set_)) {}
 SetPtr FutureSetFromStorage::get() const { return set; }
-const DataTypes & FutureSetFromStorage::getTypes() const { return set->getElementsTypes(); }
+DataTypes FutureSetFromStorage::getTypes() const { return set->getElementsTypes(); }

 SetPtr FutureSetFromStorage::buildOrderedSetInplace(const ContextPtr &)
 {
@ -73,7 +73,7 @@ FutureSetFromTuple::FutureSetFromTuple(Block block, const Settings & settings)
    set->finishInsert();
 }

-const DataTypes & FutureSetFromTuple::getTypes() const { return set->getElementsTypes(); }
+DataTypes FutureSetFromTuple::getTypes() const { return set->getElementsTypes(); }

 SetPtr FutureSetFromTuple::buildOrderedSetInplace(const ContextPtr & context)
 {
@ -138,7 +138,7 @@ void FutureSetFromSubquery::setQueryPlan(std::unique_ptr<QueryPlan> source_)
    set_and_key->set->setHeader(source->getCurrentDataStream().header.getColumnsWithTypeAndName());
 }

-const DataTypes & FutureSetFromSubquery::getTypes() const
+DataTypes FutureSetFromSubquery::getTypes() const
 {
    return set_and_key->set->getElementsTypes();
 }
@ -183,7 +183,10 @@ SetPtr FutureSetFromSubquery::buildOrderedSetInplace(const ContextPtr & context)
    {
        auto set = external_table_set->buildOrderedSetInplace(context);
        if (set)
-            return set_and_key->set = set;
+        {
+            set_and_key->set = set;
+            return set_and_key->set;
+        }
    }

    auto plan = build(context);
--- a/src/Interpreters/PreparedSets.h
+++ b/src/Interpreters/PreparedSets.h
@ -47,7 +47,7 @@ public:
    /// Returns set if set is ready (created and filled) or nullptr if not.
    virtual SetPtr get() const = 0;
    /// Returns set->getElementsTypes(), even if set is not created yet.
-    virtual const DataTypes & getTypes() const = 0;
+    virtual DataTypes getTypes() const = 0;
    /// If possible, return set with stored elements useful for PK analysis.
    virtual SetPtr buildOrderedSetInplace(const ContextPtr & context) = 0;
 };
@ -62,7 +62,7 @@ public:
    FutureSetFromStorage(SetPtr set_);

    SetPtr get() const override;
-    const DataTypes & getTypes() const override;
+    DataTypes getTypes() const override;
    SetPtr buildOrderedSetInplace(const ContextPtr &) override;

 private:
@ -79,7 +79,7 @@ public:
    SetPtr get() const override { return set; }
    SetPtr buildOrderedSetInplace(const ContextPtr & context) override;

-    const DataTypes & getTypes() const override;
+    DataTypes getTypes() const override;

 private:
    SetPtr set;
@ -105,7 +105,7 @@ public:
        const Settings & settings);

    SetPtr get() const override;
-    const DataTypes & getTypes() const override;
+    DataTypes getTypes() const override;
    SetPtr buildOrderedSetInplace(const ContextPtr & context) override;

    std::unique_ptr<QueryPlan> build(const ContextPtr & context);
--- a/src/Interpreters/executeQuery.cpp
+++ b/src/Interpreters/executeQuery.cpp
@ -94,11 +94,12 @@ namespace DB

 namespace ErrorCodes
 {
+    extern const int CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS;
    extern const int INTO_OUTFILE_NOT_ALLOWED;
-    extern const int QUERY_WAS_CANCELLED;
    extern const int INVALID_TRANSACTION;
    extern const int LOGICAL_ERROR;
    extern const int NOT_IMPLEMENTED;
+    extern const int QUERY_WAS_CANCELLED;
 }


@ -991,7 +992,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(

        if (!async_insert)
        {
-            /// If it is a non-internal SELECT, and passive/read use of the query cache is enabled, and the cache knows the query, then set
+            /// If it is a non-internal SELECT, and passive (read) use of the query cache is enabled, and the cache knows the query, then set
            /// a pipeline with a source populated by the query cache.
            auto get_result_from_query_cache = [&]()
            {
@ -1091,11 +1092,14 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(

                    res = interpreter->execute();

-                    /// If it is a non-internal SELECT query, and active/write use of the query cache is enabled, then add a processor on
+                    /// If it is a non-internal SELECT query, and active (write) use of the query cache is enabled, then add a processor on
                    /// top of the pipeline which stores the result in the query cache.
-                    if (can_use_query_cache && settings.enable_writes_to_query_cache
-                        && (!astContainsNonDeterministicFunctions(ast, context) || settings.query_cache_store_results_of_queries_with_nondeterministic_functions))
+                    if (can_use_query_cache && settings.enable_writes_to_query_cache)
                    {
+                        if (astContainsNonDeterministicFunctions(ast, context) && !settings.query_cache_store_results_of_queries_with_nondeterministic_functions)
+                            throw Exception(ErrorCodes::CANNOT_USE_QUERY_CACHE_WITH_NONDETERMINISTIC_FUNCTIONS,
+                                "Unable to cache the query result because the query contains a non-deterministic function. Use setting query_cache_store_results_of_queries_with_nondeterministic_functions = 1 to store the query result regardless.");
+
                        QueryCache::Key key(
                            ast, res.pipeline.getHeader(),
                            context->getUserName(), settings.query_cache_share_between_users,
--- a/src/Interpreters/inplaceBlockConversions.cpp
+++ b/src/Interpreters/inplaceBlockConversions.cpp
@ -20,6 +20,7 @@
 #include <Columns/ColumnArray.h>
 #include <DataTypes/DataTypeArray.h>
 #include <Storages/StorageInMemoryMetadata.h>
+#include <Storages/BlockNumberColumn.h>


 namespace DB
@ -260,7 +261,7 @@ void fillMissingColumns(
    const NamesAndTypesList & requested_columns,
    const NamesAndTypesList & available_columns,
    const NameSet & partially_read_columns,
-    StorageMetadataPtr metadata_snapshot)
+    StorageMetadataPtr metadata_snapshot, size_t block_number)
 {
    size_t num_columns = requested_columns.size();
    if (num_columns != res_columns.size())
@ -339,9 +340,14 @@ void fillMissingColumns(
        }
        else
        {
-            /// We must turn a constant column into a full column because the interpreter could infer
-            /// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
-            res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst();
+            if (requested_column->name == BlockNumberColumn::name)
+                res_columns[i] = type->createColumnConst(num_rows, block_number)->convertToFullColumnIfConst();
+            else
+                /// We must turn a constant column into a full column because the interpreter could infer
+                /// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
+                res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst();
+
+
        }
    }
 }
--- a/src/Interpreters/inplaceBlockConversions.h
+++ b/src/Interpreters/inplaceBlockConversions.h
@ -46,6 +46,6 @@ void fillMissingColumns(
    const NamesAndTypesList & requested_columns,
    const NamesAndTypesList & available_columns,
    const NameSet & partially_read_columns,
-    StorageMetadataPtr metadata_snapshot);
+    StorageMetadataPtr metadata_snapshot, size_t block_number = 0);

 }
--- a/src/Parsers/tests/gtest_Parser.cpp
+++ b/src/Parsers/tests/gtest_Parser.cpp
@ -18,6 +18,7 @@
 #include <string_view>
 #include <regex>
 #include <gtest/gtest.h>
+#include <boost/algorithm/string/replace.hpp>

 namespace
 {
@ -39,7 +40,11 @@ std::ostream & operator<<(std::ostream & ostr, const std::shared_ptr<IParser> pa

 std::ostream & operator<<(std::ostream & ostr, const ParserTestCase & test_case)
 {
-    return ostr << "ParserTestCase input: " << test_case.input_text;
+    // New line characters are escaped (each one is replaced with the two characters `\n`) because, at the time of writing,
+    // the unit test results are parsed from the command line output, and multi-line string representations break that parsing logic.
+    std::string input_text{test_case.input_text};
+    boost::replace_all(input_text, "\n", "\\n");
+    return ostr << "ParserTestCase input: " << input_text;
 }

 class ParserTest : public ::testing::TestWithParam<std::tuple<std::shared_ptr<IParser>, ParserTestCase>>
@ -494,11 +499,11 @@ INSTANTIATE_TEST_SUITE_P(
        ::testing::Values(std::make_shared<ParserPRQLQuery>(kDummyMaxQuerySize, kDummyMaxParserDepth)),
        ::testing::ValuesIn(std::initializer_list<ParserTestCase>{
            {
-                "from albums\ngroup [author_id] (\n  aggregate [first_pushlied = min published]\n)\njoin a=author side:left [==author_id]\njoin p=purchases side:right [==author_id]\ngroup [a.id, p.purchase_id] (\n  aggregate [avg_sell = min first_pushlied]\n)",
-                "WITH table_1 AS\n    (\n        SELECT\n            MIN(published) AS _expr_0,\n            author_id\n        FROM albums\n        GROUP BY author_id\n    )\nSELECT\n    a.id,\n    p.purchase_id,\n    MIN(table_0._expr_0) AS avg_sell\nFROM table_1 AS table_0\nLEFT JOIN author AS a ON table_0.author_id = a.author_id\nRIGHT JOIN purchases AS p ON table_0.author_id = p.author_id\nGROUP BY\n    a.id,\n    p.purchase_id",
+                "from albums\ngroup {author_id} (\n  aggregate {first_published = min published}\n)\njoin a=author side:left (==author_id)\njoin p=purchases side:right (==author_id)\ngroup {a.id, p.purchase_id} (\n  aggregate {avg_sell = min first_published}\n)",
+                "WITH table_0 AS\n    (\n        SELECT\n            MIN(published) AS _expr_0,\n            author_id\n        FROM albums\n        GROUP BY author_id\n    )\nSELECT\n    a.id,\n    p.purchase_id,\n    MIN(table_0._expr_0) AS avg_sell\nFROM table_0\nLEFT JOIN author AS a ON table_0.author_id = a.author_id\nRIGHT JOIN purchases AS p ON table_0.author_id = p.author_id\nGROUP BY\n    a.id,\n    p.purchase_id",
            },
            {
-                "from matches\nfilter start_date > @2023-05-30                 # Some comment here\nderive [\n  some_derived_value_1 = a + (b ?? 0),          # And there\n  some_derived_value_2 = c + some_derived_value\n]\nfilter some_derived_value_2 > 0\ngroup [country, city] (\n  aggregate [\n    average some_derived_value_2,\n    aggr = max some_derived_value_2,\n  ]\n)\nderive place = f\"{city} in {country}\"\nderive country_code = s\"LEFT(country, 2)\"\nsort [aggr, -country]\ntake 1..20",
-                "WITH\n    table_3 AS\n    (\n        SELECT\n            country,\n            city,\n            c + some_derived_value AS _expr_1\n        FROM matches\n        WHERE start_date > toDate('2023-05-30')\n    ),\n    table_1 AS\n    (\n        SELECT\n            country,\n            city,\n            AVG(_expr_1) AS _expr_0,\n            MAX(_expr_1) AS aggr\n        FROM table_3 AS table_2\n        WHERE _expr_1 > 0\n        GROUP BY\n            country,\n            city\n    )\nSELECT\n    country,\n    city,\n    _expr_0,\n    aggr,\n    CONCAT(city, ' in ', country) AS place,\n    LEFT(country, 2) AS country_code\nFROM table_1 AS table_0\nORDER BY\n    aggr ASC,\n    country DESC\nLIMIT 20",
+                "from matches\nfilter start_date > @2023-05-30                 # Some comment here\nderive {\n  some_derived_value_1 = a + (b ?? 0),          # And there\n  some_derived_value_2 = c + some_derived_value\n}\nfilter some_derived_value_2 > 0\ngroup {country, city} (\n  aggregate {\n    average some_derived_value_2,\n    aggr = max some_derived_value_2\n  }\n)\nderive place = f\"{city} in {country}\"\nderive country_code = s\"LEFT(country, 2)\"\nsort {aggr, -country}\ntake 1..20",
+                "WITH\n    table_1 AS\n    (\n        SELECT\n            country,\n            city,\n            c + some_derived_value AS _expr_1\n        FROM matches\n        WHERE start_date > toDate('2023-05-30')\n    ),\n    table_0 AS\n    (\n        SELECT\n            country,\n            city,\n            AVG(_expr_1) AS _expr_0,\n            MAX(_expr_1) AS aggr\n        FROM table_1\n        WHERE _expr_1 > 0\n        GROUP BY\n            country,\n            city\n    )\nSELECT\n    country,\n    city,\n    _expr_0,\n    aggr,\n    CONCAT(city, ' in ', country) AS place,\n    LEFT(country, 2) AS country_code\nFROM table_0\nORDER BY\n    aggr ASC,\n    country DESC\nLIMIT 20",
            },
        })));
--- a/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h
+++ b/src/Processors/Formats/Impl/JSONEachRowRowInputFormat.h
@ -32,10 +32,11 @@ public:
    String getName() const override { return "JSONEachRowRowInputFormat"; }
    void resetParser() override;

-private:
+protected:
    void readPrefix() override;
    void readSuffix() override;

+private:
    bool readRow(MutableColumns & columns, RowReadExtension & ext) override;
    bool allowSyncAfterError() const override { return true; }
    void syncAfterError() override;
--- a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp
+++ b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp
@ -12,42 +12,106 @@ namespace ErrorCodes
 }

 JSONRowInputFormat::JSONRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSettings & format_settings_)
-    : JSONEachRowRowInputFormat(in_, header_, params_, format_settings_, false), validate_types_from_metadata(format_settings_.json.validate_types_from_metadata)
+    : JSONRowInputFormat(std::make_unique<PeekableReadBuffer>(in_), header_, params_, format_settings_)
+{
+}
+
+JSONRowInputFormat::JSONRowInputFormat(std::unique_ptr<PeekableReadBuffer> buf, const DB::Block & header_, DB::IRowInputFormat::Params params_, const DB::FormatSettings & format_settings_)
+    : JSONEachRowRowInputFormat(*buf, header_, params_, format_settings_, false), validate_types_from_metadata(format_settings_.json.validate_types_from_metadata), peekable_buf(std::move(buf))
 {
 }

 void JSONRowInputFormat::readPrefix()
 {
-    skipBOMIfExists(*in);
-    JSONUtils::skipObjectStart(*in);
-    if (validate_types_from_metadata)
-        JSONUtils::readMetadataAndValidateHeader(*in, getPort().getHeader());
-    else
-        JSONUtils::readMetadata(*in);
+    skipBOMIfExists(*peekable_buf);

-    JSONUtils::skipComma(*in);
-    if (!JSONUtils::skipUntilFieldInObject(*in, "data"))
-        throw Exception(ErrorCodes::INCORRECT_DATA, "Expected field \"data\" with table content");
+    PeekableReadBufferCheckpoint checkpoint(*peekable_buf); /// Taken after the BOM, so a rollback returns to the first meaningful byte.
+    NamesAndTypesList names_and_types_from_metadata;

-    JSONUtils::skipArrayStart(*in);
-    data_in_square_brackets = true;
+    /// Try to parse the JSON format prefix (object start, metadata, start of the "data" array); if that fails, parse the input as JSONEachRow instead.
+    try
+    {
+        JSONUtils::skipObjectStart(*peekable_buf);
+        names_and_types_from_metadata = JSONUtils::readMetadata(*peekable_buf);
+        JSONUtils::skipComma(*peekable_buf);
+        if (!JSONUtils::skipUntilFieldInObject(*peekable_buf, "data"))
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Expected field \"data\" with table content"); /// Handled by the INCORRECT_DATA catch below, i.e. it triggers the fallback.
+
+        JSONUtils::skipArrayStart(*peekable_buf);
+        data_in_square_brackets = true;
+    }
+    catch (const ParsingException &)
+    {
+        parse_as_json_each_row = true;
+    }
+    catch (const Exception & e)
+    {
+        if (e.code() != ErrorCodes::INCORRECT_DATA) /// Only INCORRECT_DATA triggers the fallback; any other error is rethrown.
+            throw;
+
+        parse_as_json_each_row = true;
+    }
+
+    if (parse_as_json_each_row)
+    {
+        peekable_buf->rollbackToCheckpoint(); /// Go back to the checkpoint so the JSONEachRow prefix is read from the start of the data.
+        JSONEachRowRowInputFormat::readPrefix();
+    }
+    else if (validate_types_from_metadata) /// Validation happens only after the whole prefix was parsed successfully.
+    {
+        JSONUtils::validateMetadataByHeader(names_and_types_from_metadata, getPort().getHeader());
+    }
 }

 void JSONRowInputFormat::readSuffix()
 {
-    JSONUtils::skipArrayEnd(*in);
-    JSONUtils::skipTheRestOfObject(*in);
+    if (parse_as_json_each_row) /// readPrefix() fell back to JSONEachRow, so the suffix is handled by the base class as well.
+    {
+        JSONEachRowRowInputFormat::readSuffix();
+    }
+    else
+    {
+        JSONUtils::skipArrayEnd(*peekable_buf); /// Closes the "data" array opened in readPrefix().
+        JSONUtils::skipTheRestOfObject(*peekable_buf);
+    }
 }

-JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_) : ISchemaReader(in_)
+void JSONRowInputFormat::setReadBuffer(DB::ReadBuffer & in_)
+{
+    peekable_buf->setSubBuffer(in_); /// Only the buffer wrapped by peekable_buf is replaced; readPrefix()/readSuffix() keep reading through peekable_buf.
+}
+
+/// Resets the parser state so that this format object can be reused for another input.
+void JSONRowInputFormat::resetParser()
+{
+    JSONEachRowRowInputFormat::resetParser();
+    peekable_buf->reset();
+    /// readPrefix() only ever sets this flag to true. Forget the decision made for the previous input,
+    /// otherwise a later input that does contain metadata would still be rolled back and parsed as JSONEachRow.
+    parse_as_json_each_row = false;
+}
+
+JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_)
+    : JSONRowSchemaReader(std::make_unique<PeekableReadBuffer>(in_), format_settings_)
+{
+}
+
+JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr<PeekableReadBuffer> buf, const DB::FormatSettings & format_settings_)
+    : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf))
 {
 }

 NamesAndTypesList JSONRowSchemaReader::readSchema()
 {
-    skipBOMIfExists(in);
-    JSONUtils::skipObjectStart(in);
-    return JSONUtils::readMetadata(in);
+    skipBOMIfExists(*peekable_buf);
+    PeekableReadBufferCheckpoint checkpoint(*peekable_buf);
+    /// Try to take the schema from the metadata section; if that fails, infer it from the data as for the JSONEachRow format.
+    try
+    {
+        JSONUtils::skipObjectStart(*peekable_buf);
+        return JSONUtils::readMetadata(*peekable_buf);
+    }
+    catch (...) /// NOTE(review): unlike JSONRowInputFormat::readPrefix(), this falls back on any exception, not only parsing / INCORRECT_DATA errors - confirm this is intended.
+    {
+        peekable_buf->rollbackToCheckpoint(true); /// Re-read from the checkpoint; presumably `true` also drops the checkpoint - verify against PeekableReadBuffer.
+        return JSONEachRowSchemaReader::readSchema();
+    }
 }

 void registerInputFormatJSON(FormatFactory & factory)
@ -69,7 +133,7 @@ void registerJSONSchemaReader(FormatFactory & factory)
    auto register_schema_reader = [&](const String & format)
    {
        factory.registerSchemaReader(
-            format, [](ReadBuffer & buf, const FormatSettings &) { return std::make_unique<JSONRowSchemaReader>(buf); });
+            format, [](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique<JSONRowSchemaReader>(buf, format_settings); });
    };
    register_schema_reader("JSON");
    /// JSONCompact has the same suffix with metadata.
--- a/src/Processors/Formats/Impl/JSONRowInputFormat.h
+++ b/src/Processors/Formats/Impl/JSONRowInputFormat.h
@ -23,21 +23,38 @@ public:

    String getName() const override { return "JSONRowInputFormat"; }

+    void setReadBuffer(ReadBuffer & in_) override;
+    void resetParser() override;
+
 private:
+    JSONRowInputFormat(
+        std::unique_ptr<PeekableReadBuffer> buf,
+        const Block & header_,
+        Params params_,
+        const FormatSettings & format_settings_);
+
    void readPrefix() override;
    void readSuffix() override;

    const bool validate_types_from_metadata;
+    bool parse_as_json_each_row = false; /// Set by readPrefix() when the metadata prefix cannot be parsed and the input is read as JSONEachRow instead.
+    std::unique_ptr<PeekableReadBuffer> peekable_buf; /// Owns the peekable wrapper around the input; the base class reads through it.
 };

-class JSONRowSchemaReader : public ISchemaReader
+class JSONRowSchemaReader : public JSONEachRowSchemaReader
 {
 public:
-    JSONRowSchemaReader(ReadBuffer & in_);
+    JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_);

    NamesAndTypesList readSchema() override;

    bool hasStrictOrderOfColumns() const override { return false; }
+
+private:
+    JSONRowSchemaReader(std::unique_ptr<PeekableReadBuffer> buf, const FormatSettings & format_settings_);
+
+    std::unique_ptr<PeekableReadBuffer> peekable_buf;
 };

 }
--- a/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ParquetMetadataInputFormat.cpp
@ -130,7 +130,7 @@ static std::shared_ptr<parquet::FileMetaData> getFileMetadata(
    const FormatSettings & format_settings,
    std::atomic<int> & is_stopped)
 {
-    auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES);
+    auto arrow_file = asArrowFile(in, format_settings, is_stopped, "Parquet", PARQUET_MAGIC_BYTES, /* avoid_buffering */ true);
    return parquet::ReadMetaData(arrow_file);
 }

@ -495,12 +495,15 @@ NamesAndTypesList ParquetMetadataSchemaReader::readSchema()

 void registerInputFormatParquetMetadata(FormatFactory & factory)
 {
-    factory.registerInputFormat(
+    factory.registerRandomAccessInputFormat(
        "ParquetMetadata",
-        [](ReadBuffer &buf,
-           const Block &sample,
-           const RowInputFormatParams &,
-           const FormatSettings & settings)
+        [](ReadBuffer & buf,
+            const Block & sample,
+            const FormatSettings & settings,
+            const ReadSettings &,
+            bool /* is_remote_fs */,
+            size_t /* max_download_threads */,
+            size_t /* max_parsing_threads */)
        {
            return std::make_shared<ParquetMetadataInputFormat>(buf, sample, settings);
        });
--- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
@ -28,7 +28,6 @@ ReplacingSortedAlgorithm::ReplacingSortedAlgorithm(
    , cleanup(cleanup_)
    , cleanedup_rows_count(cleanedup_rows_count_)
 {
-
    if (!is_deleted_column.empty())
        is_deleted_column_number = header_.getPositionByName(is_deleted_column);
    if (!version_column.empty())
@ -83,8 +82,11 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
                    uint8_t value = assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num];
                    if (!cleanup || !value)
                        insertRow();
-                    else if (cleanedup_rows_count != nullptr)
+                    else if (cleanup && cleanedup_rows_count != nullptr)
+                    {
                        *cleanedup_rows_count += current_row_sources.size();
+                        current_row_sources.resize(0);
+                    }
                }
                else
                    insertRow();
@ -141,8 +143,11 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
            uint8_t value = assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num];
            if (!cleanup || !value)
                insertRow();
-            else if (cleanedup_rows_count != nullptr)
+            else if (cleanup && cleanedup_rows_count != nullptr)
+            {
                *cleanedup_rows_count += current_row_sources.size();
+                current_row_sources.resize(0);
+            }
        }
        else
            insertRow();
--- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.cpp
@ -12,6 +12,7 @@
 #include <DataTypes/NestedUtils.h>
 #include <DataTypes/DataTypeLowCardinality.h>
 #include <IO/WriteHelpers.h>
+#include <Storages/BlockNumberColumn.h>


 namespace DB
@ -222,6 +223,12 @@ static SummingSortedAlgorithm::ColumnsDefinition defineColumns(
        const ColumnWithTypeAndName & column = header.safeGetByPosition(i);

        const auto * simple = dynamic_cast<const DataTypeCustomSimpleAggregateFunction *>(column.type->getCustomName());
+        if (column.name == BlockNumberColumn::name)
+        {
+            def.column_numbers_not_to_aggregate.push_back(i);
+            continue;
+        }
+
        /// Discover nested Maps and find columns for summation
        if (typeid_cast<const DataTypeArray *>(column.type.get()) && !simple)
        {
--- a/src/Processors/Sinks/IOutputChunkGenerator.h
+++ b/src/Processors/Sinks/IOutputChunkGenerator.h
@ -0,0 +1,26 @@
+#pragma once
+
+#include <Processors/Chunk.h>
+#include <Interpreters/Context.h>
+
+namespace DB
+{
+
+/// Interface used by the SinkToStorage processor.
+/// SinkToStorage delegates to it the creation of the data chunk that will be delivered to the next stages of the query pipeline.
+/// The default implementation (createDefault() factory method) just forwards everything that it receives.
+class IOutputChunkGenerator
+{
+public:
+    static std::unique_ptr<IOutputChunkGenerator> createCopyRanges(bool deduplicate_later); /// Generator that outputs only the row ranges reported with append == true; returns the default generator when deduplicate_later is set.
+    static std::unique_ptr<IOutputChunkGenerator> createDefault(); /// Generator that forwards each received chunk unchanged.
+
+    virtual ~IOutputChunkGenerator() = default;
+
+    virtual void onNewChunkArrived(Chunk chunk) = 0; /// Receives a new input chunk; the generator takes its own copy by value.
+    virtual void onRowsProcessed(size_t row_count, bool append) = 0; /// Reports that the next row_count rows of the current chunk were processed; append tells whether they belong in the output.
+
+    virtual Chunk generateChunk() = 0; /// Returns the chunk to pass on to the next stages for the chunk received last.
+};
+
+}
--- a/src/Processors/Sinks/OutputChunkGenerator.cpp
+++ b/src/Processors/Sinks/OutputChunkGenerator.cpp
@ -0,0 +1,91 @@
+#include <Processors/Sinks/IOutputChunkGenerator.h>
+
+namespace DB
+{
+
+/// Default implementation: the most recently received chunk is forwarded as-is to the next stages of the query.
+class ForwardEverythingGenerator : public IOutputChunkGenerator
+{
+public:
+
+    explicit ForwardEverythingGenerator() = default;
+
+    void onNewChunkArrived(Chunk chunk) override
+    {
+        in_chunk = std::move(chunk); /// `chunk` is taken by value, so it is already our own copy: take it over instead of cloning it again.
+    }
+
+    void onRowsProcessed(size_t /*row_count*/, bool /*append*/) override
+    {} /// Nothing to track: the whole chunk is forwarded regardless of how its rows were processed.
+
+    Chunk generateChunk() override
+    {
+        return std::move(in_chunk); /// Explicit move is required here: a data member is not moved implicitly on return.
+    }
+
+private:
+    Chunk in_chunk; /// Last chunk passed to onNewChunkArrived(), handed out by generateChunk().
+};
+
+/// Implementation which generates a chunk containing just a subset of the rows originally received.
+/// Rows are assumed to be processed in the same order as they appear in the original chunk.
+/// It is up to the client to decide how many rows to process at once, but after each processed range
+/// onRowsProcessed() has to be called, indicating whether to append that range to the output chunk or not.
+class CopyRangesGenerator : public IOutputChunkGenerator
+{
+public:
+    explicit CopyRangesGenerator() = default;
+
+    void onNewChunkArrived(Chunk chunk) override
+    {
+        out_cols = chunk.cloneEmptyColumns(); /// Fresh output columns for this chunk (presumably empty columns with the same structure - TODO confirm).
+        in_chunk = std::move(chunk);
+        row_offset = 0; /// Restart range tracking for the new chunk.
+        final_chunk_rows = 0;
+    }
+
+    void onRowsProcessed(size_t row_count, bool append) override
+    {
+        if (append)
+        {
+            const Columns& in_cols = in_chunk.getColumns();
+            for (size_t i = 0; i < out_cols.size(); i++)
+            {
+                out_cols[i]->insertRangeFrom(*(in_cols[i]), row_offset, row_count); /// Copy row_count rows of column i, starting at row_offset.
+            }
+            final_chunk_rows += row_count;
+        }
+
+        row_offset += row_count; /// Advance past the range whether or not it was appended.
+    }
+
+    Chunk generateChunk() override
+    {
+        return Chunk(std::move(out_cols), final_chunk_rows); /// out_cols is moved from here; onNewChunkArrived() re-creates it for the next chunk.
+    }
+
+private:
+    Chunk in_chunk; /// Chunk currently being processed; source of the copied ranges.
+    MutableColumns out_cols; /// Output columns accumulating the appended ranges.
+    size_t row_offset = 0; /// Position in in_chunk of the first row not yet reported via onRowsProcessed().
+    size_t final_chunk_rows = 0; /// Number of rows appended to out_cols so far.
+};
+
+std::unique_ptr<IOutputChunkGenerator> IOutputChunkGenerator::createCopyRanges(bool deduplicate_later)
+{
+    // If the MV is responsible for deduplication, the block won't be considered duplicated here,
+    // so the default implementation, which forwards all the data, is used.
+    if (deduplicate_later)
+    {
+        return createDefault();
+    }
+
+    return std::make_unique<CopyRangesGenerator>(); // Otherwise only the ranges reported with append == true are output.
+}
+
+std::unique_ptr<IOutputChunkGenerator> IOutputChunkGenerator::createDefault()
+{
+    return std::make_unique<ForwardEverythingGenerator>(); // The default generator forwards every received chunk as-is.
+}
+
+}
--- a/src/Processors/Sinks/SinkToStorage.cpp
+++ b/src/Processors/Sinks/SinkToStorage.cpp
@ -4,7 +4,12 @@
 namespace DB
 {

-SinkToStorage::SinkToStorage(const Block & header) : ExceptionKeepingTransform(header, header, false) {}
+SinkToStorage::SinkToStorage(const Block & header) : SinkToStorage(header, IOutputChunkGenerator::createDefault()) {}
+
+SinkToStorage::SinkToStorage(const Block & header, std::unique_ptr<IOutputChunkGenerator> output_generator_)
+    : ExceptionKeepingTransform(header, header, false),
+    output_generator(std::move(output_generator_))
+{ }

 void SinkToStorage::onConsume(Chunk chunk)
 {
@ -15,15 +20,15 @@ void SinkToStorage::onConsume(Chunk chunk)
      */
    Nested::validateArraySizes(getHeader().cloneWithColumns(chunk.getColumns()));

+    output_generator->onNewChunkArrived(chunk.clone());
    consume(chunk.clone());
-    if (!lastBlockIsDuplicate())
-        cur_chunk = std::move(chunk);
 }

 SinkToStorage::GenerateResult SinkToStorage::onGenerate()
 {
    GenerateResult res;
-    res.chunk = std::move(cur_chunk);
+
+    res.chunk = output_generator->generateChunk();
    res.is_done = true;
    return res;
 }
--- a/src/Processors/Sinks/SinkToStorage.h
+++ b/src/Processors/Sinks/SinkToStorage.h
@ -1,6 +1,7 @@
 #pragma once
 #include <Storages/TableLockHolder.h>
 #include <Processors/Transforms/ExceptionKeepingTransform.h>
+#include <Processors/Sinks/IOutputChunkGenerator.h>

 namespace DB
 {
@ -13,13 +14,15 @@ friend class PartitionedSink;

 public:
    explicit SinkToStorage(const Block & header);
+    explicit SinkToStorage(const Block & header, std::unique_ptr<IOutputChunkGenerator> output_generator_);

    const Block & getHeader() const { return inputs.front().getHeader(); }
    void addTableLock(const TableLockHolder & lock) { table_locks.push_back(lock); }

 protected:
    virtual void consume(Chunk chunk) = 0;
-    virtual bool lastBlockIsDuplicate() const { return false; }
+
+    IOutputChunkGenerator& getOutputGenerator() { return *output_generator; }

 private:
    std::vector<TableLockHolder> table_locks;
@ -27,7 +30,7 @@ private:
    void onConsume(Chunk chunk) override;
    GenerateResult onGenerate() override;

-    Chunk cur_chunk;
+    std::unique_ptr<IOutputChunkGenerator> output_generator;
 };

 using SinkToStoragePtr = std::shared_ptr<SinkToStorage>;
--- a/src/Processors/Transforms/TTLTransform.cpp
+++ b/src/Processors/Transforms/TTLTransform.cpp
@ -49,7 +49,8 @@ TTLTransform::TTLTransform(

    for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs())
        algorithms.emplace_back(std::make_unique<TTLAggregationAlgorithm>(
-            group_by_ttl, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, getInputPort().getHeader(), storage_));
+                group_by_ttl, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_,
+                getInputPort().getHeader(), storage_));

    if (metadata_snapshot_->hasAnyColumnTTL())
    {
--- a/src/Storages/AlterCommands.cpp
+++ b/src/Storages/AlterCommands.cpp
@ -27,6 +27,7 @@
 #include <Storages/AlterCommands.h>
 #include <Storages/IStorage.h>
 #include <Storages/LightweightDeleteDescription.h>
+#include <Storages/BlockNumberColumn.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Common/typeid_cast.h>
 #include <Common/randomSeed.h>
@ -782,7 +783,7 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada
    /// Drop alias is metadata alter, in other case mutation is required.
    if (type == DROP_COLUMN)
        return metadata.columns.hasColumnOrNested(GetColumnsOptions::AllPhysical, column_name) ||
-            column_name == LightweightDeleteDescription::FILTER_COLUMN.name;
+            column_name == LightweightDeleteDescription::FILTER_COLUMN.name || column_name == BlockNumberColumn::name;

    if (type != MODIFY_COLUMN || data_type == nullptr)
        return false;
@ -1066,6 +1067,10 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot add column {}: "
                                "this column name is reserved for lightweight delete feature", backQuote(column_name));

+            if (column_name == BlockNumberColumn::name && std::dynamic_pointer_cast<MergeTreeData>(table))
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot add column {}: "
+                                                            "this column name is reserved for _block_number persisting feature", backQuote(column_name));
+
            if (command.codec)
                CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_deflate_qpl_codec);

@ -1270,6 +1275,10 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const
                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot rename to {}: "
                                "this column name is reserved for lightweight delete feature", backQuote(command.rename_to));

+            if (command.rename_to == BlockNumberColumn::name && std::dynamic_pointer_cast<MergeTreeData>(table))
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot rename to {}: "
+                                                            "this column name is reserved for _block_number persisting feature", backQuote(command.rename_to));
+
            if (modified_columns.contains(column_name))
                throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot rename and modify the same column {} "
                                                             "in a single ALTER query", backQuote(column_name));
--- a/src/Storages/BlockNumberColumn.cpp
+++ b/src/Storages/BlockNumberColumn.cpp
@ -0,0 +1,23 @@
+#include <Storages/BlockNumberColumn.h>
+#include <Compression/CompressionCodecMultiple.h>
+
+namespace DB
+{
+
+CompressionCodecPtr getCompressionCodecDelta(UInt8 delta_bytes_size);
+
+CompressionCodecPtr getCompressionCodecForBlockNumberColumn() /// Builds the codec used for the block number column: a delta codec followed by LZ4.
+{
+    std::vector <CompressionCodecPtr> codecs;
+    codecs.reserve(2);
+    auto data_bytes_size = BlockNumberColumn::type->getSizeOfValueInMemory(); /// Size of one value of the column type; passed on as the delta size in bytes.
+    codecs.emplace_back(getCompressionCodecDelta(data_bytes_size));
+    codecs.emplace_back(CompressionCodecFactory::instance().get("LZ4", {}));
+    return std::make_shared<CompressionCodecMultiple>(codecs); /// Combines both codecs into a single multi-codec.
+}
+
+const String BlockNumberColumn::name = "_block_number";
+const DataTypePtr BlockNumberColumn::type = std::make_shared<DataTypeUInt64>();
+const CompressionCodecPtr BlockNumberColumn::compression_codec = getCompressionCodecForBlockNumberColumn(); /// Must stay below `type`: the initializer reads BlockNumberColumn::type, and definitions within one translation unit are initialized in order.
+
+}
--- a/src/Storages/BlockNumberColumn.h
+++ b/src/Storages/BlockNumberColumn.h
@ -0,0 +1,16 @@
+#pragma once
+#include <Core/NamesAndTypes.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Compression/CompressionFactory.h>
+
+namespace DB
+{
+
+struct BlockNumberColumn /// Static description of the `_block_number` column; the values are defined in BlockNumberColumn.cpp.
+{
+    static const String name; /// Column name: "_block_number".
+    static const DataTypePtr type; /// Column data type: UInt64.
+    static const CompressionCodecPtr compression_codec; /// Codec used when writing the column; built by getCompressionCodecForBlockNumberColumn().
+};
+
+}
--- a/src/Storages/ColumnsDescription.cpp
+++ b/src/Storages/ColumnsDescription.cpp
@ -30,11 +30,15 @@
 #include <Interpreters/TreeRewriter.h>
 #include <Interpreters/ExpressionActions.h>
 #include <Interpreters/FunctionNameNormalizer.h>
+#include <Storages/BlockNumberColumn.h>


 namespace DB
 {

+CompressionCodecPtr getCompressionCodecDelta(UInt8 delta_bytes_size);
+
+
 namespace ErrorCodes
 {
    extern const int NO_SUCH_COLUMN_IN_TABLE;
@ -721,11 +725,13 @@ CompressionCodecPtr ColumnsDescription::getCodecOrDefault(const String & column_

 CompressionCodecPtr ColumnsDescription::getCodecOrDefault(const String & column_name) const
 {
+    assert (column_name != BlockNumberColumn::name);
    return getCodecOrDefault(column_name, CompressionCodecFactory::instance().getDefaultCodec());
 }

 ASTPtr ColumnsDescription::getCodecDescOrDefault(const String & column_name, CompressionCodecPtr default_codec) const
 {
+    assert (column_name != BlockNumberColumn::name);
    const auto it = columns.get<1>().find(column_name);

    if (it == columns.get<1>().end() || !it->codec)
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@ -477,10 +477,6 @@ public:
    /// Moar hardening: this method is supposed to be used for debug assertions
    bool assertHasValidVersionMetadata() const;

-    /// Return hardlink count for part.
-    /// Required for keep data on remote FS when part has shadow copies.
-    UInt32 getNumberOfRefereneces() const;
-
    /// True if the part supports lightweight delete mutate.
    bool supportLightweightDeleteMutate() const;

--- a/src/Storages/MergeTree/IMergeTreeReader.cpp
+++ b/src/Storages/MergeTree/IMergeTreeReader.cpp
@ -62,7 +62,7 @@ const IMergeTreeReader::ValueSizeMap & IMergeTreeReader::getAvgValueSizeHints()
    return avg_value_size_hints;
 }

-void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) const
+void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, size_t block_number) const
 {
    try
    {
@ -71,7 +71,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e
            res_columns, num_rows,
            Nested::convertToSubcolumns(requested_columns),
            Nested::convertToSubcolumns(available_columns),
-            partially_read_columns, storage_snapshot->metadata);
+            partially_read_columns, storage_snapshot->metadata, block_number);

        should_evaluate_missing_defaults = std::any_of(
            res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; });
--- a/src/Storages/MergeTree/IMergeTreeReader.h
+++ b/src/Storages/MergeTree/IMergeTreeReader.h
@ -45,7 +45,7 @@ public:
    /// Add columns from ordered_names that are not present in the block.
    /// Missing columns are added in the order specified by ordered_names.
    /// num_rows is needed in case if all res_columns are nullptr.
-    void fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows) const;
+    void fillMissingColumns(Columns & res_columns, bool & should_evaluate_missing_defaults, size_t num_rows, size_t block_number = 0) const;
    /// Evaluate defaulted columns if necessary.
    void evaluateMissingDefaults(Block additional_columns, Columns & res_columns) const;

--- a/src/Storages/MergeTree/KeyCondition.cpp
+++ b/src/Storages/MergeTree/KeyCondition.cpp
@ -3,6 +3,7 @@
 #include <DataTypes/DataTypesNumber.h>
 #include <DataTypes/FieldToDataType.h>
 #include <DataTypes/getLeastSupertype.h>
+#include <DataTypes/Utils.h>
 #include <Interpreters/TreeRewriter.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/ExpressionActions.h>
@ -1258,10 +1259,18 @@ bool KeyCondition::tryPrepareSetIndex(

    const auto right_arg = func.getArgumentAt(1);

-    auto future_set = right_arg.tryGetPreparedSet(indexes_mapping, data_types);
+    auto future_set = right_arg.tryGetPreparedSet();
    if (!future_set)
        return false;

+    const auto set_types = future_set->getTypes();
+    size_t set_types_size = set_types.size();
+    size_t indexes_mapping_size = indexes_mapping.size();
+
+    for (auto & index_mapping : indexes_mapping)
+        if (index_mapping.tuple_index >= set_types_size)
+            return false;
+
    auto prepared_set = future_set->buildOrderedSetInplace(right_arg.getTreeContext().getQueryContext());
    if (!prepared_set)
        return false;
@ -1270,11 +1279,72 @@ bool KeyCondition::tryPrepareSetIndex(
    if (!prepared_set->hasExplicitSetElements())
        return false;

-    prepared_set->checkColumnsNumber(left_args_count);
-    for (size_t i = 0; i < indexes_mapping.size(); ++i)
-        prepared_set->checkTypesEqual(indexes_mapping[i].tuple_index, data_types[i]);
+    /** Try to convert set columns to primary key columns.
+      * Example: SELECT id FROM test_table WHERE id IN (SELECT 1);
+      * In this example table `id` column has type UInt64, Set column has type UInt8. To use index
+      * we need to convert set column to primary key column.
+      */
+    auto set_columns = prepared_set->getSetElements();
+    assert(set_types_size == set_columns.size());

-    out.set_index = std::make_shared<MergeTreeSetIndex>(prepared_set->getSetElements(), std::move(indexes_mapping));
+    for (size_t indexes_mapping_index = 0; indexes_mapping_index < indexes_mapping_size; ++indexes_mapping_index)
+    {
+        const auto & key_column_type = data_types[indexes_mapping_index];
+        size_t set_element_index = indexes_mapping[indexes_mapping_index].tuple_index;
+        auto set_element_type = set_types[set_element_index];
+        auto set_column = set_columns[set_element_index];
+
+        if (canBeSafelyCasted(set_element_type, key_column_type))
+        {
+            set_columns[set_element_index] = castColumn({set_column, set_element_type, {}}, key_column_type);
+            continue;
+        }
+
+        if (!key_column_type->canBeInsideNullable())
+            return false;
+
+        const NullMap * set_column_null_map = nullptr;
+
+        if (isNullableOrLowCardinalityNullable(set_element_type))
+        {
+            if (WhichDataType(set_element_type).isLowCardinality())
+            {
+                set_element_type = removeLowCardinality(set_element_type);
+                set_column = set_column->convertToFullColumnIfLowCardinality();
+            }
+
+            set_element_type = removeNullable(set_element_type);
+            const auto & set_column_nullable = assert_cast<const ColumnNullable &>(*set_column);
+            set_column_null_map = &set_column_nullable.getNullMapData();
+            set_column = set_column_nullable.getNestedColumnPtr();
+        }
+
+        auto nullable_set_column = castColumnAccurateOrNull({set_column, set_element_type, {}}, key_column_type);
+        const auto & nullable_set_column_typed = assert_cast<const ColumnNullable &>(*nullable_set_column);
+        const auto & nullable_set_column_null_map = nullable_set_column_typed.getNullMapData();
+        size_t nullable_set_column_null_map_size = nullable_set_column_null_map.size();
+
+        IColumn::Filter filter(nullable_set_column_null_map_size);
+
+        if (set_column_null_map)
+        {
+            for (size_t i = 0; i < nullable_set_column_null_map_size; ++i)
+                filter[i] = (*set_column_null_map)[i] || !nullable_set_column_null_map[i];
+
+            set_column = nullable_set_column_typed.filter(filter, 0);
+        }
+        else
+        {
+            for (size_t i = 0; i < nullable_set_column_null_map_size; ++i)
+                filter[i] = !nullable_set_column_null_map[i];
+
+            set_column = nullable_set_column_typed.getNestedColumn().filter(filter, 0);
+        }
+
+        set_columns[set_element_index] = std::move(set_column);
+    }
+
+    out.set_index = std::make_shared<MergeTreeSetIndex>(set_columns, std::move(indexes_mapping));
    return true;
 }

--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@ -218,6 +218,14 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
    ctx->need_remove_expired_values = false;
    ctx->force_ttl = false;

+    if (supportsBlockNumberColumn(global_ctx) && !global_ctx->storage_columns.contains(BlockNumberColumn::name))
+    {
+        global_ctx->storage_columns.emplace_back(NameAndTypePair{BlockNumberColumn::name,BlockNumberColumn::type});
+        global_ctx->all_column_names.emplace_back(BlockNumberColumn::name);
+        global_ctx->gathering_columns.emplace_back(NameAndTypePair{BlockNumberColumn::name,BlockNumberColumn::type});
+        global_ctx->gathering_column_names.emplace_back(BlockNumberColumn::name);
+    }
+
    SerializationInfo::Settings info_settings =
    {
        .ratio_of_defaults_for_sparse = global_ctx->data->getSettings()->ratio_of_defaults_for_sparse_serialization,
@ -251,12 +259,12 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
        }
    }

-    global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos, global_ctx->metadata_snapshot->getMetadataVersion());
-
    const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
    if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
        ctx->need_remove_expired_values = true;

+    global_ctx->new_data_part->setColumns(global_ctx->storage_columns, infos, global_ctx->metadata_snapshot->getMetadataVersion());
+
    if (ctx->need_remove_expired_values && global_ctx->ttl_merges_blocker->isCancelled())
    {
        LOG_INFO(ctx->log, "Part {} has values with expired TTL, but merges with TTL are cancelled.", global_ctx->new_data_part->name);
@ -998,6 +1006,17 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()

    if (global_ctx->deduplicate)
    {
+        /// We don't want to deduplicate by block number column
+        /// so if deduplicate_by_columns is empty, add all columns except _block_number
+        if (supportsBlockNumberColumn(global_ctx) && global_ctx->deduplicate_by_columns.empty())
+        {
+            for (const auto & col : global_ctx->merging_column_names)
+            {
+                if (col != BlockNumberColumn::name)
+                    global_ctx->deduplicate_by_columns.emplace_back(col);
+            }
+        }
+
        if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns))
            res_pipe.addTransform(std::make_shared<DistinctSortedTransform>(
                res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns));
--- a/src/Storages/MergeTree/MergeTask.h
+++ b/src/Storages/MergeTree/MergeTask.h
@ -13,6 +13,7 @@
 #include <QueryPipeline/QueryPipeline.h>
 #include <Compression/CompressedReadBufferFromFile.h>
 #include <Common/filesystemHelpers.h>
+#include <Storages/BlockNumberColumn.h>

 #include <memory>
 #include <list>
@ -388,6 +389,12 @@ private:

    Stages::iterator stages_iterator = stages.begin();

+    /// Whether the block number column should be persisted: requires the allow_experimental_block_number_column setting and a table without GROUP BY TTLs.
+    static bool supportsBlockNumberColumn(GlobalRuntimeContextPtr global_ctx)
+    {
+        return global_ctx->data->getSettings()->allow_experimental_block_number_column && global_ctx->metadata_snapshot->getGroupByTTLs().empty();
+    }
+
 };

 /// FIXME
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@ -78,6 +78,7 @@
 #include <Storages/VirtualColumnUtils.h>
 #include <Storages/MergeTree/MergeTreeDataPartBuilder.h>
 #include <Storages/MutationCommands.h>
+#include <Storages/BlockNumberColumn.h>

 #include <boost/range/algorithm_ext/erase.hpp>
 #include <boost/algorithm/string/join.hpp>
@ -3730,7 +3731,7 @@ void MergeTreeData::checkPartDynamicColumns(MutableDataPartPtr & part, DataParts
    const auto & part_columns = part->getColumns();
    for (const auto & part_column : part_columns)
    {
-        if (part_column.name == LightweightDeleteDescription::FILTER_COLUMN.name)
+        if (part_column.name == LightweightDeleteDescription::FILTER_COLUMN.name || part_column.name == BlockNumberColumn::name)
            continue;

        auto storage_column = columns.getPhysical(part_column.name);
@ -8269,6 +8270,7 @@ NamesAndTypesList MergeTreeData::getVirtuals() const
        NameAndTypePair("_sample_factor", std::make_shared<DataTypeFloat64>()),
        NameAndTypePair("_part_offset", std::make_shared<DataTypeUInt64>()),
        LightweightDeleteDescription::FILTER_COLUMN,
+        NameAndTypePair(BlockNumberColumn::name, BlockNumberColumn::type),
    };
 }

--- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp
@ -5,6 +5,7 @@
 #include <Interpreters/Context.h>
 #include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <Compression/CompressedReadBufferFromFile.h>
+#include <Storages/BlockNumberColumn.h>


 namespace DB
@ -64,6 +65,12 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter(
    ordered_columns_list.sort([this](const auto & lhs, const auto & rhs)
        { return *getColumnPosition(lhs.name) < *getColumnPosition(rhs.name); });

+    /// The _block_number column is not added by the user, but is persisted in a part after merge.
+    /// If _block_number is not present in the parts to be merged, then it won't have a position,
+    /// so check whether it is missing and, if so, add it at the end.
+    if (columns_list.contains(BlockNumberColumn::name) && !ordered_columns_list.contains(BlockNumberColumn::name))
+        ordered_columns_list.emplace_back(NameAndTypePair{BlockNumberColumn::name, BlockNumberColumn::type});
+
    return std::make_unique<MergeTreeDataPartWriterCompact>(
        shared_from_this(), ordered_columns_list, metadata_snapshot,
        indices_to_recalc, getMarksFileExtension(),
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp
@ -1,9 +1,12 @@
 #include <Storages/MergeTree/MergeTreeDataPartWriterCompact.h>
 #include <Storages/MergeTree/MergeTreeDataPartCompact.h>
+#include <Storages/BlockNumberColumn.h>

 namespace DB
 {

+    CompressionCodecPtr getCompressionCodecDelta(UInt8 delta_bytes_size);
+
 namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
@ -53,7 +56,14 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact(

    const auto & storage_columns = metadata_snapshot->getColumns();
    for (const auto & column : columns_list)
-        addStreams(column, storage_columns.getCodecDescOrDefault(column.name, default_codec));
+    {
+        ASTPtr compression;
+        if (column.name == BlockNumberColumn::name)
+            compression = BlockNumberColumn::compression_codec->getFullCodecDesc();
+        else
+            compression = storage_columns.getCodecDescOrDefault(column.name, default_codec);
+        addStreams(column, compression);
+    }
 }

 void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, const ASTPtr & effective_codec_desc)
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp
@ -6,9 +6,12 @@
 #include <Common/escapeForFileName.h>
 #include <Columns/ColumnSparse.h>
 #include <Common/logger_useful.h>
+#include <Storages/BlockNumberColumn.h>

 namespace DB
 {
+    CompressionCodecPtr getCompressionCodecDelta(UInt8 delta_bytes_size);
+
 namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
@ -87,7 +90,14 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide(
 {
    const auto & columns = metadata_snapshot->getColumns();
    for (const auto & it : columns_list)
-        addStreams(it, columns.getCodecDescOrDefault(it.name, default_codec));
+    {
+        ASTPtr compression;
+        if (it.name == BlockNumberColumn::name)
+            compression = BlockNumberColumn::compression_codec->getFullCodecDesc();
+        else
+            compression = columns.getCodecDescOrDefault(it.name, default_codec);
+        addStreams(it, compression);
+    }
 }

 void MergeTreeDataPartWriterWide::addStreams(
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@ -46,7 +46,7 @@
 #include <Functions/IFunction.h>

 #include <IO/WriteBufferFromOStream.h>
-
+#include <Storages/BlockNumberColumn.h>
 #include <Storages/MergeTree/ApproximateNearestNeighborIndexesCommon.h>

 namespace CurrentMetrics
@ -1232,6 +1232,10 @@ static void selectColumnNames(
        {
            virt_column_names.push_back(name);
        }
+        else if (name == BlockNumberColumn::name)
+        {
+            virt_column_names.push_back(name);
+        }
        else if (name == "_part_uuid")
        {
            virt_column_names.push_back(name);
--- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
@ -154,36 +154,45 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t

    if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
    {
-        const auto & data = column_array->getData();
-        const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
+        const auto & column_array_data = column_array->getData();
+        const auto & column_arary_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();

-        if (array.empty())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
+        const auto & column_array_offsets = column_array->getOffsets();
+        const size_t num_rows = column_array_offsets.size();

-        const auto & offsets = column_array->getOffsets();
-        const size_t num_rows = offsets.size();
+        /// The Annoy algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
+        /// are INSERTed into an Annoy-indexed column or if no value was specified at all, in which case the arrays take on their default
+        /// value, which is also empty.
+        if (column_array->isDefaultAt(0))
+            throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);

        /// Check all sizes are the same
-        size_t size = offsets[0];
+        size_t dimension = column_array_offsets[0];
        for (size_t i = 0; i < num_rows - 1; ++i)
-            if (offsets[i + 1] - offsets[i] != size)
-                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+            if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
+                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
+
+        /// Also check that the arrays in previously inserted blocks have the same length as the arrays in this block.
+        /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
+        /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
+        if (index && index->getDimensions() != dimension)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);

        if (!index)
-            index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
+            index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(dimension);

        /// Add all rows of block
-        index->add_item(index->get_n_items(), array.data());
+        index->add_item(index->get_n_items(), column_arary_data_float_data.data());
        for (size_t current_row = 1; current_row < num_rows; ++current_row)
-            index->add_item(index->get_n_items(), &array[offsets[current_row - 1]]);
+            index->add_item(index->get_n_items(), &column_arary_data_float_data[column_array_offsets[current_row - 1]]);
    }
    else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
    {
-        const auto & columns = column_tuple->getColumns();
+        const auto & column_tuple_columns = column_tuple->getColumns();

        /// TODO check if calling index->add_item() directly on the block's tuples is faster than materializing everything
-        std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
-        for (const auto & column : columns)
+        std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
+        for (const auto & column : column_tuple_columns)
        {
            const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
            for (size_t i = 0; i < pod_array.size(); ++i)
@ -363,7 +372,7 @@ void annoyIndexValidator(const IndexDescription & index, bool /* attach */)
    {
        throw Exception(
            ErrorCodes::ILLEGAL_COLUMN,
-            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
+            "Annoy indexes can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
    };

    DataTypePtr data_type = index.sample_block.getDataTypes()[0];
--- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp
@ -173,23 +173,32 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t

    if (const auto & column_array = typeid_cast<const ColumnArray *>(column_cut.get()))
    {
-        const auto & data = column_array->getData();
-        const auto & array = typeid_cast<const ColumnFloat32 &>(data).getData();
+        const auto & column_array_data = column_array->getData();
+        const auto & column_array_data_float_data = typeid_cast<const ColumnFloat32 &>(column_array_data).getData();

-        if (array.empty())
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Array has 0 rows, {} rows expected", rows_read);
+        const auto & column_array_offsets = column_array->getOffsets();
+        const size_t num_rows = column_array_offsets.size();

-        const auto & offsets = column_array->getOffsets();
-        const size_t num_rows = offsets.size();
+        /// The USearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays
+        /// are INSERTed into a USearch-indexed column or if no value was specified at all, in which case the arrays take on their default
+        /// value, which is also empty.
+        if (column_array->isDefaultAt(0))
+            throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. Did you try to INSERT default values?", index_column_name);

        /// Check all sizes are the same
-        size_t size = offsets[0];
+        size_t dimension = column_array_offsets[0];
        for (size_t i = 0; i < num_rows - 1; ++i)
-            if (offsets[i + 1] - offsets[i] != size)
-                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);
+            if (column_array_offsets[i + 1] - column_array_offsets[i] != dimension)
+                throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);
+
+        /// Also check that the arrays in previously inserted blocks have the same length as the arrays in this block.
+        /// Note that this guarantees consistency of dimension only within parts. We are unable to detect inconsistent dimensions across
+        /// parts - for this, a little help from the user is needed, e.g. CONSTRAINT cnstr CHECK length(array) = 42.
+        if (index && index->getDimensions() != dimension)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name);

        if (!index)
-            index = std::make_shared<USearchIndexWithSerialization<Metric>>(size, scalar_kind);
+            index = std::make_shared<USearchIndexWithSerialization<Metric>>(dimension, scalar_kind);

        /// Add all rows of block
        if (!index->reserve(unum::usearch::ceil2(index->size() + num_rows)))
@ -197,7 +206,7 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t

        for (size_t current_row = 0; current_row < num_rows; ++current_row)
        {
-            auto rc = index->add(static_cast<uint32_t>(index->size()), &array[offsets[current_row - 1]]);
+            auto rc = index->add(static_cast<uint32_t>(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]);
            if (!rc)
                throw Exception(ErrorCodes::INCORRECT_DATA, rc.error.release());

@ -208,9 +217,9 @@ void MergeTreeIndexAggregatorUSearch<Metric>::update(const Block & block, size_t
    }
    else if (const auto & column_tuple = typeid_cast<const ColumnTuple *>(column_cut.get()))
    {
-        const auto & columns = column_tuple->getColumns();
-        std::vector<std::vector<Float32>> data{column_tuple->size(), std::vector<Float32>()};
-        for (const auto & column : columns)
+        const auto & column_tuple_columns = column_tuple->getColumns();
+        std::vector<std::vector<Float32>> data(column_tuple->size(), std::vector<Float32>());
+        for (const auto & column : column_tuple_columns)
        {
            const auto & pod_array = typeid_cast<const ColumnFloat32 *>(column.get())->getData();
            for (size_t i = 0; i < pod_array.size(); ++i)
@ -413,7 +422,8 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */)
    auto throw_unsupported_underlying_column_exception = []()
    {
        throw Exception(
-            ErrorCodes::ILLEGAL_COLUMN, "USearch indexes can only be created on columns of type Array(Float32) and Tuple(Float32)");
+            ErrorCodes::ILLEGAL_COLUMN,
+            "USearch can only be created on columns of type Array(Float32) and Tuple(Float32[, Float32[, ...]])");
    };

    DataTypePtr data_type = index.sample_block.getDataTypes()[0];
--- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp
+++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp
@ -9,6 +9,7 @@
 #include <DataTypes/DataTypeUUID.h>
 #include <DataTypes/DataTypeArray.h>
 #include <Processors/Transforms/AggregatingTransform.h>
+#include <Storages/BlockNumberColumn.h>
 #include <city.h>

 namespace DB
@ -24,7 +25,8 @@ namespace ErrorCodes
 static void injectNonConstVirtualColumns(
    size_t rows,
    Block & block,
-    const Names & virtual_columns);
+    const Names & virtual_columns,
+    MergeTreeReadTask * task = nullptr);

 static void injectPartConstVirtualColumns(
    size_t rows,
@ -247,7 +249,8 @@ namespace
 static void injectNonConstVirtualColumns(
    size_t rows,
    Block & block,
-    const Names & virtual_columns)
+    const Names & virtual_columns,
+    MergeTreeReadTask * task)
 {
    VirtualColumnsInserter inserter(block);
    for (const auto & virtual_column_name : virtual_columns)
@ -278,6 +281,24 @@ static void injectNonConstVirtualColumns(

                inserter.insertUInt8Column(column, virtual_column_name);
        }
+
+        if (virtual_column_name == BlockNumberColumn::name)
+        {
+            ColumnPtr column;
+            if (rows)
+            {
+                size_t value = 0;
+                if (task)
+                {
+                    value = task->getInfo().data_part ? task->getInfo().data_part->info.min_block : 0;
+                }
+                column = BlockNumberColumn::type->createColumnConst(rows, value)->convertToFullColumnIfConst();
+            }
+            else
+                column = BlockNumberColumn::type->createColumn();
+
+            inserter.insertUInt64Column(column, virtual_column_name);
+        }
    }
 }

@ -368,7 +389,7 @@ void MergeTreeSelectProcessor::injectVirtualColumns(
 {
    /// First add non-const columns that are filled by the range reader and then const columns that we will fill ourselves.
    /// Note that the order is important: virtual columns filled by the range reader must go first
-    injectNonConstVirtualColumns(row_count, block, virtual_columns);
+    injectNonConstVirtualColumns(row_count, block, virtual_columns,task);
    injectPartConstVirtualColumns(row_count, block, task, partition_value_type, virtual_columns);
 }

--- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp
+++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp
@ -176,7 +176,7 @@ try
            current_mark += (rows_to_read == rows_read);

            bool should_evaluate_missing_defaults = false;
-            reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read);
+            reader->fillMissingColumns(columns, should_evaluate_missing_defaults, rows_read, data_part->info.min_block);

            if (should_evaluate_missing_defaults)
            {
--- a/Show More
+++ b/Show More