Merge branch 'master' into remove-trash-3

2024-11-21 15:12:02 +00:00 · 2022-09-10 05:08:27 +02:00 · 2022-09-10 05:08:27 +02:00 · 5e14b4575b
commit 5e14b4575b
parent 51dc796275 6924a931c6
187 changed files with 3927 additions and 1087 deletions
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@ -349,6 +349,100 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinDarwin:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_darwin
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinDarwinAarch64:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_darwin_aarch64
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -425,6 +519,46 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
+  BuilderSpecialReport:
+    needs:
+      - BuilderBinDarwin
+      - BuilderBinDarwinAarch64
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/report_check
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=ClickHouse special build check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Report Builder
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 build_report_check.py "$CHECK_NAME"
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
 ##############################################################################################
 ########################### FUNCTIONAl STATELESS TESTS #######################################
 ##############################################################################################
@ -592,6 +726,7 @@ jobs:
      - DockerHubPush
      - DockerServerImages
      - BuilderReport
+      - BuilderSpecialReport
      - FunctionalStatelessTestAsan
      - FunctionalStatefulTestDebug
      - StressTestTsan
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -923,6 +923,53 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1011,6 +1058,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -935,6 +935,51 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush, FastTest, StyleCheck]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1023,6 +1068,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@ -29,8 +29,12 @@ jobs:
        rm -rf "$TEMP_PATH" && mkdir -p "$TEMP_PATH"
        cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
        cd "$REPO_COPY"
+        # Download and push packages to artifactory
        python3 ./tests/ci/push_to_artifactory.py --release "${{ github.ref }}" \
          --commit '${{ github.sha }}' --artifactory-url "${{ secrets.JFROG_ARTIFACTORY_URL }}" --all
+        # Download macos binaries to ${{runner.temp}}/download_binary
+        python3 ./tests/ci/download_binary.py binary_darwin binary_darwin_aarch64
+        mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory'
    - name: Upload packages to release assets
      uses: svenstaro/upload-release-action@v2
      with:
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@ -426,6 +426,100 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinDarwin:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_darwin
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinDarwinAarch64:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_darwin_aarch64
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -505,6 +599,46 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
+  BuilderSpecialReport:
+    needs:
+      - BuilderBinDarwin
+      - BuilderBinDarwinAarch64
+    runs-on: [self-hosted, style-checker]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/report_check
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=ClickHouse special build check
+          NEEDS_DATA_PATH=${{runner.temp}}/needs.json
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Report Builder
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cat > "$NEEDS_DATA_PATH" << 'EOF'
+          ${{ toJSON(needs) }}
+          EOF
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 build_report_check.py "$CHECK_NAME"
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
 ##############################################################################################
 ########################### FUNCTIONAl STATELESS TESTS #######################################
 ##############################################################################################
@ -1847,6 +1981,7 @@ jobs:
      - DockerHubPush
      - DockerServerImages
      - BuilderReport
+      - BuilderSpecialReport
      - FunctionalStatelessTestDebug0
      - FunctionalStatelessTestDebug1
      - FunctionalStatelessTestDebug2
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -143,6 +143,8 @@ include (cmake/add_warning.cmake)
 if (COMPILER_CLANG)
    # generate ranges for fast "addr2line" search
    if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
+        # NOTE: clang has a bug: it does not emit .debug_aranges
+        # with ThinLTO, so a custom ld.lld wrapper is shipped in docker images.
        set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges")
    endif ()

--- a/README.md
+++ b/README.md
@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a
 * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any.

 ## Upcoming events
-* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in-person meetup hosted by our friends at Barracuda in the Bay Area.
--- a/cmake/cpu_features.cmake
+++ b/cmake/cpu_features.cmake
@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0)
 option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0)
 option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0)

+# X86: Allow compilation for a SSE2-only target machine. Done by a special build in CI for embedded or very old hardware.
+option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0)
+if (NO_SSE3_OR_HIGHER)
+    SET(ENABLE_SSSE3 0)
+    SET(ENABLE_SSE41 0)
+    SET(ENABLE_SSE42 0)
+    SET(ENABLE_PCLMULQDQ 0)
+    SET(ENABLE_POPCNT 0)
+    SET(ENABLE_AVX 0)
+    SET(ENABLE_AVX2 0)
+    SET(ENABLE_AVX512 0)
+    SET(ENABLE_AVX512_VBMI 0)
+    SET(ENABLE_BMI 0)
+    SET(ENABLE_AVX2_FOR_SPEC_OP 0)
+    SET(ENABLE_AVX512_FOR_SPEC_OP 0)
+endif()
+
 option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)

 if (ARCH_NATIVE)
--- a/cmake/ld.lld.in
+++ b/cmake/ld.lld.in
@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# This is a workaround for a bug in llvm/clang,
+# which does not produce .debug_aranges with LTO.
+#
+# NOTE: this is a temporary solution that should be removed once [1] is
+# resolved.
+#
+#   [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8
+
+# NOTE: only -flto=thin is supported.
+# NOTE: it is not possible to check whether -gdwarf-aranges was passed initially.
+if [[ "$*" =~ -plugin-opt=thinlto ]]; then
+    exec "@LLD_PATH@" -mllvm -generate-arange-section "$@"
+else
+    exec "@LLD_PATH@" "$@"
+fi
--- a/cmake/split_debug_symbols.cmake
+++ b/cmake/split_debug_symbols.cmake
@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols)
       COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin"
       COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
       # Splits debug symbols into separate file, leaves the binary untouched:
-       COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
+       COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check:
       COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@ -94,8 +94,13 @@ if (LINKER_NAME)
        if (NOT LLD_PATH)
            message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.")
        endif ()
-        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}")
-        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}")
+
+        # This is a temporary quirk to emit .debug_aranges with ThinLTO
+        set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld")
+        configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY)
+
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
+        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
    else ()
        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
--- a/contrib/vectorscan-cmake/CMakeLists.txt
+++ b/contrib/vectorscan-cmake/CMakeLists.txt
@ -1,6 +1,6 @@
 # We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan.

-if (ARCH_AMD64)
+if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER)
    option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES})
 endif()

--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -130,6 +130,7 @@ def parse_env_variables(
    ARM_SUFFIX = "-aarch64"
    FREEBSD_SUFFIX = "-freebsd"
    PPC_SUFFIX = "-ppc64le"
+    AMD64_SSE2_SUFFIX = "-amd64sse2"

    result = []
    result.append("OUTPUT_DIR=/output")
@ -141,6 +142,7 @@ def parse_env_variables(
    is_cross_arm = compiler.endswith(ARM_SUFFIX)
    is_cross_ppc = compiler.endswith(PPC_SUFFIX)
    is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX)
+    is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX)

    if is_cross_darwin:
        cc = compiler[: -len(DARWIN_SUFFIX)]
@ -186,6 +188,10 @@ def parse_env_variables(
        cmake_flags.append(
            "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake"
        )
+    elif is_amd64_sse2:
+        cc = compiler[: -len(AMD64_SSE2_SUFFIX)]
+        result.append("DEB_ARCH=amd64")
+        cmake_flags.append("-DNO_SSE3_OR_HIGHER=1")
    else:
        cc = compiler
        result.append("DEB_ARCH=amd64")
@ -339,6 +345,7 @@ if __name__ == "__main__":
            "clang-14-darwin-aarch64",
            "clang-14-aarch64",
            "clang-14-ppc64le",
+            "clang-14-amd64sse2",
            "clang-14-freebsd",
            "gcc-11",
        ),
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -1,8 +1,15 @@
 #!/bin/bash
 # shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031

-set -eux
+set -x
+
+# core.COMM.PID-TID
+sysctl kernel.core_pattern='core.%e.%p-%P'
+
+set -e
+set -u
 set -o pipefail
+
 trap "exit" INT TERM
 # The watchdog is in the separate process group, so we have to kill it separately
 # if the script terminates earlier.
@ -87,6 +94,19 @@ function configure
    # TODO figure out which ones are needed
    cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
    cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
+
+    cat > db/config.d/core.xml <<EOL
+<clickhouse>
+    <core_dump>
+        <!-- 100GiB -->
+        <size_limit>107374182400</size_limit>
+    </core_dump>
+    <!-- NOTE: no need to configure core_path,
+         since clickhouse is not started as daemon (via clickhouse start)
+    -->
+    <core_path>$PWD</core_path>
+</clickhouse>
+EOL
 }

 function watchdog
@ -180,7 +200,6 @@ handle SIGUSR2 nostop noprint pass
 handle SIG$RTMIN nostop noprint pass
 info signals
 continue
-gcore
 backtrace full
 thread apply all backtrace full
 info registers
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -3,8 +3,14 @@
 # shellcheck disable=SC2086
 # shellcheck disable=SC2024

+# Avoid overlaps with previous runs
+dmesg --clear
+
 set -x

+# core.COMM.PID-TID
+sysctl kernel.core_pattern='core.%e.%p-%P'
+
 # Thread Fuzzer allows to check more permutations of possible thread scheduling
 # and find more potential issues.

@ -101,6 +107,19 @@ EOL
        </default>
    </profiles>
 </clickhouse>
+EOL
+
+    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
+<clickhouse>
+    <core_dump>
+        <!-- 100GiB -->
+        <size_limit>107374182400</size_limit>
+    </core_dump>
+    <!-- NOTE: no need to configure core_path,
+         since clickhouse is not started as daemon (via clickhouse start)
+    -->
+    <core_path>$PWD</core_path>
+</clickhouse>
 EOL
 }

@ -157,7 +176,6 @@ handle SIGUSR2 nostop noprint pass
 handle SIG$RTMIN nostop noprint pass
 info signals
 continue
-gcore
 backtrace full
 thread apply all backtrace full
 info registers
@ -501,8 +519,7 @@ done
 clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%'), rowNumberInAllBlocks() LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv
 [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv

-# Core dumps (see gcore)
-# Default filename is 'core.PROCESS_ID'
+# Core dumps
 for core in core.*; do
    pigz $core
    mv $core.gz /test_output/
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@ -140,6 +140,6 @@ hash cmake

 ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.

-They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request.
+Binaries are built for stable and LTS releases, and also for every commit to `master` and for each pull request.

 To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
--- a/docs/en/getting-started/example-datasets/nypd_complaint_data.md
+++ b/docs/en/getting-started/example-datasets/nypd_complaint_data.md
@ -0,0 +1,654 @@
+---
+slug: /en/getting-started/example-datasets/nypd_complaint_data
+sidebar_label: NYPD Complaint Data
+description: "Ingest and query Tab Separated Value data in 5 steps"
+title: NYPD Complaint Data
+---
+
+Tab separated value, or TSV, files are common and may include field headings as the first line of the file. ClickHouse can ingest TSVs, and also can query TSVs without ingesting the files.  This guide covers both of these cases. If you need to query or ingest CSV files, the same techniques work, simply substitute `TSV` with `CSV` in your format arguments.
+
+While working through this guide you will:
+- **Investigate**: Query the structure and content of the TSV file.
+- **Determine the target ClickHouse schema**: Choose proper data types and map the existing data to those types.
+- **Create a ClickHouse table**.
+- **Preprocess and stream** the data to ClickHouse.
+- **Run some queries** against ClickHouse.
+
+The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly.
+
+**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243)  
+**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page
+
+## Prerequisites
+- Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**.
+- Install [ClickHouse server and client](../../getting-started/install.md).
+- [Launch](../../getting-started/install.md#launch) ClickHouse server, and connect with `clickhouse-client`
+
+### A note about the commands described in this guide
+There are two types of commands in this guide:
+- Some of the commands are querying the TSV files, these are run at the command prompt.
+- The rest of the commands are querying ClickHouse, and these are run in the `clickhouse-client` or Play UI.
+
+:::note
+The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed.
+:::
+
+## Familiarize yourself with the TSV file
+
+Before starting to work with the ClickHouse database familiarize yourself with the data. 
+
+### Look at the fields in the source TSV file
+
+This is an example of a command to query a TSV file, but don't run it yet.
+```sh
+clickhouse-local --query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')"
+```
+
+Sample response
+```response
+CMPLNT_NUM                  Nullable(Float64)					
+ADDR_PCT_CD                 Nullable(Float64)					
+BORO_NM                     Nullable(String)					
+CMPLNT_FR_DT                Nullable(String)					
+CMPLNT_FR_TM                Nullable(String)					
+```
+
+:::tip
+Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples.  This is not always the case.  Because ClickHouse is routinely used with datasets containing billions of records, there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric.  By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
+you can get a better idea of the content.
+
+Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
+:::
+
+Run this command at your command prompt.  You will be using `clickhouse-local` to query the data in the TSV file you downloaded.
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" 
+```
+
+Result:
+```response
+CMPLNT_NUM        Nullable(String)					
+ADDR_PCT_CD       Nullable(Float64)					
+BORO_NM           Nullable(String)					
+CMPLNT_FR_DT      Nullable(String)					
+CMPLNT_FR_TM      Nullable(String)					
+CMPLNT_TO_DT      Nullable(String)					
+CMPLNT_TO_TM      Nullable(String)					
+CRM_ATPT_CPTD_CD  Nullable(String)					
+HADEVELOPT        Nullable(String)					
+HOUSING_PSA       Nullable(Float64)					
+JURISDICTION_CODE Nullable(Float64)					
+JURIS_DESC        Nullable(String)					
+KY_CD             Nullable(Float64)					
+LAW_CAT_CD        Nullable(String)					
+LOC_OF_OCCUR_DESC Nullable(String)					
+OFNS_DESC         Nullable(String)					
+PARKS_NM          Nullable(String)					
+PATROL_BORO       Nullable(String)					
+PD_CD             Nullable(Float64)					
+PD_DESC           Nullable(String)					
+PREM_TYP_DESC     Nullable(String)					
+RPT_DT            Nullable(String)					
+STATION_NAME      Nullable(String)					
+SUSP_AGE_GROUP    Nullable(String)					
+SUSP_RACE         Nullable(String)					
+SUSP_SEX          Nullable(String)					
+TRANSIT_DISTRICT  Nullable(Float64)					
+VIC_AGE_GROUP     Nullable(String)					
+VIC_RACE          Nullable(String)					
+VIC_SEX           Nullable(String)					
+X_COORD_CD        Nullable(Float64)					
+Y_COORD_CD        Nullable(Float64)					
+Latitude          Nullable(Float64)					
+Longitude         Nullable(Float64)					
+Lat_Lon           Tuple(Nullable(Float64), Nullable(Float64))					
+New Georeferenced Column Nullable(String)
+```
+
+At this point you should check that the columns in the TSV file match the names and types specified in the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243).  The data types are not very specific, all numeric fields are set to `Nullable(Float64)`, and all other fields are `Nullable(String)`.  When you create a ClickHouse table to store the data you can specify more appropriate and performant types.
+
+### Determine the proper schema
+
+In order to figure out what types should be used for the fields it is necessary to know what the data looks like. For example, the field `JURISDICTION_CODE` is a numeric: should it be a `UInt8`, or an `Enum`, or is `Float64` appropriate?
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select JURISDICTION_CODE, count() FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ GROUP BY JURISDICTION_CODE
+ ORDER BY JURISDICTION_CODE
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─JURISDICTION_CODE─┬─count()─┐
+│                 0 │  188875 │
+│                 1 │    4799 │
+│                 2 │   13833 │
+│                 3 │     656 │
+│                 4 │      51 │
+│                 6 │       5 │
+│                 7 │       2 │
+│                 9 │      13 │
+│                11 │      14 │
+│                12 │       5 │
+│                13 │       2 │
+│                14 │      70 │
+│                15 │      20 │
+│                72 │     159 │
+│                87 │       9 │
+│                88 │      75 │
+│                97 │     405 │
+└───────────────────┴─────────┘
+```
+
+The query response shows that the `JURISDICTION_CODE` fits well in a `UInt8`.
+
+Similarly, look at some of the `String` fields and see if they are well suited to being `DateTime` or [`LowCardinality(String)`](../../sql-reference/data-types/lowcardinality.md) fields.
+
+For example, the field `PARKS_NM` is described as "Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)".  The names of parks in New York City may be a good candidate for a `LowCardinality(String)`:
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select count(distinct PARKS_NM) FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─uniqExact(PARKS_NM)─┐
+│                 319 │
+└─────────────────────┘
+```
+
+Have a look at some of the park names:
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select distinct PARKS_NM FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ LIMIT 10
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─PARKS_NM───────────────────┐
+│ (null)                     │
+│ ASSER LEVY PARK            │
+│ JAMES J WALKER PARK        │
+│ BELT PARKWAY/SHORE PARKWAY │
+│ PROSPECT PARK              │
+│ MONTEFIORE SQUARE          │
+│ SUTTON PLACE PARK          │
+│ JOYCE KILMER PARK          │
+│ ALLEY ATHLETIC PLAYGROUND  │
+│ ASTORIA PARK               │
+└────────────────────────────┘
+```
+
+The dataset in use at the time of writing has only a few hundred distinct parks and playgrounds in the `PARKS_NM` column.  This is a small number based on the [LowCardinality](../../sql-reference/data-types/lowcardinality.md#lowcardinality-dscr) recommendation to stay below 10,000 distinct strings in a `LowCardinality(String)` field.
+
+### DateTime fields
+Based on the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) there are date and time fields for the start and end of the reported event.  Looking at the min and max of the `CMPLNT_FR_DT` and `CMPLNT_TO_DT` gives an idea of whether or not the fields are always populated:
+
+```sh title="CMPLNT_FR_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_DT), max(CMPLNT_FR_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_DT)─┬─max(CMPLNT_FR_DT)─┐
+│ 01/01/1973        │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_DT), max(CMPLNT_TO_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_DT)─┬─max(CMPLNT_TO_DT)─┐
+│                   │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_FR_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_TM), max(CMPLNT_FR_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_TM)─┬─max(CMPLNT_FR_TM)─┐
+│ 00:00:00          │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_TM), max(CMPLNT_TO_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_TM)─┬─max(CMPLNT_TO_TM)─┐
+│ (null)            │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+## Make a plan
+
+Based on the above investigation:
+- `JURISDICTION_CODE` should be cast as `UInt8`.
+- `PARKS_NM` should be cast to `LowCardinality(String)`
+- `CMPLNT_FR_DT` and `CMPLNT_FR_TM` are always populated (possibly with a default time of `00:00:00`)
+- `CMPLNT_TO_DT` and `CMPLNT_TO_TM` may be empty
+- Dates and times are stored in separate fields in the source
+- Dates are `mm/dd/yyyy` format
+- Times are `hh:mm:ss` format
+- Dates and times can be concatenated into DateTime types
+- There are some dates before January 1st 1970, which means we need a 64 bit DateTime
+
+:::note
+There are many more changes to be made to the types, they all can be determined by following the same investigation steps.  Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions.  The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics.
+:::
+
+## Concatenate the date and time fields
+
+To concatenate the date and time fields `CMPLNT_FR_DT` and `CMPLNT_FR_TM` into a single `String` that can be cast to a `DateTime`, select the two fields joined by the concatenation operator: `CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM`.  The `CMPLNT_TO_DT` and `CMPLNT_TO_TM` fields are handled similarly.
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM AS complaint_begin FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+LIMIT 10
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─complaint_begin─────┐
+│ 07/29/2010 00:01:00 │
+│ 12/01/2011 12:00:00 │
+│ 04/01/2017 15:00:00 │
+│ 03/26/2018 17:20:00 │
+│ 01/01/2019 00:00:00 │
+│ 06/14/2019 00:00:00 │
+│ 11/29/2021 20:00:00 │
+│ 12/04/2021 00:35:00 │
+│ 12/05/2021 12:50:00 │
+│ 12/07/2021 20:30:00 │
+└─────────────────────┘
+```
+
+## Convert the date and time String to a DateTime64 type
+
+Earlier in the guide we discovered that there are dates in the TSV file before January 1st 1970, which means that we need a 64 bit DateTime type for the dates.  The dates also need to be converted from `MM/DD/YYYY` to `YYYY/MM/DD` format.  Both of these can be done with [`parseDateTime64BestEffort()`](../../sql-reference/functions/type-conversion-functions.md#parsedatetime64besteffort).
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+      (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+select parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin,
+       parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end
+FROM file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ORDER BY complaint_begin ASC
+LIMIT 25
+FORMAT PrettyCompact"
+```
+
+Lines 2 and 3 above contain the concatenation from the previous step, and lines 4 and 5 above parse the strings into `DateTime64`.  As the complaint end time is not guaranteed to exist `parseDateTime64BestEffortOrNull` is used.
+
+Result:
+```response
+┌─────────complaint_begin─┬───────────complaint_end─┐
+│ 1925-01-01 10:00:00.000 │ 2021-02-12 09:30:00.000 │
+│ 1925-01-01 11:37:00.000 │ 2022-01-16 11:49:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2021-12-31 00:00:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2022-02-02 22:00:00.000 │
+│ 1925-01-01 19:00:00.000 │ 2022-04-14 05:00:00.000 │
+│ 1955-09-01 19:55:00.000 │ 2022-08-01 00:45:00.000 │
+│ 1972-03-17 11:40:00.000 │ 2022-03-17 11:43:00.000 │
+│ 1972-05-23 22:00:00.000 │ 2022-05-24 09:00:00.000 │
+│ 1972-05-30 23:37:00.000 │ 2022-05-30 23:50:00.000 │
+│ 1972-07-04 02:17:00.000 │                    ᴺᵁᴸᴸ │
+│ 1973-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1975-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1976-11-05 00:01:00.000 │ 1988-10-05 23:59:00.000 │
+│ 1977-01-01 00:00:00.000 │ 1977-01-01 23:59:00.000 │
+│ 1977-12-20 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-01-01 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-08-14 00:00:00.000 │ 1987-08-13 23:59:00.000 │
+│ 1983-01-07 00:00:00.000 │ 1990-01-06 00:00:00.000 │
+│ 1984-01-01 00:01:00.000 │ 1984-12-31 23:59:00.000 │
+│ 1985-01-01 12:00:00.000 │ 1987-12-31 15:00:00.000 │
+│ 1985-01-11 09:00:00.000 │ 1985-12-31 12:00:00.000 │
+│ 1986-03-16 00:05:00.000 │ 2022-03-16 00:45:00.000 │
+│ 1987-01-07 00:00:00.000 │ 1987-01-09 00:00:00.000 │
+│ 1988-04-03 18:30:00.000 │ 2022-08-03 09:45:00.000 │
+│ 1988-07-29 12:00:00.000 │ 1990-07-27 22:00:00.000 │
+└─────────────────────────┴─────────────────────────┘
+```
+:::note
+The dates shown as `1925` above are from errors in the data.  There are several records in the original data with dates in the years `1019` - `1022` that should be `2019` - `2022`.  They are being stored as Jan 1st 1925 as that is the earliest date with a 64 bit DateTime.
+:::
+
+## Create a table
+
+The decisions made above on the data types used for the columns are reflected in the table schema
+below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table.  At least one
+of `ORDER BY` or `PRIMARY KEY` must be specified.  Here are some guidelines on deciding on the 
+columns to include in `ORDER BY`, and more information is in the *Next Steps* section at the end
+of this document.
+
+### Order By and Primary Key clauses
+
+- The `ORDER BY` tuple should include fields that are used in query filters
+- To maximize compression on disk the `ORDER BY` tuple should be ordered by ascending cardinality
+- If it exists, the `PRIMARY KEY` tuple must be a subset of the `ORDER BY` tuple
+- If only `ORDER BY` is specified, then the same tuple will be used as `PRIMARY KEY`
+- The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple
+- The `PRIMARY KEY` index is kept in main memory
+
+Looking at the dataset and the questions that might be answered by querying it we might
+decide that we would look at the types of crimes reported over time in the five boroughs of
+New York City.  These fields might be then included in the `ORDER BY`:
+
+| Column      | Description (from the data dictionary)                 |
+| ----------- | ---------------------------------------------------    |
+| OFNS_DESC   | Description of offense corresponding with key code     |
+| RPT_DT      | Date event was reported to police                      |
+| BORO_NM     | The name of the borough in which the incident occurred |
+
+
+Querying the TSV file for the cardinality of the three candidate columns:
+
+```bash
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC,
+        formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT,
+        formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM
+  FROM
+  file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+  FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐
+│ 60.00                 │ 306.00             │ 6.00                │
+└───────────────────────┴────────────────────┴─────────────────────┘
+```
+Ordering by cardinality, the `ORDER BY` becomes:
+
+```
+ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT )
+```
+:::note
+The table below will use more easily read column names, the above names will be mapped to
+```
+ORDER BY ( borough, offense_description, date_reported )
+```
+:::
+
+Putting together the changes to data types and the `ORDER BY` tuple gives this table structure:
+
+```sql
+CREATE TABLE NYPD_Complaint ( 
+    complaint_number     String,
+    precinct             UInt8,
+    borough              LowCardinality(String),
+    complaint_begin      DateTime64(0,'America/New_York'),
+    complaint_end        DateTime64(0,'America/New_York'),
+    was_crime_completed  String,
+    housing_authority    String,
+    housing_level_code   UInt32,
+    jurisdiction_code    UInt8, 
+    jurisdiction         LowCardinality(String),
+    offense_code         UInt8,
+    offense_level        LowCardinality(String),
+    location_descriptor  LowCardinality(String),
+    offense_description  LowCardinality(String),
+    park_name            LowCardinality(String),
+    patrol_borough       LowCardinality(String),
+    PD_CD                UInt16,
+    PD_DESC              String,
+    location_type        LowCardinality(String),
+    date_reported        Date,
+    transit_station      LowCardinality(String),
+    suspect_age_group    LowCardinality(String),
+    suspect_race         LowCardinality(String),
+    suspect_sex          LowCardinality(String),
+    transit_district     UInt8,
+    victim_age_group     LowCardinality(String),
+    victim_race          LowCardinality(String),
+    victim_sex           LowCardinality(String),
+    NY_x_coordinate      UInt32,
+    NY_y_coordinate      UInt32,
+    Latitude             Float64,
+    Longitude            Float64
+) ENGINE = MergeTree
+  ORDER BY ( borough, offense_description, date_reported )
+```
+
+### Finding the primary key of a table
+
+The ClickHouse `system` database, specifically `system.tables`, has all of the information about the table you
+just created.  This query shows the `ORDER BY` (sorting key), and the `PRIMARY KEY`:
+```sql
+SELECT
+    partition_key,
+    sorting_key,
+    primary_key,
+    table
+FROM system.tables
+WHERE table = 'NYPD_Complaint'
+FORMAT Vertical
+```
+
+Response
+```response
+Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01
+
+Row 1:
+──────
+partition_key: 
+sorting_key:   borough, offense_description, date_reported
+primary_key:   borough, offense_description, date_reported
+table:         NYPD_Complaint
+
+1 row in set. Elapsed: 0.001 sec.
+```
+
+## Preprocess and Import Data {#preprocess-import-data}
+
+We will use the `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
+
+### `clickhouse-local` arguments used
+
+:::tip
+`table='input'` appears in the arguments to clickhouse-local below.  clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table.  By default the table is named `table`.  In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`.
+:::
+  
+```sh
+cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \
+  | clickhouse-local --table='input' --input-format='TSVWithNames' \
+  --input_format_max_rows_to_read_for_schema_inference=2000 \
+  --query "
+    WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+     (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+    SELECT
+      CMPLNT_NUM                                  AS complaint_number,
+      ADDR_PCT_CD                                 AS precinct,
+      BORO_NM                                     AS borough,
+      parseDateTime64BestEffort(CMPLNT_START)     AS complaint_begin,
+      parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end,
+      CRM_ATPT_CPTD_CD                            AS was_crime_completed,
+      HADEVELOPT                                  AS housing_authority_development,
+      HOUSING_PSA                                 AS housing_level_code,
+      JURISDICTION_CODE                           AS jurisdiction_code, 
+      JURIS_DESC                                  AS jurisdiction,
+      KY_CD                                       AS offense_code,
+      LAW_CAT_CD                                  AS offense_level,
+      LOC_OF_OCCUR_DESC                           AS location_descriptor,
+      OFNS_DESC                                   AS offense_description, 
+      PARKS_NM                                    AS park_name,
+      PATROL_BORO                                 AS patrol_borough,
+      PD_CD,
+      PD_DESC,
+      PREM_TYP_DESC                               AS location_type,
+      toDate(parseDateTimeBestEffort(RPT_DT))     AS date_reported,
+      STATION_NAME                                AS transit_station,
+      SUSP_AGE_GROUP                              AS suspect_age_group,
+      SUSP_RACE                                   AS suspect_race,
+      SUSP_SEX                                    AS suspect_sex,
+      TRANSIT_DISTRICT                            AS transit_district,
+      VIC_AGE_GROUP                               AS victim_age_group,   
+      VIC_RACE                                    AS victim_race,
+      VIC_SEX                                     AS victim_sex,
+      X_COORD_CD                                  AS NY_x_coordinate,
+      Y_COORD_CD                                  AS NY_y_coordinate,
+      Latitude,
+      Longitude
+    FROM input" \
+  | clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV'
+```  
+
+## Validate the Data {#validate-data}
+
+:::note
+The dataset changes once or more per year, your counts may not match what is in this document.
+:::
+
+Query:
+
+```sql
+SELECT count()
+FROM NYPD_Complaint
+```
+
+Result:
+
+```text
+┌─count()─┐
+│  208993 │
+└─────────┘
+
+1 row in set. Elapsed: 0.001 sec. 
+```
+
+The size of the dataset in ClickHouse is just 12% of the original TSV file. Compare the size of the original TSV file with the size of the table:
+
+Query:
+
+```sql
+SELECT formatReadableSize(total_bytes)
+FROM system.tables
+WHERE name = 'NYPD_Complaint'
+```
+
+Result:
+```text
+┌─formatReadableSize(total_bytes)─┐
+│ 8.63 MiB                        │
+└─────────────────────────────────┘
+```
+
+
+## Run Some Queries {#run-queries}
+
+### Query 1. Compare the number of complaints by month
+
+Query:
+
+```sql
+SELECT
+    dateName('month', date_reported) AS month,
+    count() AS complaints,
+    bar(complaints, 0, 50000, 80)
+FROM NYPD_Complaint
+GROUP BY month
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 7fbd4244-b32a-4acf-b1f3-c3aa198e74d9
+
+┌─month─────┬─complaints─┬─bar(count(), 0, 50000, 80)───────────────────────────────┐
+│ March     │      34536 │ ███████████████████████████████████████████████████████▎ │
+│ May       │      34250 │ ██████████████████████████████████████████████████████▋  │
+│ April     │      32541 │ ████████████████████████████████████████████████████     │
+│ January   │      30806 │ █████████████████████████████████████████████████▎       │
+│ February  │      28118 │ ████████████████████████████████████████████▊            │
+│ November  │       7474 │ ███████████▊                                             │
+│ December  │       7223 │ ███████████▌                                             │
+│ October   │       7070 │ ███████████▎                                             │
+│ September │       6910 │ ███████████                                              │
+│ August    │       6801 │ ██████████▊                                              │
+│ June      │       6779 │ ██████████▋                                              │
+│ July      │       6485 │ ██████████▍                                              │
+└───────────┴────────────┴──────────────────────────────────────────────────────────┘
+
+12 rows in set. Elapsed: 0.006 sec. Processed 208.99 thousand rows, 417.99 KB (37.48 million rows/s., 74.96 MB/s.)
+```
+
+### Query 2. Compare total number of complaints by Borough
+
+Query:
+
+```sql
+SELECT
+    borough,
+    count() AS complaints,
+    bar(complaints, 0, 125000, 60)
+FROM NYPD_Complaint
+GROUP BY borough
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d
+
+┌─borough───────┬─complaints─┬─bar(count(), 0, 125000, 60)──┐
+│ BROOKLYN      │      57947 │ ███████████████████████████▋ │
+│ MANHATTAN     │      53025 │ █████████████████████████▍   │
+│ QUEENS        │      44875 │ █████████████████████▌       │
+│ BRONX         │      44260 │ █████████████████████▏       │
+│ STATEN ISLAND │       8503 │ ████                         │
+│ (null)        │        383 │ ▏                            │
+└───────────────┴────────────┴──────────────────────────────┘
+
+6 rows in set. Elapsed: 0.008 sec. Processed 208.99 thousand rows, 209.43 KB (27.14 million rows/s., 27.20 MB/s.)
+```
+
+## Next Steps
+
+[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@ -4,10 +4,9 @@ sidebar_position: 1
 keywords: [clickhouse, install, installation, docs]
 description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture.
 slug: /en/getting-started/install
+title: Installation
 ---

-# Installation
-
 ## System Requirements {#system-requirements}

 ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture.
@ -59,7 +58,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).

@ -106,7 +105,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 Then run these commands to install packages:

@ -221,7 +220,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui
    curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse
    ```

-Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
+Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run the `sudo clickhouse start` command to start the clickhouse-server, and `clickhouse-client` to connect to it.

 Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data.

--- a/docs/en/operations/backup.md
+++ b/docs/en/operations/backup.md
@ -2,10 +2,9 @@
 slug: /en/operations/backup
 sidebar_position: 49
 sidebar_label: Data backup and restore
+title: Data backup and restore
 ---

-# Data backup and restore
-
 While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

 In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
--- a/docs/en/operations/caches.md
+++ b/docs/en/operations/caches.md
@ -20,6 +20,7 @@ Additional cache types:
 - [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
 - [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
 - Schema inference cache.
+- [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks.

 Indirectly used:

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1176,8 +1176,9 @@ Enables the quorum writes.

 -   If `insert_quorum < 2`, the quorum writes are disabled.
 -   If `insert_quorum >= 2`, the quorum writes are enabled.
+-   If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number.

-Default value: 0.
+Default value: 0 - disabled.

 Quorum writes

@ -1259,7 +1260,7 @@ Possible values:

 Default value: 1.

-By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). 
+By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
 For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
 For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@ -112,6 +112,119 @@ Example of disk configuration:
 </clickhouse>
 ```

+## Using local cache {#using-local-cache}
+
+It is possible to configure local cache over disks in storage configuration starting from version 22.3. For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc. Cache uses `LRU` cache policy.
+
+Example of configuration for versions 22.8 or later:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+            </s3>
+            <cache>
+                <type>cache</type>
+                <disk>s3</disk>
+                <path>/s3_cache/</path>
+                <max_size>10000000</max_size>
+            </cache>
+        </disks>
+    </storage_configuration>
+```
+
+Example of configuration for versions earlier than 22.8:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+                <data_cache_enabled>1</data_cache_enabled>
+                <data_cache_size>10000000</data_cache_size>
+            </s3>
+        </disks>
+    </storage_configuration>
+```
+
+Cache **configuration settings**:
+
+- `path` - path to the directory with cache. Default: None, this setting is obligatory.
+
+- `max_size` - maximum size of the cache in bytes. When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory.
+
+- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled).
+
+- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
+
+- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, i.e. the data is cached at the first attempt to read it.
+
+- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`.
+
+- `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb).
+
+- `max_elements` - a limit for a number of cache files. Default: `1048576`.
+
+Cache **query settings**:
+
+- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`.
+
+- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`.
+
+- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on.
+
+- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`.
+
+- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`.
+
+- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written at the cost of evicting previously downloaded (within current query) data, i.e. the second behaviour allows to preserve `least recently used` behaviour while keeping query cache limit.
+
+**Warning**
+Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported.
+
+Cache **system tables**:
+
+- `system.filesystem_cache` - system table which shows the current state of the cache.
+
+- `system.filesystem_cache_log` - system table which shows detailed cache usage per query. Requires `enable_filesystem_cache_log` setting to be `true`.
+
+Cache **commands**:
+
+- `SYSTEM DROP FILESYSTEM CACHE (<path>) (ON CLUSTER)`
+
+- `SHOW CACHES` - show list of caches which were configured on the server.
+
+- `DESCRIBE CACHE '<cache_name>'` - show cache configuration and some general statistics for a specific cache. Cache name can be taken from `SHOW CACHES` command.
+
+Cache current metrics:
+
+- `FilesystemCacheSize`
+
+- `FilesystemCacheElements`
+
+Cache asynchronous metrics:
+
+- `FilesystemCacheBytes`
+
+- `FilesystemCacheFiles`
+
+Cache profile events:
+
+- `CachedReadBufferReadFromSourceBytes`, `CachedReadBufferReadFromCacheBytes`
+
+- `CachedReadBufferReadFromSourceMicroseconds`, `CachedReadBufferReadFromCacheMicroseconds`
+
+- `CachedReadBufferCacheWriteBytes`, `CachedReadBufferCacheWriteMicroseconds`
+
+- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
+
 ## Storing Data on Web Server {#storing-data-on-webserver}

 There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
--- a/docs/en/operations/tips.md
+++ b/docs/en/operations/tips.md
@ -74,13 +74,16 @@ Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is ena

 ## File System {#file-system}

-Ext4 is the most reliable option. Set the mount options `noatime`.
-XFS should be avoided. It works mostly fine but there are some reports about lower performance.
+Ext4 is the most reliable option. Set the mount options `noatime`. XFS works well too.
 Most other file systems should also work fine.

+FAT-32 and exFAT are not supported due to lack of hard links.
+
 Do not use compressed filesystems, because ClickHouse does compression on its own and better.
 It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better.

+While ClickHouse can work over NFS, it is not the best idea.
+
 ## Linux Kernel {#linux-kernel}

 Don’t use an outdated Linux kernel.
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@ -640,7 +640,8 @@ Result:

 ## date\_diff

-Returns the difference between two dates or dates with time values.
+Returns the difference between two dates or dates with time values. 
+The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)).

 **Syntax**

@ -692,6 +693,25 @@ Result:
 └────────────────────────────────────────────────────────────────────────────────────────┘
 ```

+Query:
+
+``` sql
+SELECT
+    toDate('2022-01-01') AS e,
+    toDate('2021-12-29') AS s,
+    dateDiff('day', s, e) AS day_diff,
+    dateDiff('month', s, e) AS month__diff,
+    dateDiff('year', s, e) AS year_diff;
+```
+
+Result:
+
+``` text
+┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐
+│ 2022-01-01 │ 2021-12-29 │        3 │           1 │         1 │
+└────────────┴────────────┴──────────┴─────────────┴───────────┘
+```
+
 ## date\_sub

 Subtracts the time interval or date interval from the provided date or date with time.
--- a/docs/en/sql-reference/statements/alter/delete.md
+++ b/docs/en/sql-reference/statements/alter/delete.md
@ -12,8 +12,9 @@ ALTER TABLE [db.]table [ON CLUSTER cluster] DELETE WHERE filter_expr

 Deletes data matching the specified filtering expression. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).

-:::note    
-The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use.
+
+:::note
+The `ALTER TABLE` prefix makes this syntax different from most other systems supporting SQL. It is intended to signify that unlike similar queries in OLTP databases this is a heavy operation not designed for frequent use.  `ALTER TABLE` is considered a heavyweight operation that requires the underlying data to be merged before it is deleted. For MergeTree tables, consider using the [`DELETE FROM` query](../delete.md), which performs a lightweight delete and can be considerably faster.
 :::

 The `filter_expr` must be of type `UInt8`. The query deletes rows in the table for which this expression takes a non-zero value.
--- a/docs/en/sql-reference/statements/delete.md
+++ b/docs/en/sql-reference/statements/delete.md
@ -0,0 +1,37 @@
+---
+slug: /en/sql-reference/statements/delete
+sidebar_position: 36
+sidebar_label: DELETE
+---
+
+# DELETE Statement
+
+``` sql
+DELETE FROM [db.]table [WHERE expr]
+```
+
+`DELETE FROM` removes rows from table `[db.]table` that match expression `expr`. The deleted rows are marked as deleted immediately and will be automatically filtered out of all subsequent queries. Cleanup of data happens asynchronously in the background. This feature is only available for the MergeTree table engine family.
+
+For example, the following query deletes all rows from the `hits` table where the `Title` column contains the text `hello`:
+
+```sql
+DELETE FROM hits WHERE Title LIKE '%hello%';
+```
+
+Lightweight deletes are asynchronous by default. Set `mutations_sync` equal to 1 to wait for one replica to process the statement, and set `mutations_sync` to 2 to wait for all replicas.
+
+:::note
+This feature is experimental and requires you to set `allow_experimental_lightweight_delete` to true:
+
+```sql
+SET allow_experimental_lightweight_delete = true;
+```
+
+:::
+
+An [alternative way to delete rows](./alter/delete.md) in ClickHouse is `ALTER TABLE ... DELETE`, which might be more efficient if you do bulk deletes only occasionally and don't need the operation to be applied instantly. In most use cases the new lightweight `DELETE FROM` behavior will be considerably faster.
+
+:::warning
+Even though deletes are becoming more lightweight in ClickHouse, they should still not be used as aggressively as on an OLTP system. Lightweight deletes are currently efficient for wide parts, but for compact parts they can be a heavyweight operation, and it may be better to use `ALTER TABLE` for some scenarios.
+:::
+
--- a/docs/en/sql-reference/statements/detach.md
+++ b/docs/en/sql-reference/statements/detach.md
@ -10,7 +10,7 @@ Makes the server "forget" about the existence of a table, a materialized view, o
 **Syntax**

 ``` sql
-DETACH TABLE|VIEW|DICTIONARY [IF EXISTS] [db.]name [ON CLUSTER cluster] [PERMANENTLY]
+DETACH TABLE|VIEW|DICTIONARY [IF EXISTS] [db.]name [ON CLUSTER cluster] [PERMANENTLY] [SYNC]
 ```

 Detaching does not delete the data or metadata of a table, a materialized view or a dictionary. If an entity was not detached `PERMANENTLY`, on the next server launch the server will read the metadata and recall the table/view/dictionary again. If an entity was detached `PERMANENTLY`, there will be no automatic recall.
@ -24,6 +24,8 @@ Note that you can not detach permanently the table which is already detached (te

 Also you can not [DROP](../../sql-reference/statements/drop#drop-table) the detached table, or [CREATE TABLE](../../sql-reference/statements/create/table.md) with the same name as detached permanently, or replace it with the other table with [RENAME TABLE](../../sql-reference/statements/rename.md) query.

+The `SYNC` modifier executes the action without delay.
+
 **Example**

 Creating a table:
--- a/docs/en/sql-reference/statements/drop.md
+++ b/docs/en/sql-reference/statements/drop.md
@ -6,7 +6,7 @@ sidebar_label: DROP

 # DROP Statements

-Deletes existing entity. If the `IF EXISTS` clause is specified, these queries do not return an error if the entity does not exist.
+Deletes existing entity. If the `IF EXISTS` clause is specified, these queries do not return an error if the entity does not exist. If the `SYNC` modifier is specified, the entity is dropped without delay.

 ## DROP DATABASE

@ -15,7 +15,7 @@ Deletes all tables inside the `db` database, then deletes the `db` database itse
 Syntax:

 ``` sql
-DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster]
+DROP DATABASE [IF EXISTS] db [ON CLUSTER cluster] [SYNC]
 ```

 ## DROP TABLE
@ -25,7 +25,7 @@ Deletes the table.
 Syntax:

 ``` sql
-DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster]
+DROP [TEMPORARY] TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] [SYNC]
 ```

 ## DROP DICTIONARY
@ -35,7 +35,7 @@ Deletes the dictionary.
 Syntax:

 ``` sql
-DROP DICTIONARY [IF EXISTS] [db.]name
+DROP DICTIONARY [IF EXISTS] [db.]name [SYNC]
 ```

 ## DROP USER
@ -95,7 +95,7 @@ Deletes a view. Views can be deleted by a `DROP TABLE` command as well but `DROP
 Syntax:

 ``` sql
-DROP VIEW [IF EXISTS] [db.]name [ON CLUSTER cluster]
+DROP VIEW [IF EXISTS] [db.]name [ON CLUSTER cluster] [SYNC]
 ```

 ## DROP FUNCTION
--- a/docs/zh/development/tests.md
+++ b/docs/zh/development/tests.md
@ -1,5 +1,5 @@
 ---
-slug: /en/development/tests
+slug: /zh/development/tests
 sidebar_position: 70
 sidebar_label: Testing
 title: ClickHouse Testing
--- a/src/Backups/BackupEntryWrappedWith.h
+++ b/src/Backups/BackupEntryWrappedWith.h
@ -0,0 +1,37 @@
+#pragma once
+
+#include <Backups/IBackupEntry.h>
+
+
+namespace DB
+{
+
+/// Wraps another backup entry and a value of any type.
+template <typename T>
+class BackupEntryWrappedWith : public IBackupEntry
+{
+public:
+    BackupEntryWrappedWith(BackupEntryPtr entry_, const T & custom_value_) : entry(entry_), custom_value(custom_value_) { }
+    BackupEntryWrappedWith(BackupEntryPtr entry_, T && custom_value_) : entry(entry_), custom_value(std::move(custom_value_)) { }
+    ~BackupEntryWrappedWith() override = default;
+
+    UInt64 getSize() const override { return entry->getSize(); }
+    std::optional<UInt128> getChecksum() const override { return entry->getChecksum(); }
+    std::unique_ptr<SeekableReadBuffer> getReadBuffer() const override { return entry->getReadBuffer(); }
+    String getFilePath() const override { return entry->getFilePath(); }
+    DiskPtr tryGetDiskIfExists() const override { return entry->tryGetDiskIfExists(); }
+    DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); }
+
+private:
+    BackupEntryPtr entry;
+    T custom_value;
+};
+
+template <typename T>
+void wrapBackupEntriesWith(std::vector<std::pair<String, BackupEntryPtr>> & backup_entries, const T & custom_value)
+{
+    for (auto & [_, backup_entry] : backup_entries)
+        backup_entry = std::make_shared<BackupEntryWrappedWith<T>>(std::move(backup_entry), custom_value);
+}
+
+}
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -42,6 +42,14 @@ endif ()
 # See `src/Common/TargetSpecific.h`
 option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)

+if (NO_SSE3_OR_HIGHER)
+    # Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a
+    # result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds
+    # because generic code is compiled with SSE 4.2 anyways. But it breaks SSE2-only builds. Therefore disable the multitarget code
+    # machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE but that impacts performance too much.)
+    set(ENABLE_MULTITARGET_CODE OFF)
+endif()
+
 if (ENABLE_MULTITARGET_CODE)
    add_definitions(-DENABLE_MULTITARGET_CODE=1)
 else()
--- a/src/Client/ConnectionEstablisher.cpp
+++ b/src/Client/ConnectionEstablisher.cpp
@ -16,6 +16,7 @@ namespace ErrorCodes
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int NETWORK_ERROR;
    extern const int SOCKET_TIMEOUT;
+    extern const int DNS_ERROR;
 }

 ConnectionEstablisher::ConnectionEstablisher(
@ -90,6 +91,7 @@ void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
+            && e.code() != ErrorCodes::DNS_ERROR
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;

--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@ -50,7 +50,7 @@ ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr &&
    if (!offsets_concrete)
        throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR);

-    if (!offsets_concrete->empty() && data)
+    if (!offsets_concrete->empty() && data && !data->empty())
    {
        Offset last_offset = offsets_concrete->getData().back();

--- a/src/Common/Config/ConfigProcessor.cpp
+++ b/src/Common/Config/ConfigProcessor.cpp
@ -13,6 +13,7 @@
 #include <Poco/DOM/Text.h>
 #include <Poco/DOM/Attr.h>
 #include <Poco/DOM/Comment.h>
+#include <Poco/XML/XMLWriter.h>
 #include <Poco/Util/XMLConfiguration.h>
 #include <Common/ZooKeeper/ZooKeeperNodeCache.h>
 #include <Common/ZooKeeper/KeeperException.h>
@ -729,7 +730,11 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config,
            if (!preprocessed_path_parent.empty())
                fs::create_directories(preprocessed_path_parent);
        }
-        DOMWriter().writeNode(preprocessed_path, loaded_config.preprocessed_xml);
+        DOMWriter writer;
+        writer.setNewLine("\n");
+        writer.setIndent("    ");
+        writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT);
+        writer.writeNode(preprocessed_path, loaded_config.preprocessed_xml);
        LOG_DEBUG(log, "Saved preprocessed configuration to '{}'.", preprocessed_path);
    }
    catch (Poco::Exception & e)
--- a/src/Common/Config/YAMLParser.cpp
+++ b/src/Common/Config/YAMLParser.cpp
@ -26,114 +26,107 @@ namespace ErrorCodes
    extern const int CANNOT_PARSE_YAML;
 }

-/// A prefix symbol in yaml key
-/// We add attributes to nodes by using a prefix symbol in the key part.
-/// Currently we use @ as a prefix symbol. Note, that @ is reserved
-/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
-const char YAML_ATTRIBUTE_PREFIX = '@';
-
 namespace
 {
+    /// A prefix symbol in yaml key
+    /// We add attributes to nodes by using a prefix symbol in the key part.
+    /// Currently we use @ as a prefix symbol. Note, that @ is reserved
+    /// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
+    const char YAML_ATTRIBUTE_PREFIX = '@';

-Poco::AutoPtr<Poco::XML::Element> createCloneNode(Poco::XML::Element & original_node)
-{
-    Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
-    original_node.parentNode()->appendChild(clone_node);
-    return clone_node;
-}
-
-void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_element)
-{
-    auto * xml_document = parent_xml_element.ownerDocument();
-    switch (node.Type())
+    Poco::AutoPtr<Poco::XML::Element> cloneXMLNode(const Poco::XML::Element & original_node)
    {
-        case YAML::NodeType::Scalar:
-        {
-            std::string value = node.as<std::string>();
-            Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
-            parent_xml_element.appendChild(xml_value);
-            break;
-        }
+        Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
+        original_node.parentNode()->appendChild(clone_node);
+        return clone_node;
+    }

-        /// We process YAML Sequences as a
-        /// list of <key>value</key> tags with same key and different values.
-        /// For example, we translate this sequence
-        /// seq:
-        ///     - val1
-        ///     - val2
-        ///
-        /// into this:
-        /// <seq>val1</seq>
-        /// <seq>val2</seq>
-        case YAML::NodeType::Sequence:
+    void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_node)
+    {
+        auto * xml_document = parent_xml_node.ownerDocument();
+        switch (node.Type())
        {
-            for (const auto & child_node : node)
-                /// For sequences it depends how we want to process them.
-                /// Sequences of key-value pairs such as:
-                /// seq:
-                ///     - k1: val1
-                ///     - k2: val2
-                /// into xml like this:
-                /// <seq>
-                ///     <k1>val1</k1>
-                ///     <k2>val2</k2>
-                /// </seq>
-                ///
-                /// But, if the sequence is just a list, the root-node needs to be repeated, such as:
-                /// seq:
-                ///     - val1
-                ///     - val2
-                /// into xml like this:
-                /// <seq>val1</seq>
-                /// <seq>val2</seq>
-                ///
-                /// Therefore check what type the child is, for further processing.
-                /// Mixing types (values list or map) will lead to strange results but should not happen.
-                if (parent_xml_element.hasChildNodes() && !child_node.IsMap())
-                {
-                    /// Create a new parent node with same tag for each child node
-                    processNode(child_node, *createCloneNode(parent_xml_element));
-                }
-                else
-                {
-                    /// Map, so don't recreate the parent node but add directly
-                    processNode(child_node, parent_xml_element);
-                }
-            break;
-        }
-        case YAML::NodeType::Map:
-        {
-            for (const auto & key_value_pair : node)
+            case YAML::NodeType::Scalar:
            {
-                const auto & key_node = key_value_pair.first;
-                const auto & value_node = key_value_pair.second;
-                std::string key = key_node.as<std::string>();
-                bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
-                if (is_attribute)
-                {
-                    /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
-                    auto attribute_name = key.substr(1);
-                    std::string value = value_node.as<std::string>();
-                    parent_xml_element.setAttribute(attribute_name, value);
-                }
-                else
-                {
-                    Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
-                    parent_xml_element.appendChild(xml_key);
-                    processNode(value_node, *xml_key);
-                }
+                std::string value = node.as<std::string>();
+                Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
+                parent_xml_node.appendChild(xml_value);
+                break;
+            }
+
+            /// For sequences we repeat the parent xml node. For example,
+            /// seq:
+            ///     - val1
+            ///     - val2
+            /// is converted into the following xml:
+            /// <seq>val1</seq>
+            /// <seq>val2</seq>
+            ///
+            /// A sequence of mappings is converted in the same way:
+            /// seq:
+            ///     - k1: val1
+            ///       k2: val2
+            ///     - k3: val3
+            /// is converted into the following xml:
+            /// <seq><k1>val1</k1><k2>val2</k2></seq>
+            /// <seq><k3>val3</k3></seq>
+            case YAML::NodeType::Sequence:
+            {
+                size_t i = 0;
+                for (auto it = node.begin(); it != node.end(); ++it, ++i)
+                {
+                    const auto & child_node = *it;
+
+                    bool need_clone_parent_xml_node = (i > 0);
+
+                    if (need_clone_parent_xml_node)
+                    {
+                        /// Create a new parent node with same tag for each child node
+                        processNode(child_node, *cloneXMLNode(parent_xml_node));
+                    }
+                    else
+                    {
+                        /// First element of the sequence: reuse the existing parent node instead of cloning it
+                        processNode(child_node, parent_xml_node);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Map:
+            {
+                for (const auto & key_value_pair : node)
+                {
+                    const auto & key_node = key_value_pair.first;
+                    const auto & value_node = key_value_pair.second;
+                    std::string key = key_node.as<std::string>();
+                    bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
+                    if (is_attribute)
+                    {
+                        /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
+                        auto attribute_name = key.substr(1);
+                        std::string value = value_node.as<std::string>();
+                        parent_xml_node.setAttribute(attribute_name, value);
+                    }
+                    else
+                    {
+                        Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
+                        parent_xml_node.appendChild(xml_key);
+                        processNode(value_node, *xml_key);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Null: break;
+            case YAML::NodeType::Undefined:
+            {
+                throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
            }
-            break;
-        }
-        case YAML::NodeType::Null: break;
-        case YAML::NodeType::Undefined:
-        {
-            throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
        }
    }
 }

-}

 Poco::AutoPtr<Poco::XML::Document> YAMLParser::parse(const String& path)
 {
--- a/src/Common/Elf.cpp
+++ b/src/Common/Elf.cpp
@ -22,13 +22,13 @@ Elf::Elf(const std::string & path)
    /// Check if it's an elf.
    elf_size = in.buffer().size();
    if (elf_size < sizeof(ElfEhdr))
-        throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path);

    mapped = in.buffer().begin();
    header = reinterpret_cast<const ElfEhdr *>(mapped);

    if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0)
-        throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path);

    /// Get section header.
    ElfOff section_header_offset = header->e_shoff;
@ -37,7 +37,7 @@ Elf::Elf(const std::string & path)
    if (!section_header_offset
        || !section_header_num_entries
        || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size)
-        throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path);

    section_headers = reinterpret_cast<const ElfShdr *>(mapped + section_header_offset);

@ -48,11 +48,11 @@ Elf::Elf(const std::string & path)
    });

    if (!section_names_strtab)
-        throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path);

    ElfOff section_names_offset = section_names_strtab->header.sh_offset;
    if (section_names_offset >= elf_size)
-        throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path);

    section_names = reinterpret_cast<const char *>(mapped + section_names_offset);

@ -64,7 +64,7 @@ Elf::Elf(const std::string & path)
    if (!program_header_offset
        || !program_header_num_entries
        || program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size)
-        throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path);

    program_headers = reinterpret_cast<const ElfPhdr *>(mapped + program_header_offset);
 }
--- a/src/Common/FieldVisitorToString.cpp
+++ b/src/Common/FieldVisitorToString.cpp
@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const

 }

+/// Convert the value stored in `field` to a string.
+String convertFieldToString(const Field & field)
+{
+    /// A String field is returned directly, bypassing FieldVisitorToString.
+    if (field.getType() == Field::Types::Which::String)
+        return field.get<String>();
+    /// Every other field type is formatted by FieldVisitorToString.
+    return applyVisitor(FieldVisitorToString(), field);
 }

+}
--- a/src/Common/FieldVisitorToString.h
+++ b/src/Common/FieldVisitorToString.h
@ -31,5 +31,8 @@ public:
    String operator() (const bool & x) const;
 };

-}
+/// Convert the value stored in a field to a string.
+/// Unlike FieldVisitorToString, a String field is returned as is, without quotes.
+String convertFieldToString(const Field & field);

+}
--- a/src/Common/OpenTelemetryTraceContext.cpp
+++ b/src/Common/OpenTelemetryTraceContext.cpp
@ -88,7 +88,13 @@ void Span::addAttribute(std::exception_ptr e) noexcept

 SpanHolder::SpanHolder(std::string_view _operation_name)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    if (!current_thread_trace_context.isTraceEnabled())
+    {
+        return;
+    }
+
+    /// Use try-catch to make sure the ctor is exception safe.
+    try
    {
        this->trace_id = current_thread_trace_context.trace_id;
        this->parent_span_id = current_thread_trace_context.span_id;
@ -97,9 +103,19 @@ SpanHolder::SpanHolder(std::string_view _operation_name)
        this->start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        // set current span id to this
-        current_thread_trace_context.span_id = this->span_id;
+        /// Add new initialization here
    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the span won't be recorded.
+        this->trace_id = UUID();
+        return;
+    }
+
+    /// Set current span as parent of other spans created later on this thread.
+    current_thread_trace_context.span_id = this->span_id;
 }

 void SpanHolder::finish() noexcept
@ -216,7 +232,7 @@ const TracingContextOnThread & CurrentContext()
    return current_thread_trace_context;
 }

-void TracingContextOnThread::reset()
+void TracingContextOnThread::reset() noexcept
 {
    this->trace_id = UUID();
    this->span_id = 0;
@ -231,63 +247,75 @@ TracingContextHolder::TracingContextHolder(
    const Settings * settings_ptr,
    const std::weak_ptr<OpenTelemetrySpanLog> & _span_log)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    /// Use try-catch to make sure the ctor is exception safe.
+    /// If any exception is raised during the construction, the tracing is not enabled on current thread.
+    try
    {
-        ///
-        /// This is not the normal case,
-        /// it means that construction of current object is not at the start of current thread.
-        /// Usually this is due to:
-        ///    1. bad design
-        ///    2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
-        ///
-        /// In such case, we should use current context as parent of this new constructing object,
-        /// So this branch ensures this class can be instantiated multiple times on one same thread safely.
-        ///
-        this->is_context_owner = false;
-        this->root_span.trace_id = current_thread_trace_context.trace_id;
-        this->root_span.parent_span_id = current_thread_trace_context.span_id;
+        if (current_thread_trace_context.isTraceEnabled())
+        {
+            ///
+            /// This is not the normal case,
+            /// it means that construction of current object is not at the start of current thread.
+            /// Usually this is due to:
+            ///    1. bad design
+            ///    2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
+            ///
+            /// In such case, we should use current context as parent of this new constructing object,
+            /// So this branch ensures this class can be instantiated multiple times on one same thread safely.
+            ///
+            this->is_context_owner = false;
+            this->root_span.trace_id = current_thread_trace_context.trace_id;
+            this->root_span.parent_span_id = current_thread_trace_context.span_id;
+            this->root_span.span_id = thread_local_rng();
+            this->root_span.operation_name = _operation_name;
+            this->root_span.start_time_us
+                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+
+            /// Set the root span as parent of other spans created on current thread
+            current_thread_trace_context.span_id = this->root_span.span_id;
+            return;
+        }
+
+        if (!_parent_trace_context.isTraceEnabled())
+        {
+            if (settings_ptr == nullptr)
+                /// Skip tracing context initialization on current thread
+                return;
+
+            // Start the trace with some configurable probability.
+            std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
+            if (!should_start_trace(thread_local_rng))
+                /// skip tracing context initialization on current thread
+                return;
+
+            while (_parent_trace_context.trace_id == UUID())
+            {
+                // Make sure the random generated trace_id is not 0 which is an invalid id.
+                _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
+                _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
+            }
+            _parent_trace_context.span_id = 0;
+        }
+
+        this->root_span.trace_id = _parent_trace_context.trace_id;
+        this->root_span.parent_span_id = _parent_trace_context.span_id;
        this->root_span.span_id = thread_local_rng();
        this->root_span.operation_name = _operation_name;
        this->root_span.start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        current_thread_trace_context.span_id = this->root_span.span_id;
+        /// Add new initialization here
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the tracing is not enabled.
+        this->root_span.trace_id = UUID();
        return;
    }

-    if (!_parent_trace_context.isTraceEnabled())
-    {
-        if (settings_ptr == nullptr)
-            /// skip tracing context initialization on current thread
-            return;
-
-        // start the trace ourselves, with some configurable probability.
-        std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
-        if (!should_start_trace(thread_local_rng))
-            /// skip tracing context initialization on current thread
-            return;
-
-        while (_parent_trace_context.trace_id == UUID())
-        {
-            // make sure the random generated trace_id is not 0 which is an invalid id
-            _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
-            _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
-        }
-        _parent_trace_context.span_id = 0;
-    }
-
-    this->root_span.trace_id = _parent_trace_context.trace_id;
-    this->root_span.parent_span_id = _parent_trace_context.span_id;
-    this->root_span.span_id = thread_local_rng();
-    this->root_span.operation_name = _operation_name;
-    this->root_span.start_time_us
-        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-
-    /// This object is created to initialize tracing context on a new thread,
-    /// it's helpful to record the thread_id so that we know the thread switching from the span log
-    this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
-
-    /// set up trace context on current thread
+    /// Set up trace context on current thread only when the root span is successfully initialized.
    current_thread_trace_context = _parent_trace_context;
    current_thread_trace_context.span_id = this->root_span.span_id;
    current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED;
@ -306,6 +334,18 @@ TracingContextHolder::~TracingContextHolder()
        auto shared_span_log = current_thread_trace_context.span_log.lock();
        if (shared_span_log)
        {
+            try
+            {
+                /// This object is created to initialize tracing context on a new thread,
+                /// it's helpful to record the thread_id so that we know the thread switching from the span log
+                this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
+            }
+            catch (...)
+            {
+                /// It's acceptable that the attribute is not recorded in case of any exception,
+                /// so the exception is ignored to try to log the span.
+            }
+
            this->root_span.finish_time_us
                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

--- a/src/Common/OpenTelemetryTraceContext.h
+++ b/src/Common/OpenTelemetryTraceContext.h
@ -74,7 +74,7 @@ struct TracingContextOnThread : TracingContext
        return *this;
    }

-    void reset();
+    void reset() noexcept;

    /// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table
    /// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -286,6 +286,18 @@ The server successfully detected this situation and will download merged part fr
    M(S3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to S3 storage.") \
    M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \
    \
+    M(DiskS3ReadMicroseconds, "Time of GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsCount, "Number of GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to DiskS3 storage.") \
+    \
+    M(DiskS3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    \
    M(ReadBufferFromS3Microseconds, "Time spend in reading from S3.") \
    M(ReadBufferFromS3Bytes, "Bytes read from S3.") \
    M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.") \
--- a/src/Common/SymbolIndex.cpp
+++ b/src/Common/SymbolIndex.cpp
@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl
 It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler.
 It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes)
 it is split to separate binary and provided in clickhouse-common-static-dbg package.
-This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line.
+This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line.
 When you build ClickHouse by yourself, debug info is not split and present in a single huge binary.

 What ClickHouse is using to provide good stack traces?
@ -391,10 +391,22 @@ void collectSymbolsFromELF(
    std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem();
    local_debug_info_path += ".debug";
    std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
+    debug_info_path += ".debug";

-    if (std::filesystem::exists(local_debug_info_path))
+    /// NOTE: This is a workaround for current package system.
+    ///
+    /// Since nfpm cannot copy file only if it exists,
+    /// and so in cmake empty .debug file is created instead,
+    /// but if we will try to load empty Elf file, then the CANNOT_PARSE_ELF
+    /// exception will be thrown from the Elf::Elf.
+    auto exists_not_empty = [](const std::filesystem::path & path)
+    {
+        return std::filesystem::exists(path) && !std::filesystem::is_empty(path);
+    };
+
+    if (exists_not_empty(local_debug_info_path))
        object_name = local_debug_info_path;
-    else if (std::filesystem::exists(debug_info_path))
+    else if (exists_not_empty(debug_info_path))
        object_name = debug_info_path;
    else if (build_id.size() >= 2)
    {
@ -412,7 +424,7 @@ void collectSymbolsFromELF(

        std::filesystem::path build_id_debug_info_path(
            fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2)));
-        if (std::filesystem::exists(build_id_debug_info_path))
+        if (exists_not_empty(build_id_debug_info_path))
            object_name = build_id_debug_info_path;
        else
            object_name = canonical_path;
--- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp
@ -898,4 +898,25 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
    registerZooKeeperRequest<OpNum::FilteredList, ZooKeeperFilteredListRequest>(*this);
 }

+/// Compare `path` against `match_to`, treating both as slash-separated paths.
+/// A single trailing '/' is ignored on either argument.
+/// Returns EXACT if the paths are equal, IS_CHILD if `path` continues past `match_to` with a '/',
+/// and NOT_MATCH otherwise.
+PathMatchResult matchPath(std::string_view path, std::string_view match_to)
+{
+    using enum PathMatchResult;
+
+    /// Both arguments are taken by value, so trimming affects only the local views.
+    if (path.ends_with('/'))
+        path.remove_suffix(1);
+
+    /// A `match_to` of "/" becomes empty here, so any longer path that starts with '/' is reported as IS_CHILD.
+    if (match_to.ends_with('/'))
+        match_to.remove_suffix(1);
+
+    /// Find the first position where the two paths differ.
+    auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
+
+    /// `match_to` was not fully consumed, so it is not a prefix of `path`.
+    if (second_it != match_to.end())
+        return NOT_MATCH;
+
+    if (first_it == path.end())
+        return EXACT;
+
+    /// The remainder must start a new path component; otherwise the names only share a prefix (e.g. "/keeper1" vs "/keeper").
+    return *first_it == '/' ? IS_CHILD : NOT_MATCH;
+}
+
 }
--- a/src/Common/ZooKeeper/ZooKeeperCommon.h
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.h
@ -554,4 +554,13 @@ private:
    ZooKeeperRequestFactory();
 };

+/// Result of comparing a path with a reference path in matchPath().
+enum class PathMatchResult
+{
+    NOT_MATCH, /// The path is neither equal to the reference path nor located below it.
+    EXACT,     /// The path is equal to the reference path.
+    IS_CHILD   /// The path is located below the reference path (at any depth).
+};
+
+PathMatchResult matchPath(std::string_view path, std::string_view match_to);
+
 }
--- a/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp
+++ b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp
@ -0,0 +1,15 @@
+#include <gtest/gtest.h>
+
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
+
+/// Checks matchPath() with and without a trailing slash on either argument, including the root path "/".
+TEST(ZooKeeperTest, TestMatchPath)
+{
+    using namespace Coordination;
+
+    ASSERT_EQ(matchPath("/path/file", "/path"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/path/file", "/path/"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/path/file", "/"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/", "/"), PathMatchResult::EXACT);
+    ASSERT_EQ(matchPath("/path", "/path/"), PathMatchResult::EXACT);
+    ASSERT_EQ(matchPath("/path/", "/path"), PathMatchResult::EXACT);
+}
--- a/src/Common/tests/gtest_merge_configs.cpp
+++ b/src/Common/tests/gtest_merge_configs.cpp
@ -43,11 +43,8 @@ clickhouse:
    text_log:
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
@ -112,11 +109,8 @@ clickhouse:
    text_log :
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
--- a/src/Common/tests/gtest_yaml_parser.cpp
+++ b/src/Common/tests/gtest_yaml_parser.cpp
@ -13,40 +13,12 @@

 using namespace DB;

-TEST(Common, YamlParserInvalidFile)
+TEST(YamlParser, InvalidFile)
 {
    ASSERT_THROW(YAMLParser::parse("some-non-existing-file.yaml"), Exception);
 }

-TEST(Common, YamlParserProcessKeysList)
-{
-    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
-operator:
-    access_management: "1"
-    networks:
-      - ip: "10.1.6.168"
-      - ip: "::1"
-      - ip: "127.0.0.1"
-)YAML");
-    SCOPE_EXIT({ yaml_file->remove(); });
-
-    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
-    auto *p_node = xml->getNodeByPath("/clickhouse");
-    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
-<operator>
-<access_management>1</access_management>
-<networks>
-<ip>10.1.6.168</ip>
-<ip>::1</ip>
-<ip>127.0.0.1</ip>
-</networks>
-</operator>
-</clickhouse>
-)CONFIG");
-
-}
-
-TEST(Common, YamlParserProcessValuesList)
+TEST(YamlParser, ProcessValuesList)
 {
    auto yaml_file = getFileWithContents("values-list.yaml", R"YAML(
 rules:
@ -75,4 +47,141 @@ rules:
 )CONFIG");

 }
+
+TEST(YamlParser, ProcessKeysList)
+{
+    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
+operator:
+    access_management: 1
+    networks:
+        ip:
+          - 10.1.6.168
+          - ::1
+          - 127.0.0.1
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<operator>
+<access_management>1</access_management>
+<networks>
+<ip>10.1.6.168</ip>
+<ip>::1</ip>
+<ip>127.0.0.1</ip>
+</networks>
+</operator>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ProcessListAttributes)
+{
+    auto yaml_file = getFileWithContents("list_attributes.yaml", R"YAML(
+seq:
+  - "@attr1": x
+  - k1: val1
+    k2: val2
+    "@attr2": y
+  - k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<seq attr1="x"></seq>
+<seq attr2="y">
+<k1>val1</k1>
+<k2>val2</k2>
+</seq>
+<seq attr3="z">
+<k3>val3</k3>
+</seq>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ProcessMapAttributes)
+{
+    auto yaml_file = getFileWithContents("map_attributes.yaml", R"YAML(
+map:
+    "@attr1": x
+    k1: val1
+    k2: val2
+    "@attr2": y
+    k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<map attr1="x" attr2="y" attr3="z">
+<k1>val1</k1>
+<k2>val2</k2>
+<k3>val3</k3>
+</map>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ClusterDef)
+{
+    auto yaml_file = getFileWithContents("cluster_def.yaml", R"YAML(
+test_cluster:
+    shard:
+        - internal_replication: false
+          replica:
+              - host: 127.0.0.1
+                port: 9000
+              - host: 127.0.0.2
+                port: 9000
+        - internal_replication: true
+          replica:
+              - host: 127.0.0.3
+                port: 9000
+              - host: 127.0.0.4
+                port: 9000
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<test_cluster>
+<shard>
+<internal_replication>false</internal_replication>
+<replica>
+<host>127.0.0.1</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.2</host>
+<port>9000</port>
+</replica>
+</shard>
+<shard>
+<internal_replication>true</internal_replication>
+<replica>
+<host>127.0.0.3</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.4</host>
+<port>9000</port>
+</replica>
+</shard>
+</test_cluster>
+</clickhouse>
+)CONFIG");
+
+}
+
 #endif
--- a/src/Compression/CompressionFactoryAdditions.cpp
+++ b/src/Compression/CompressionFactoryAdditions.cpp
@ -116,8 +116,8 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST(
                        }
                    };

-                    ISerialization::SubstreamPath path;
-                    column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type);
+                    auto serialization = column_type->getDefaultSerialization();
+                    serialization->enumerateStreams(callback, column_type);

                    if (!result_codec)
                        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName());
--- a/src/Compression/LZ4_decompress_faster.cpp
+++ b/src/Compression/LZ4_decompress_faster.cpp
@ -478,11 +478,7 @@ template <> void inline copyOverlap<32, true>(UInt8 * op, const UInt8 *& match,
 /// See also https://stackoverflow.com/a/30669632

 template <size_t copy_amount, bool use_shuffle>
-bool NO_INLINE decompressImpl(
-     const char * const source,
-     char * const dest,
-     size_t source_size,
-     size_t dest_size)
+bool NO_INLINE decompressImpl(const char * const source, char * const dest, size_t source_size, size_t dest_size)
 {
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
@ -515,6 +511,18 @@ bool NO_INLINE decompressImpl(

        const unsigned token = *ip++;
        length = token >> 4;
+
+        UInt8 * copy_end;
+        size_t real_length;
+
+        /// A zero literal length might occur fairly often for well-compressed columns.
+        /// At the same time, this check may hurt performance in other cases because the condition is hard to predict (especially if the share of zeros is ~50%).
+        /// In such cases this `if` will significantly increase the number of mispredicted branches. But it seems to result in a
+        /// noticeable slowdown only for implementations with `copy_amount` > 8, probably because they use heavier instructions.
+        if constexpr (copy_amount == 8)
+            if (length == 0)
+                goto decompress_match;
+
        if (length == 0x0F)
        {
            if (unlikely(ip + 1 >= input_end))
@ -524,7 +532,7 @@ bool NO_INLINE decompressImpl(

        /// Copy literals.

-        UInt8 * copy_end = op + length;
+        copy_end = op + length;

        /// input: Hello, world
        ///        ^-ip
@ -541,7 +549,7 @@ bool NO_INLINE decompressImpl(
            return false;

        // Due to implementation specifics the copy length is always a multiple of copy_amount
-        size_t real_length = 0;
+        real_length = 0;

        static_assert(copy_amount == 8 || copy_amount == 16 || copy_amount == 32);
        if constexpr (copy_amount == 8)
@ -552,9 +560,9 @@ bool NO_INLINE decompressImpl(
            real_length = (((length >> 5) + 1) * 32);

        if (unlikely(ip + real_length >= input_end + ADDITIONAL_BYTES_AT_END_OF_BUFFER))
-             return false;
+            return false;

-        wildCopy<copy_amount>(op, ip, copy_end);    /// Here we can write up to copy_amount - 1 bytes after buffer.
+        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after buffer.

        if (copy_end == output_end)
            return true;
@ -562,6 +570,8 @@ bool NO_INLINE decompressImpl(
        ip += length;
        op = copy_end;

+    decompress_match:
+
        if (unlikely(ip + 1 >= input_end))
            return false;

--- a/src/Coordination/KeeperSnapshotManager.cpp
+++ b/src/Coordination/KeeperSnapshotManager.cpp
@ -13,8 +13,10 @@
 #include <filesystem>
 #include <memory>
 #include <Common/logger_useful.h>
-#include "Coordination/KeeperContext.h"
+#include <Coordination/KeeperContext.h>
 #include <Coordination/KeeperConstants.h>
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
+

 namespace DB
 {
@ -146,33 +148,6 @@ namespace
    }
 }

-namespace
-{
-
-enum class PathMatchResult
-{
-    NOT_MATCH,
-    EXACT,
-    IS_CHILD
-};
-
-PathMatchResult matchPath(const std::string_view path, const std::string_view match_to)
-{
-    using enum PathMatchResult;
-
-    auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
-
-    if (second_it != match_to.end())
-        return NOT_MATCH;
-
-    if (first_it == path.end())
-        return EXACT;
-
-    return *first_it == '/' ? IS_CHILD : NOT_MATCH;
-}
-
-}
-
 void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
 {
    writeBinary(static_cast<uint8_t>(snapshot.version), out);
@ -217,7 +192,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
        const auto & path = it->key;

        // write only the root system path because of digest
-        if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD)
+        if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD)
        {
            ++it;
            continue;
@ -365,8 +340,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
        KeeperStorage::Node node{};
        readNode(node, in, current_version, storage.acl_map);

-        using enum PathMatchResult;
-        auto match_result = matchPath(path, keeper_system_path);
+        using enum Coordination::PathMatchResult;
+        auto match_result = Coordination::matchPath(path, keeper_system_path);

        const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
        if (match_result == IS_CHILD)
--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
@ -879,7 +879,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
            path_created += seq_num_str.str();
        }

-        if (path_created.starts_with(keeper_system_path))
+        if (Coordination::matchPath(path_created, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created);

@ -1049,7 +1049,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr

        std::vector<KeeperStorage::Delta> new_deltas;

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path);

@ -1203,7 +1203,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce

        std::vector<KeeperStorage::Delta> new_deltas;

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);

@ -1472,7 +1472,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
    {
        Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);

--- a/src/Coordination/tests/gtest_coordination.cpp
+++ b/src/Coordination/tests/gtest_coordination.cpp
@ -2141,6 +2141,38 @@ TEST_P(CoordinationTest, TestCurrentApiVersion)
    EXPECT_EQ(keeper_version, static_cast<uint8_t>(current_keeper_api_version));
 }

+TEST_P(CoordinationTest, TestSystemNodeModify)
+{
+    using namespace Coordination;
+    int64_t zxid{0};
+
+    // On INIT we abort when a system path is modified, so the storage is run in the RUNNING phase,
+    // where such requests are expected to be answered with an error code instead.
+    keeper_context->server_state = KeeperContext::Phase::RUNNING;
+    KeeperStorage storage{500, "", keeper_context};
+    // Sends a create request for `path` and checks the error code of the first response.
+    const auto assert_create = [&](const std::string_view path, const auto expected_code)
+    {
+        auto request = std::make_shared<ZooKeeperCreateRequest>();
+        request->path = path;
+        storage.preprocessRequest(request, 0, 0, zxid);
+        auto responses = storage.processRequest(request, 0, zxid);
+        ASSERT_FALSE(responses.empty());
+
+        const auto & response = responses[0];
+        ASSERT_EQ(response.response->error, expected_code) << "Unexpected error for path " << path;
+
+        // Every request gets its own zxid.
+        ++zxid;
+    };
+
+    // These paths are expected to be treated as system paths and rejected.
+    assert_create("/keeper", Error::ZBADARGUMENTS);
+    assert_create("/keeper/with_child", Error::ZBADARGUMENTS);
+    assert_create(DB::keeper_api_version_path, Error::ZBADARGUMENTS);
+
+    // Paths that only look similar to "/keeper" are regular nodes and must be created successfully.
+    assert_create("/keeper_map", Error::ZOK);
+    assert_create("/keeper1", Error::ZOK);
+    assert_create("/keepe", Error::ZOK);
+    assert_create("/keeper1/test", Error::ZOK);
+}
+
 INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
    CoordinationTest,
    ::testing::ValuesIn(std::initializer_list<CompressionParam>{
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -213,7 +213,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    \
    M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \
    \
-    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \
+    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \
    M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \
    M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \
    M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \
--- a/src/DataTypes/IDataType.cpp
+++ b/src/DataTypes/IDataType.cpp
@ -84,18 +84,20 @@ void IDataType::forEachSubcolumn(
    {
        for (size_t i = 0; i < subpath.size(); ++i)
        {
-            if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1))
+            size_t prefix_len = i + 1;
+            if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len))
            {
-                auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1);
-                auto subdata = ISerialization::createFromPath(subpath, i);
+                auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len);
+                auto subdata = ISerialization::createFromPath(subpath, prefix_len);
                callback(subpath, name, subdata);
            }
            subpath[i].visited = true;
        }
    };

-    SubstreamPath path;
-    data.serialization->enumerateStreams(path, callback_with_data, data);
+    ISerialization::EnumerateStreamsSettings settings;
+    settings.position_independent_encoding = false;
+    data.serialization->enumerateStreams(settings, callback_with_data, data);
 }

 template <typename Ptr>
@ -118,33 +120,38 @@ Ptr IDataType::getForSubcolumn(
    return res;
 }

+/// Tells whether this type has a subcolumn called `subcolumn_name`.
+/// The answer is derived from tryGetSubcolumnType: a null result means "no such subcolumn".
+bool IDataType::hasSubcolumn(const String & subcolumn_name) const
+{
+    const DataTypePtr subcolumn_type = tryGetSubcolumnType(subcolumn_name);
+    return subcolumn_type != nullptr;
+}
+
 DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const
 {
-    SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
+    auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
    return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, false);
 }

 DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const
 {
-    SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr };
+    auto data = SubstreamData(getDefaultSerialization()).withType(getPtr());
    return getForSubcolumn<DataTypePtr>(subcolumn_name, data, &SubstreamData::type, true);
 }

 ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
 {
-    SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
+    auto data = SubstreamData(getDefaultSerialization()).withColumn(column);
    return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, false);
 }

 ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const
 {
-    SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr };
+    auto data = SubstreamData(getDefaultSerialization()).withColumn(column);
    return getForSubcolumn<ColumnPtr>(subcolumn_name, data, &SubstreamData::column, true);
 }

 SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const
 {
-    SubstreamData data = { serialization, nullptr, nullptr, nullptr };
+    auto data = SubstreamData(serialization);
    return getForSubcolumn<SerializationPtr>(subcolumn_name, data, &SubstreamData::serialization, true);
 }

@ -154,7 +161,7 @@ Names IDataType::getSubcolumnNames() const
    forEachSubcolumn([&](const auto &, const auto & name, const auto &)
    {
        res.push_back(name);
-    }, { getDefaultSerialization(), nullptr, nullptr, nullptr });
+    }, SubstreamData(getDefaultSerialization()));
    return res;
 }

--- a/src/DataTypes/IDataType.h
+++ b/src/DataTypes/IDataType.h
@ -79,6 +79,8 @@ public:
    /// Data type id. It's used for runtime type checks.
    virtual TypeIndex getTypeId() const = 0;

+    bool hasSubcolumn(const String & subcolumn_name) const;
+
    DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const;
    DataTypePtr getSubcolumnType(const String & subcolumn_name) const;

--- a/src/DataTypes/Serializations/ISerialization.cpp
+++ b/src/DataTypes/Serializations/ISerialization.cpp
@ -73,24 +73,24 @@ String ISerialization::SubstreamPath::toString() const
 }

 void ISerialization::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
-    path.push_back(Substream::Regular);
-    path.back().data = data;
-    callback(path);
-    path.pop_back();
+    settings.path.push_back(Substream::Regular);
+    settings.path.back().data = data;
+    callback(settings.path);
+    settings.path.pop_back();
 }

-void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const
+void ISerialization::enumerateStreams(
+    const StreamCallback & callback,
+    const DataTypePtr & type,
+    const ColumnPtr & column) const
 {
-    enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr});
-}
-
-void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const
-{
-    enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr});
+    EnumerateStreamsSettings settings;
+    auto data = SubstreamData(getPtr()).withType(type).withColumn(column);
+    enumerateStreams(settings, callback, data);
 }

 void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const
@ -184,7 +184,7 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons
    return getFileNameForStream(column.getNameInStorage(), path);
 }

-static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path)
+bool isOffsetsOfNested(const ISerialization::SubstreamPath & path)
 {
    if (path.empty())
        return false;
@ -287,10 +287,13 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref

 ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)
 {
-    assert(prefix_len < path.size());
+    assert(prefix_len <= path.size());
+    if (prefix_len == 0)
+        return {};

-    SubstreamData res = path[prefix_len].data;
-    for (ssize_t i = static_cast<ssize_t>(prefix_len) - 1; i >= 0; --i)
+    ssize_t last_elem = prefix_len - 1;
+    auto res = path[last_elem].data;
+    for (ssize_t i = last_elem - 1; i >= 0; --i)
    {
        const auto & creator = path[i].creator;
        if (creator)
--- a/src/DataTypes/Serializations/ISerialization.h
+++ b/src/DataTypes/Serializations/ISerialization.h
@ -101,6 +101,30 @@ public:

    struct SubstreamData
    {
+        /// Default-constructs every member.
+        SubstreamData() = default;
+
+        /// Creates data that carries only the serialization. The other members can be
+        /// attached afterwards with the chainable with*() setters below.
+        SubstreamData(SerializationPtr serialization_) : serialization(std::move(serialization_)) {}
+
+        /// Chainable setters. Each one moves its argument into the member of the same name
+        /// and returns *this, so that calls can be combined, e.g.
+        /// SubstreamData(serialization).withType(type).withColumn(column).
+
+        /// Sets `type`.
+        SubstreamData & withType(DataTypePtr type_)
+        {
+            type = std::move(type_);
+            return *this;
+        }
+
+        /// Sets `column`.
+        SubstreamData & withColumn(ColumnPtr column_)
+        {
+            column = std::move(column_);
+            return *this;
+        }
+
+        /// Sets `serialization_info`.
+        SubstreamData & withSerializationInfo(SerializationInfoPtr serialization_info_)
+        {
+            serialization_info = std::move(serialization_info_);
+            return *this;
+        }
+
        SerializationPtr serialization;
        DataTypePtr type;
        ColumnPtr column;
@ -164,16 +188,22 @@ public:

    using StreamCallback = std::function<void(const SubstreamPath &)>;

+    /// State and options that are threaded through the recursive enumerateStreams() calls.
+    struct EnumerateStreamsSettings
+    {
+        /// Path to the substream that is currently being visited. Implementations push
+        /// their element before invoking the callback / descending and pop it afterwards.
+        SubstreamPath path;
+
+        /// When false, SerializationArray converts the offsets column to a column of sizes
+        /// for its ArraySizes substream; when true, the offsets column is passed unchanged.
+        bool position_independent_encoding{true};
+    };
+
    virtual void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const;

-    void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const;
-    void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); }
-    void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); }
-
-    void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const;
+    /// Enumerate streams with default settings.
+    void enumerateStreams(
+        const StreamCallback & callback,
+        const DataTypePtr & type = nullptr,
+        const ColumnPtr & column = nullptr) const;

    using OutputStreamGetter = std::function<WriteBuffer*(const SubstreamPath &)>;
    using InputStreamGetter = std::function<ReadBuffer*(const SubstreamPath &)>;
@ -375,4 +405,6 @@ State * ISerialization::checkAndGetState(const StatePtr & state) const
    return state_concrete;
 }

+bool isOffsetsOfNested(const ISerialization::SubstreamPath & path);
+
 }
--- a/src/DataTypes/Serializations/SerializationArray.cpp
+++ b/src/DataTypes/Serializations/SerializationArray.cpp
@ -155,30 +155,30 @@ namespace

        return column_offsets;
    }
-}

-ColumnPtr arrayOffsetsToSizes(const IColumn & column)
-{
-    const auto & column_offsets = assert_cast<const ColumnArray::ColumnOffsets &>(column);
-    MutableColumnPtr column_sizes = column_offsets.cloneEmpty();
-
-    if (column_offsets.empty())
-        return column_sizes;
-
-    const auto & offsets_data = column_offsets.getData();
-    auto & sizes_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_sizes).getData();
-
-    sizes_data.resize(offsets_data.size());
-
-    IColumn::Offset prev_offset = 0;
-    for (size_t i = 0, size = offsets_data.size(); i < size; ++i)
+    ColumnPtr arrayOffsetsToSizes(const IColumn & column)
    {
-        auto current_offset = offsets_data[i];
-        sizes_data[i] = current_offset - prev_offset;
-        prev_offset =  current_offset;
-    }
+        const auto & column_offsets = assert_cast<const ColumnArray::ColumnOffsets &>(column);
+        MutableColumnPtr column_sizes = column_offsets.cloneEmpty();

-    return column_sizes;
+        if (column_offsets.empty())
+            return column_sizes;
+
+        const auto & offsets_data = column_offsets.getData();
+        auto & sizes_data = assert_cast<ColumnArray::ColumnOffsets &>(*column_sizes).getData();
+
+        sizes_data.resize(offsets_data.size());
+
+        IColumn::Offset prev_offset = 0;
+        for (size_t i = 0, size = offsets_data.size(); i < size; ++i)
+        {
+            auto current_offset = offsets_data[i];
+            sizes_data[i] = current_offset - prev_offset;
+            prev_offset =  current_offset;
+        }
+
+        return column_sizes;
+    }
 }

 DataTypePtr SerializationArray::SubcolumnCreator::create(const DataTypePtr & prev) const
@ -197,41 +197,42 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c
 }

 void SerializationArray::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
    const auto * type_array = data.type ? &assert_cast<const DataTypeArray &>(*data.type) : nullptr;
    const auto * column_array = data.column ? &assert_cast<const ColumnArray &>(*data.column) : nullptr;
-    auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr;
+    auto offsets = column_array ? column_array->getOffsetsPtr() : nullptr;

-    path.push_back(Substream::ArraySizes);
-    path.back().data =
-    {
+    auto offsets_serialization =
        std::make_shared<SerializationNamed>(
            std::make_shared<SerializationNumber<UInt64>>(),
-                "size" + std::to_string(getArrayLevel(path)), false),
-        data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
-        offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr,
-        data.serialization_info,
-    };
+                "size" + std::to_string(getArrayLevel(settings.path)), false);

-    callback(path);
+    auto offsets_column = offsets && !settings.position_independent_encoding
+        ? arrayOffsetsToSizes(*offsets)
+        : offsets;

-    path.back() = Substream::ArrayElements;
-    path.back().data = data;
-    path.back().creator = std::make_shared<SubcolumnCreator>(offsets_column);
+    settings.path.push_back(Substream::ArraySizes);
+    settings.path.back().data = SubstreamData(offsets_serialization)
+        .withType(type_array ? std::make_shared<DataTypeUInt64>() : nullptr)
+        .withColumn(std::move(offsets_column))
+        .withSerializationInfo(data.serialization_info);

-    SubstreamData next_data =
-    {
-        nested,
-        type_array ? type_array->getNestedType() : nullptr,
-        column_array ? column_array->getDataPtr() : nullptr,
-        data.serialization_info,
-    };
+    callback(settings.path);

-    nested->enumerateStreams(path, callback, next_data);
-    path.pop_back();
+    settings.path.back() = Substream::ArrayElements;
+    settings.path.back().data = data;
+    settings.path.back().creator = std::make_shared<SubcolumnCreator>(offsets);
+
+    auto next_data = SubstreamData(nested)
+        .withType(type_array ? type_array->getNestedType() : nullptr)
+        .withColumn(column_array ? column_array->getDataPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);
+
+    nested->enumerateStreams(settings, callback, next_data);
+    settings.path.pop_back();
 }

 void SerializationArray::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationArray.h
+++ b/src/DataTypes/Serializations/SerializationArray.h
@ -36,7 +36,7 @@ public:
      */

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

@ -79,6 +79,4 @@ private:
    };
 };

-ColumnPtr arrayOffsetsToSizes(const IColumn & column);
-
 }
--- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
@ -41,30 +41,26 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic
 }

 void SerializationLowCardinality::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
    const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr;

-    SubstreamData dict_data =
-    {
-        dict_inner_serialization,
-        data.type ? dictionary_type : nullptr,
-        column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr,
-        data.serialization_info,
-    };
+    settings.path.push_back(Substream::DictionaryKeys);
+    auto dict_data = SubstreamData(dict_inner_serialization)
+        .withType(data.type ? dictionary_type : nullptr)
+        .withColumn(column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr)
+        .withSerializationInfo(data.serialization_info);

-    path.push_back(Substream::DictionaryKeys);
-    path.back().data = dict_data;
+    settings.path.back().data = dict_data;
+    dict_inner_serialization->enumerateStreams(settings, callback, dict_data);

-    dict_inner_serialization->enumerateStreams(path, callback, dict_data);
+    settings.path.back() = Substream::DictionaryIndexes;
+    settings.path.back().data = data;

-    path.back() = Substream::DictionaryIndexes;
-    path.back().data = data;
-
-    callback(path);
-    path.pop_back();
+    callback(settings.path);
+    settings.path.pop_back();
 }

 struct KeysSerializationVersion
--- a/src/DataTypes/Serializations/SerializationLowCardinality.h
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.h
@ -18,7 +18,7 @@ public:
    explicit SerializationLowCardinality(const DataTypePtr & dictionary_type);

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationMap.cpp
+++ b/src/DataTypes/Serializations/SerializationMap.cpp
@ -257,19 +257,16 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c
 }

 void SerializationMap::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
-    SubstreamData next_data =
-    {
-        nested,
-        data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr,
-        data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr,
-        data.serialization_info,
-    };
+    auto next_data = SubstreamData(nested)
+        .withType(data.type ? assert_cast<const DataTypeMap &>(*data.type).getNestedType() : nullptr)
+        .withColumn(data.column ? assert_cast<const ColumnMap &>(*data.column).getNestedColumnPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);

-    nested->enumerateStreams(path, callback, next_data);
+    nested->enumerateStreams(settings, callback, next_data);
 }

 void SerializationMap::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationMap.h
+++ b/src/DataTypes/Serializations/SerializationMap.h
@ -32,7 +32,7 @@ public:
    void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override;

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationNamed.cpp
+++ b/src/DataTypes/Serializations/SerializationNamed.cpp
@ -4,16 +4,16 @@ namespace DB
 {

 void SerializationNamed::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
-    addToPath(path);
-    path.back().data = data;
-    path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter);
+    addToPath(settings.path);
+    settings.path.back().data = data;
+    settings.path.back().creator = std::make_shared<SubcolumnCreator>(name, escape_delimiter);

-    nested_serialization->enumerateStreams(path, callback, data);
-    path.pop_back();
+    nested_serialization->enumerateStreams(settings, callback, data);
+    settings.path.pop_back();
 }

 void SerializationNamed::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationNamed.h
+++ b/src/DataTypes/Serializations/SerializationNamed.h
@ -26,7 +26,7 @@ public:
    const String & getElementName() const { return name; }

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationNullable.cpp
+++ b/src/DataTypes/Serializations/SerializationNullable.cpp
@ -38,38 +38,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev
 }

 void SerializationNullable::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
    const auto * type_nullable = data.type ? &assert_cast<const DataTypeNullable &>(*data.type) : nullptr;
    const auto * column_nullable = data.column ? &assert_cast<const ColumnNullable &>(*data.column) : nullptr;

-    path.push_back(Substream::NullMap);
-    path.back().data =
-    {
-        std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false),
-        type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr,
-        column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr,
-        data.serialization_info,
-    };
+    auto null_map_serialization = std::make_shared<SerializationNamed>(std::make_shared<SerializationNumber<UInt8>>(), "null", false);

-    callback(path);
+    settings.path.push_back(Substream::NullMap);
+    auto null_map_data = SubstreamData(null_map_serialization)
+        .withType(type_nullable ? std::make_shared<DataTypeUInt8>() : nullptr)
+        .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);

-    path.back() = Substream::NullableElements;
-    path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column);
-    path.back().data = data;
+    settings.path.back().data = null_map_data;
+    callback(settings.path);

-    SubstreamData next_data =
-    {
-        nested,
-        type_nullable ? type_nullable->getNestedType() : nullptr,
-        column_nullable ? column_nullable->getNestedColumnPtr() : nullptr,
-        data.serialization_info,
-    };
+    settings.path.back() = Substream::NullableElements;
+    settings.path.back().creator = std::make_shared<SubcolumnCreator>(null_map_data.column);
+    settings.path.back().data = data;

-    nested->enumerateStreams(path, callback, next_data);
-    path.pop_back();
+    auto next_data = SubstreamData(nested)
+        .withType(type_nullable ? type_nullable->getNestedType() : nullptr)
+        .withColumn(column_nullable ? column_nullable->getNestedColumnPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);
+
+    nested->enumerateStreams(settings, callback, next_data);
+    settings.path.pop_back();
 }

 void SerializationNullable::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationNullable.h
+++ b/src/DataTypes/Serializations/SerializationNullable.h
@ -14,7 +14,7 @@ public:
    explicit SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {}

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationSparse.cpp
+++ b/src/DataTypes/Serializations/SerializationSparse.cpp
@ -148,39 +148,33 @@ ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev)
 }

 void SerializationSparse::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
    const auto * column_sparse = data.column ? &assert_cast<const ColumnSparse &>(*data.column) : nullptr;
-
    size_t column_size = column_sparse ? column_sparse->size() : 0;

-    path.push_back(Substream::SparseOffsets);
-    path.back().data =
-    {
-        std::make_shared<SerializationNumber<UInt64>>(),
-        data.type ? std::make_shared<DataTypeUInt64>() : nullptr,
-        column_sparse ? column_sparse->getOffsetsPtr() : nullptr,
-        data.serialization_info,
-    };
+    settings.path.push_back(Substream::SparseOffsets);
+    auto offsets_data = SubstreamData(std::make_shared<SerializationNumber<UInt64>>())
+        .withType(data.type ? std::make_shared<DataTypeUInt64>() : nullptr)
+        .withColumn(column_sparse ? column_sparse->getOffsetsPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);

-    callback(path);
+    settings.path.back().data = offsets_data;
+    callback(settings.path);

-    path.back() = Substream::SparseElements;
-    path.back().creator = std::make_shared<SubcolumnCreator>(path.back().data.column, column_size);
-    path.back().data = data;
+    settings.path.back() = Substream::SparseElements;
+    settings.path.back().creator = std::make_shared<SubcolumnCreator>(offsets_data.column, column_size);
+    settings.path.back().data = data;

-    SubstreamData next_data =
-    {
-        nested,
-        data.type,
-        column_sparse ? column_sparse->getValuesPtr() : nullptr,
-        data.serialization_info,
-    };
+    auto next_data = SubstreamData(nested)
+        .withType(data.type)
+        .withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr)
+        .withSerializationInfo(data.serialization_info);

-    nested->enumerateStreams(path, callback, next_data);
-    path.pop_back();
+    nested->enumerateStreams(settings, callback, next_data);
+    settings.path.pop_back();
 }

 void SerializationSparse::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationSparse.h
+++ b/src/DataTypes/Serializations/SerializationSparse.h
@ -28,7 +28,7 @@ public:
    Kind getKind() const override { return Kind::SPARSE; }

    virtual void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationTuple.cpp
+++ b/src/DataTypes/Serializations/SerializationTuple.cpp
@ -283,7 +283,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
 }

 void SerializationTuple::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
@ -293,15 +293,12 @@ void SerializationTuple::enumerateStreams(

    for (size_t i = 0; i < elems.size(); ++i)
    {
-        SubstreamData next_data =
-        {
-            elems[i],
-            type_tuple ? type_tuple->getElement(i) : nullptr,
-            column_tuple ? column_tuple->getColumnPtr(i) : nullptr,
-            info_tuple ? info_tuple->getElementInfo(i) : nullptr,
-        };
+        auto next_data = SubstreamData(elems[i])
+            .withType(type_tuple ? type_tuple->getElement(i) : nullptr)
+            .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr)
+            .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr);

-        elems[i]->enumerateStreams(path, callback, next_data);
+        elems[i]->enumerateStreams(settings, callback, next_data);
    }
 }

--- a/src/DataTypes/Serializations/SerializationTuple.h
+++ b/src/DataTypes/Serializations/SerializationTuple.h
@ -34,7 +34,7 @@ public:
    /** Each sub-column in a tuple is serialized in separate stream.
      */
    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/DataTypes/Serializations/SerializationWrapper.cpp
+++ b/src/DataTypes/Serializations/SerializationWrapper.cpp
@ -5,11 +5,11 @@ namespace DB
 {

 void SerializationWrapper::enumerateStreams(
-    SubstreamPath & path,
+    EnumerateStreamsSettings & settings,
    const StreamCallback & callback,
    const SubstreamData & data) const
 {
-    nested_serialization->enumerateStreams(path, callback, data);
+    nested_serialization->enumerateStreams(settings, callback, data);
 }

 void SerializationWrapper::serializeBinaryBulkStatePrefix(
--- a/src/DataTypes/Serializations/SerializationWrapper.h
+++ b/src/DataTypes/Serializations/SerializationWrapper.h
@ -21,7 +21,7 @@ public:
    Kind getKind() const override { return nested_serialization->getKind(); }

    void enumerateStreams(
-        SubstreamPath & path,
+        EnumerateStreamsSettings & settings,
        const StreamCallback & callback,
        const SubstreamData & data) const override;

--- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
+++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
@ -44,15 +44,6 @@ struct AttributeConfiguration

 using AttributeNameToConfiguration = std::unordered_map<std::string, AttributeConfiguration>;

-/// Get value from field and convert it to string.
-/// Also remove quotes from strings.
-String getFieldAsString(const Field & field)
-{
-    if (field.getType() == Field::Types::Which::String)
-        return field.get<String>();
-    return applyVisitor(FieldVisitorToString(), field);
-}
-
 String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr)
 {
    if (!dict_attr->expression)
@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att
    /// EXPRESSION PROPERTY should be expression or string
    String expression_str;
    if (const auto * literal = dict_attr->expression->as<ASTLiteral>(); literal && literal->value.getType() == Field::Types::String)
-        expression_str = getFieldAsString(literal->value);
+        expression_str = convertFieldToString(literal->value);
    else
        expression_str = queryToString(dict_attr->expression);

@ -275,7 +266,7 @@ void buildSingleAttribute(
    AutoPtr<Element> null_value_element(doc->createElement("null_value"));
    String null_value_str;
    if (dict_attr->default_value)
-        null_value_str = getFieldAsString(dict_attr->default_value->as<ASTLiteral>()->value);
+        null_value_str = convertFieldToString(dict_attr->default_value->as<ASTLiteral>()->value);
    AutoPtr<Text> null_value(doc->createTextNode(null_value_str));
    null_value_element->appendChild(null_value);
    attribute_element->appendChild(null_value_element);
@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
        }
        else if (const auto * literal = pair->second->as<const ASTLiteral>())
        {
-            AutoPtr<Text> value(doc->createTextNode(getFieldAsString(literal->value)));
+            AutoPtr<Text> value(doc->createTextNode(convertFieldToString(literal->value)));
            current_xml_element->appendChild(value);
        }
        else if (const auto * list = pair->second->as<const ASTExpressionList>())
@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
            Field value;
            result->get(0, value);

-            AutoPtr<Text> text_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> text_value(doc->createTextNode(convertFieldToString(value)));
            current_xml_element->appendChild(text_value);
        }
        else
@ -519,7 +510,7 @@ void buildSourceConfiguration(
        {
            AutoPtr<Element> setting_change_element(doc->createElement(name));
            settings_element->appendChild(setting_change_element);
-            AutoPtr<Text> setting_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> setting_value(doc->createTextNode(convertFieldToString(value)));
            setting_change_element->appendChild(setting_value);
        }
    }
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@ -239,7 +239,16 @@ public:
    }

    /// For one local path there might be multiple remote paths in case of Log family engines.
-    using LocalPathWithObjectStoragePaths = std::pair<String, StoredObjects>;
+    struct LocalPathWithObjectStoragePaths
+    {
+        /// Path on the local side that the objects below belong to.
+        std::string local_path;
+
+        /// Prefix shared by the remote objects.
+        /// NOTE(review): DiskObjectStorage passes its object storage root path here - confirm for other disks.
+        std::string common_prefix_for_objects;
+
+        /// Remote objects that back `local_path`.
+        StoredObjects objects;
+
+        /// An explicit constructor is provided so that elements can be built in place,
+        /// e.g. with std::vector::emplace_back(local_path, prefix, std::move(objects)).
+        LocalPathWithObjectStoragePaths(
+            const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_)
+            : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_))
+        {
+        }
+    };

    virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
    {
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
    {
        try
        {
-            paths_map.emplace_back(local_path, getStorageObjects(local_path));
+            paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path));
        }
        catch (const Exception & e)
        {
@ -282,7 +282,10 @@ String DiskObjectStorage::getUniqueId(const String & path) const
 bool DiskObjectStorage::checkUniqueId(const String & id) const
 {
    if (!id.starts_with(object_storage_root_path))
+    {
+        LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}", id, object_storage_root_path);
        return false;
+    }

    auto object = StoredObject::create(*object_storage, id, {}, {}, true);
    return object_storage->exists(object);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
@ -68,6 +68,14 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf)
    }
 }

+void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_)
+{
+    storage_objects.emplace_back(relative_path, bytes_size);
+    total_size = bytes_size;
+    ref_count = ref_count_;
+    read_only = read_only_;
+}
+
 void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
 {
    ReadBufferFromString buf(data);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
@ -50,6 +50,7 @@ public:

    void deserialize(ReadBuffer & buf);
    void deserializeFromString(const std::string & data);
+    void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_);

    void serialize(WriteBuffer & buf, bool sync) const;
    std::string serializeToString() const;
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@ -56,7 +56,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
    if (!response.IsSuccess())
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType())), err.GetErrorType());
    }
 }

@ -70,7 +70,7 @@ void throwIfUnexpectedError(const Aws::Utils::Outcome<Result, Error> & response,
    if (!response.IsSuccess() && (!if_exists || !isNotFoundError(response.GetError().GetErrorType())))
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
    }
 }

--- a/src/Disks/ObjectStorages/S3/diskSettings.cpp
+++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp
@ -116,7 +116,8 @@ std::unique_ptr<Aws::S3::S3Client> getClient(const Poco::Util::AbstractConfigura
    S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
        config.getString(config_prefix + ".region", ""),
        context->getRemoteHostFilter(), context->getGlobalContext()->getSettingsRef().s3_max_redirects,
-        context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging);
+        context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging,
+        /* for_disk_s3 = */ true);

    S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint")));
    if (uri.key.back() != '/')
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@ -34,6 +34,7 @@ namespace ErrorCodes
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int SEEK_POSITION_OUT_OF_BOUND;
    extern const int LOGICAL_ERROR;
+    extern const int CANNOT_ALLOCATE_MEMORY;
 }


@ -136,6 +137,23 @@ bool ReadBufferFromS3::nextImpl()
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);

+            if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
+            {
+                /// It doesn't make sense to retry Access Denied or No Such Key
+                if (!s3_exception->isRetryableError())
+                {
+                    tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
+                    throw;
+                }
+            }
+
+            /// It doesn't make sense to retry allocator errors
+            if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY)
+            {
+                tryLogCurrentException(log);
+                throw;
+            }
+
            LOG_DEBUG(
                log,
                "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}",
@ -306,7 +324,10 @@ std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
        return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), buffer_size);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+    {
+        const auto & error = outcome.GetError();
+        throw S3Exception(error.GetMessage(), error.GetErrorType());
+    }
 }

 SeekableReadBufferPtr ReadBufferS3Factory::getReader()
--- a/src/IO/S3/PocoHTTPClient.cpp
+++ b/src/IO/S3/PocoHTTPClient.cpp
@ -42,6 +42,18 @@ namespace ProfileEvents
    extern const Event S3WriteRequestsErrors;
    extern const Event S3WriteRequestsThrottling;
    extern const Event S3WriteRequestsRedirects;
+
+    extern const Event DiskS3ReadMicroseconds;
+    extern const Event DiskS3ReadRequestsCount;
+    extern const Event DiskS3ReadRequestsErrors;
+    extern const Event DiskS3ReadRequestsThrottling;
+    extern const Event DiskS3ReadRequestsRedirects;
+
+    extern const Event DiskS3WriteMicroseconds;
+    extern const Event DiskS3WriteRequestsCount;
+    extern const Event DiskS3WriteRequestsErrors;
+    extern const Event DiskS3WriteRequestsThrottling;
+    extern const Event DiskS3WriteRequestsRedirects;
 }

 namespace CurrentMetrics
@ -62,11 +74,13 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
-        bool enable_s3_requests_logging_)
+        bool enable_s3_requests_logging_,
+        bool for_disk_s3_)
    : force_region(force_region_)
    , remote_host_filter(remote_host_filter_)
    , s3_max_redirects(s3_max_redirects_)
    , enable_s3_requests_logging(enable_s3_requests_logging_)
+    , for_disk_s3(for_disk_s3_)
 {
 }

@ -112,6 +126,7 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_config
    , remote_host_filter(client_configuration.remote_host_filter)
    , s3_max_redirects(client_configuration.s3_max_redirects)
    , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging)
+    , for_disk_s3(client_configuration.for_disk_s3)
    , extra_headers(client_configuration.extra_headers)
 {
 }
@ -176,6 +191,46 @@ namespace
    }
 }

+PocoHTTPClient::S3MetricKind PocoHTTPClient::getMetricKind(const Aws::Http::HttpRequest & request)
+{
+    switch (request.GetMethod())
+    {
+        case Aws::Http::HttpMethod::HTTP_GET:
+        case Aws::Http::HttpMethod::HTTP_HEAD:
+            return S3MetricKind::Read;
+        case Aws::Http::HttpMethod::HTTP_POST:
+        case Aws::Http::HttpMethod::HTTP_DELETE:
+        case Aws::Http::HttpMethod::HTTP_PUT:
+        case Aws::Http::HttpMethod::HTTP_PATCH:
+            return S3MetricKind::Write;
+    }
+    throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED);
+}
+
+void PocoHTTPClient::addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount) const
+{
+    const ProfileEvents::Event events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {
+        {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds},
+        {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount},
+        {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors},
+        {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling},
+        {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects},
+    };
+
+    const ProfileEvents::Event disk_s3_events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {
+        {ProfileEvents::DiskS3ReadMicroseconds, ProfileEvents::DiskS3WriteMicroseconds},
+        {ProfileEvents::DiskS3ReadRequestsCount, ProfileEvents::DiskS3WriteRequestsCount},
+        {ProfileEvents::DiskS3ReadRequestsErrors, ProfileEvents::DiskS3WriteRequestsErrors},
+        {ProfileEvents::DiskS3ReadRequestsThrottling, ProfileEvents::DiskS3WriteRequestsThrottling},
+        {ProfileEvents::DiskS3ReadRequestsRedirects, ProfileEvents::DiskS3WriteRequestsRedirects},
+    };
+
+    S3MetricKind kind = getMetricKind(request);
+
+    ProfileEvents::increment(events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);
+    if (for_disk_s3)
+        ProfileEvents::increment(disk_s3_events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);
+}

 void PocoHTTPClient::makeRequestInternal(
    Aws::Http::HttpRequest & request,
@ -189,45 +244,7 @@ void PocoHTTPClient::makeRequestInternal(
    if (enable_s3_requests_logging)
        LOG_TEST(log, "Make request to: {}", uri);

-    enum class S3MetricType
-    {
-        Microseconds,
-        Count,
-        Errors,
-        Throttling,
-        Redirects,
-
-        EnumSize,
-    };
-
-    auto select_metric = [&request](S3MetricType type)
-    {
-        const ProfileEvents::Event events_map[][2] = {
-            {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds},
-            {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount},
-            {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors},
-            {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling},
-            {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects},
-        };
-
-        static_assert((sizeof(events_map) / sizeof(events_map[0])) == static_cast<unsigned int>(S3MetricType::EnumSize));
-
-        switch (request.GetMethod())
-        {
-            case Aws::Http::HttpMethod::HTTP_GET:
-            case Aws::Http::HttpMethod::HTTP_HEAD:
-                return events_map[static_cast<unsigned int>(type)][0]; // Read
-            case Aws::Http::HttpMethod::HTTP_POST:
-            case Aws::Http::HttpMethod::HTTP_DELETE:
-            case Aws::Http::HttpMethod::HTTP_PUT:
-            case Aws::Http::HttpMethod::HTTP_PATCH:
-                return events_map[static_cast<unsigned int>(type)][1]; // Write
-        }
-
-        throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED);
-    };
-
-    ProfileEvents::increment(select_metric(S3MetricType::Count));
+    addMetric(request, S3MetricType::Count);
    CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests};

    try
@ -334,7 +351,7 @@ void PocoHTTPClient::makeRequestInternal(
            auto & response_body_stream = session->receiveResponse(poco_response);

            watch.stop();
-            ProfileEvents::increment(select_metric(S3MetricType::Microseconds), watch.elapsedMicroseconds());
+            addMetric(request, S3MetricType::Microseconds, watch.elapsedMicroseconds());

            int status_code = static_cast<int>(poco_response.getStatus());

@ -349,7 +366,7 @@ void PocoHTTPClient::makeRequestInternal(
                if (enable_s3_requests_logging)
                    LOG_TEST(log, "Redirecting request to new location: {}", location);

-                ProfileEvents::increment(select_metric(S3MetricType::Redirects));
+                addMetric(request, S3MetricType::Redirects);

                continue;
            }
@ -387,7 +404,7 @@ void PocoHTTPClient::makeRequestInternal(
                    LOG_WARNING(log, "Response for request contain <Error> tag in body, settings internal server error (500 code)");
                    response->SetResponseCode(Aws::Http::HttpResponseCode::INTERNAL_SERVER_ERROR);

-                    ProfileEvents::increment(select_metric(S3MetricType::Errors));
+                    addMetric(request, S3MetricType::Errors);
                    if (error_report)
                        error_report(request_configuration);

@ -401,11 +418,11 @@ void PocoHTTPClient::makeRequestInternal(

                if (status_code == 429 || status_code == 503)
                { // API throttling
-                    ProfileEvents::increment(select_metric(S3MetricType::Throttling));
+                    addMetric(request, S3MetricType::Throttling);
                }
                else if (status_code >= 300)
                {
-                    ProfileEvents::increment(select_metric(S3MetricType::Errors));
+                    addMetric(request, S3MetricType::Errors);
                    if (status_code >= 500 && error_report)
                        error_report(request_configuration);
                }
@ -423,7 +440,7 @@ void PocoHTTPClient::makeRequestInternal(
        response->SetClientErrorType(Aws::Client::CoreErrors::NETWORK_CONNECTION);
        response->SetClientErrorMessage(getCurrentExceptionMessage(false));

-        ProfileEvents::increment(select_metric(S3MetricType::Errors));
+        addMetric(request, S3MetricType::Errors);
    }
 }

--- a/src/IO/S3/PocoHTTPClient.h
+++ b/src/IO/S3/PocoHTTPClient.h
@ -44,6 +44,7 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
    const RemoteHostFilter & remote_host_filter;
    unsigned int s3_max_redirects;
    bool enable_s3_requests_logging;
+    bool for_disk_s3;
    HeaderCollection extra_headers;

    void updateSchemeAndRegion();
@ -55,7 +56,8 @@ private:
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
-        bool enable_s3_requests_logging_
+        bool enable_s3_requests_logging_,
+        bool for_disk_s3_
    );

    /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization.
@ -113,18 +115,42 @@ public:
        Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const override;

 private:
+
    void makeRequestInternal(
        Aws::Http::HttpRequest & request,
        std::shared_ptr<PocoHTTPResponse> & response,
        Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
        Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const;

+    enum class S3MetricType
+    {
+        Microseconds,
+        Count,
+        Errors,
+        Throttling,
+        Redirects,
+
+        EnumSize,
+    };
+
+    enum class S3MetricKind
+    {
+        Read,
+        Write,
+
+        EnumSize,
+    };
+
+    static S3MetricKind getMetricKind(const Aws::Http::HttpRequest & request);
+    void addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount = 1) const;
+
    std::function<ClientConfigurationPerRequest(const Aws::Http::HttpRequest &)> per_request_configuration;
    std::function<void(const ClientConfigurationPerRequest &)> error_report;
    ConnectionTimeouts timeouts;
    const RemoteHostFilter & remote_host_filter;
    unsigned int s3_max_redirects;
    bool enable_s3_requests_logging;
+    bool for_disk_s3;
    const HeaderCollection extra_headers;
 };

--- a/src/IO/S3/tests/gtest_aws_s3_client.cpp
+++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp
@ -87,7 +87,8 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders)
        region,
        remote_host_filter,
        s3_max_redirects,
-        enable_s3_requests_logging
+        enable_s3_requests_logging,
+        /* for_disk_s3 = */ false
    );

    client_configuration.endpointOverride = uri.endpoint;
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@ -35,6 +35,26 @@

 #    include <fstream>

+namespace DB
+{
+
+bool S3Exception::isRetryableError() const
+{
+    /// Looks like this list is quite conservative, add more codes if you wish
+    static const std::unordered_set<Aws::S3::S3Errors> unretryable_errors = {
+        Aws::S3::S3Errors::NO_SUCH_KEY,
+        Aws::S3::S3Errors::ACCESS_DENIED,
+        Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID,
+        Aws::S3::S3Errors::INVALID_SIGNATURE,
+        Aws::S3::S3Errors::NO_SUCH_UPLOAD,
+        Aws::S3::S3Errors::NO_SUCH_BUCKET,
+    };
+
+    return !unretryable_errors.contains(code);
+}
+
+}
+
 namespace
 {

@ -543,7 +563,7 @@ public:
            /// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be
            /// quite verbose even if nobody configured them. So we use our provider first and only after it use default providers.
            {
-                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging);
+                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3);
                AddProvider(std::make_shared<AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider>(aws_client_configuration));
            }

@ -580,7 +600,7 @@ public:
            }
            else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true")
            {
-                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging);
+                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3);

                /// See MakeDefaultHttpResourceClientConfiguration().
                /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside
@ -700,9 +720,10 @@ namespace S3
        const String & force_region,
        const RemoteHostFilter & remote_host_filter,
        unsigned int s3_max_redirects,
-        bool enable_s3_requests_logging)
+        bool enable_s3_requests_logging,
+        bool for_disk_s3)
    {
-        return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging);
+        return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging, for_disk_s3);
    }

    URI::URI(const Poco::URI & uri_)
--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@ -7,23 +7,62 @@
 #include <base/types.h>
 #include <aws/core/Aws.h>
 #include <aws/core/client/ClientConfiguration.h>
+#include <aws/s3/S3Errors.h>
 #include <IO/S3/PocoHTTPClient.h>
 #include <Poco/URI.h>

+#include <Common/Exception.h>
+
 namespace Aws::S3
 {
    class S3Client;
 }

+
 namespace DB
 {
-    class RemoteHostFilter;
-    struct HttpHeader;
-    using HeaderCollection = std::vector<HttpHeader>;
+namespace ErrorCodes
+{
+    extern const int S3_ERROR;
 }

+class RemoteHostFilter;
+struct HttpHeader;
+using HeaderCollection = std::vector<HttpHeader>;
+
+class S3Exception : public Exception
+{
+public:
+
+    // Format message with fmt::format, like the logging functions.
+    template <typename... Args>
+    S3Exception(Aws::S3::S3Errors code_, fmt::format_string<Args...> fmt, Args &&... args)
+        : Exception(fmt::format(fmt, std::forward<Args>(args)...), ErrorCodes::S3_ERROR)
+        , code(code_)
+    {
+    }
+
+    S3Exception(const std::string & msg, Aws::S3::S3Errors code_)
+        : Exception(msg, ErrorCodes::S3_ERROR)
+        , code(code_)
+    {}
+
+    Aws::S3::S3Errors getS3ErrorCode() const
+    {
+        return code;
+    }
+
+    bool isRetryableError() const;
+
+private:
+    const Aws::S3::S3Errors code;
+};
+}
+
+
 namespace DB::S3
 {
+
 class ClientFactory
 {
 public:
@ -45,7 +84,8 @@ public:
        const String & force_region,
        const RemoteHostFilter & remote_host_filter,
        unsigned int s3_max_redirects,
-        bool enable_s3_requests_logging);
+        bool enable_s3_requests_logging,
+        bool for_disk_s3);

 private:
    ClientFactory();
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@ -8,6 +8,7 @@

 #include <IO/WriteBufferFromS3.h>
 #include <IO/WriteHelpers.h>
+#include <IO/S3Common.h>
 #include <Interpreters/Context.h>

 #include <aws/s3/S3Client.h>
@ -173,7 +174,9 @@ void WriteBufferFromS3::finalizeImpl()
        auto response = client_ptr->HeadObject(request);

        if (!response.IsSuccess())
-            throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket);
+            throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
+        else
+            LOG_TRACE(log, "Object {} exists after upload", key);
    }
 }

@ -197,7 +200,7 @@ void WriteBufferFromS3::createMultipartUpload()
        LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
 }

 void WriteBufferFromS3::writePart()
@ -309,7 +312,7 @@ void WriteBufferFromS3::processUploadRequest(UploadPartTask & task)
        LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size());
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());

    total_parts_uploaded++;
 }
@ -343,9 +346,10 @@ void WriteBufferFromS3::completeMultipartUpload()
        LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size());
    else
    {
-        throw Exception(ErrorCodes::S3_ERROR, "{} Tags:{}",
-            outcome.GetError().GetMessage(),
-            fmt::join(tags.begin(), tags.end(), " "));
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Tags: {}",
+            outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " "));
    }
 }

@ -430,7 +434,10 @@ void WriteBufferFromS3::processPutRequest(const PutObjectTask & task)
    if (outcome.IsSuccess())
        LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool);
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}",
+            outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool);
 }

 void WriteBufferFromS3::waitForReadyBackGroundTasks()
--- a/src/Interpreters/Cache/FileCacheSettings.cpp
+++ b/src/Interpreters/Cache/FileCacheSettings.cpp
@ -31,7 +31,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration &
    enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false);
    enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD);

-    do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true);
+    do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", false);
 }

 }
--- a/src/Interpreters/InterpreterDeleteQuery.cpp
+++ b/src/Interpreters/InterpreterDeleteQuery.cpp
@ -21,7 +21,6 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int BAD_ARGUMENTS;
    extern const int TABLE_IS_READ_ONLY;
    extern const int SUPPORT_IS_DISABLED;
 }
@ -34,11 +33,6 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex

 BlockIO InterpreterDeleteQuery::execute()
 {
-    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
-    {
-        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
-    }
-
    FunctionNameNormalizer().visit(query_ptr.get());
    const ASTDeleteQuery & delete_query = query_ptr->as<ASTDeleteQuery &>();
    auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary);
@ -49,10 +43,6 @@ BlockIO InterpreterDeleteQuery::execute()

    /// First check table storage for validations.
    StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
-    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
-    if (!merge_tree)
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree family tables are supported");
-
    checkStorageSupportsTransactionsIfNeeded(table, getContext());
    if (table->isStaticStorage())
        throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only");
@ -69,6 +59,27 @@ BlockIO InterpreterDeleteQuery::execute()
    auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout);
    auto metadata_snapshot = table->getInMemoryMetadataPtr();

+    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
+    if (!merge_tree)
+    {
+        /// Convert to MutationCommand
+        MutationCommands mutation_commands;
+        MutationCommand mut_command;
+
+        mut_command.type = MutationCommand::Type::DELETE;
+        mut_command.predicate = delete_query.predicate;
+
+        mutation_commands.emplace_back(mut_command);
+
+        table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
+        MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
+        table->mutate(mutation_commands, getContext());
+        return {};
+    }
+
+    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
+
    /// Convert to MutationCommand
    MutationCommands mutation_commands;
    MutationCommand mut_command;
--- a/src/Interpreters/InterpreterDescribeQuery.cpp
+++ b/src/Interpreters/InterpreterDescribeQuery.cpp
@ -163,7 +163,7 @@ BlockIO InterpreterDescribeQuery::execute()
                    res_columns[6]->insertDefault();

                res_columns[7]->insert(1u);
-            }, { type->getDefaultSerialization(), type, nullptr, nullptr });
+            }, ISerialization::SubstreamData(type->getDefaultSerialization()).withType(type));
        }
    }

--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@ -226,7 +226,7 @@ bool isStorageTouchedByMutations(
    ASTPtr select_query = prepareQueryAffectedAST(commands, storage, context_copy);

    /// Interpreter must be alive, when we use result of execute() method.
-    /// For some reason it may copy context and and give it into ExpressionTransform
+    /// For some reason it may copy context and give it into ExpressionTransform
    /// after that we will use context from destroyed stack frame in our stream.
    InterpreterSelectQuery interpreter(
        select_query, context_copy, storage, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections());
@ -288,13 +288,17 @@ MutationsInterpreter::MutationsInterpreter(
    const StorageMetadataPtr & metadata_snapshot_,
    MutationCommands commands_,
    ContextPtr context_,
-    bool can_execute_)
+    bool can_execute_,
+    bool return_all_columns_,
+    bool return_deleted_rows_)
    : storage(std::move(storage_))
    , metadata_snapshot(metadata_snapshot_)
    , commands(std::move(commands_))
    , context(Context::createCopy(context_))
    , can_execute(can_execute_)
    , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections())
+    , return_all_columns(return_all_columns_)
+    , return_deleted_rows(return_deleted_rows_)
 {
    mutation_ast = prepare(!can_execute);
 }
@ -472,14 +476,21 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run)
    /// First, break a sequence of commands into stages.
    for (auto & command : commands)
    {
+        // deleted rows can be returned only when every command in the list is a DELETE
+        assert(command.type == MutationCommand::DELETE || !return_deleted_rows);
+
        if (command.type == MutationCommand::DELETE)
        {
            mutation_kind.set(MutationKind::MUTATE_OTHER);
            if (stages.empty() || !stages.back().column_to_updated.empty())
                stages.emplace_back(context);

-            auto negated_predicate = makeASTFunction("isZeroOrNull", getPartitionAndPredicateExpressionForMutationCommand(command));
-            stages.back().filters.push_back(negated_predicate);
+            auto predicate  = getPartitionAndPredicateExpressionForMutationCommand(command);
+
+            if (!return_deleted_rows)
+                predicate = makeASTFunction("isZeroOrNull", predicate);
+
+            stages.back().filters.push_back(predicate);
        }
        else if (command.type == MutationCommand::UPDATE)
        {
@ -789,7 +800,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector<Stage> &
    /// Next, for each stage calculate columns changed by this and previous stages.
    for (size_t i = 0; i < prepared_stages.size(); ++i)
    {
-        if (!prepared_stages[i].filters.empty())
+        if (return_all_columns || !prepared_stages[i].filters.empty())
        {
            for (const auto & column : all_columns)
                prepared_stages[i].output_columns.insert(column.name);
--- a/src/Interpreters/MutationsInterpreter.h
+++ b/src/Interpreters/MutationsInterpreter.h
@ -43,7 +43,9 @@ public:
        const StorageMetadataPtr & metadata_snapshot_,
        MutationCommands commands_,
        ContextPtr context_,
-        bool can_execute_);
+        bool can_execute_,
+        bool return_all_columns_ = false,
+        bool return_deleted_rows_ = false);

    void validate();

@ -156,6 +158,12 @@ private:

    /// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
    ColumnDependencies dependencies;
+
+    // whether all columns should be returned, not just updated
+    bool return_all_columns;
+
+    // whether we should return deleted or nondeleted rows on DELETE mutation
+    bool return_deleted_rows;
 };

 }
--- a/src/Interpreters/inplaceBlockConversions.cpp
+++ b/src/Interpreters/inplaceBlockConversions.cpp
@ -12,6 +12,7 @@
 #include <Parsers/ASTFunction.h>
 #include <utility>
 #include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/ObjectUtils.h>
 #include <Interpreters/RequiredSourceColumnsVisitor.h>
 #include <Common/checkStackSize.h>
 #include <Storages/ColumnsDescription.h>
@ -187,29 +188,56 @@ ActionsDAGPtr evaluateMissingDefaults(
    return createExpressions(header, expr_list, save_unneeded_columns, context);
 }

-static bool arrayHasNoElementsRead(const IColumn & column)
+static std::unordered_map<String, ColumnPtr> collectOffsetsColumns(
+    const NamesAndTypesList & available_columns, const Columns & res_columns)
 {
-    const auto * column_array = typeid_cast<const ColumnArray *>(&column);
+    std::unordered_map<String, ColumnPtr> offsets_columns;

-    if (!column_array)
-        return false;
+    auto available_column = available_columns.begin();
+    for (size_t i = 0; i < available_columns.size(); ++i, ++available_column)
+    {
+        if (res_columns[i] == nullptr || isColumnConst(*res_columns[i]))
+            continue;

-    size_t size = column_array->size();
-    if (!size)
-        return false;
+        auto serialization = IDataType::getSerialization(*available_column);
+        serialization->enumerateStreams([&](const auto & subpath)
+        {
+            if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes)
+                return;

-    size_t data_size = column_array->getData().size();
-    if (data_size)
-        return false;
+            auto stream_name = ISerialization::getFileNameForStream(*available_column, subpath);
+            const auto & current_offsets_column = subpath.back().data.column;

-    size_t last_offset = column_array->getOffsets()[size - 1];
-    return last_offset != 0;
+            /// If for some reason multiple offsets columns are present
+            /// for the same nested data structure, choose the one that is not empty.
+            if (current_offsets_column && !current_offsets_column->empty())
+            {
+                auto & offsets_column = offsets_columns[stream_name];
+                if (!offsets_column)
+                    offsets_column = current_offsets_column;
+
+            #ifndef NDEBUG
+                const auto & offsets_data = assert_cast<const ColumnUInt64 &>(*offsets_column).getData();
+                const auto & current_offsets_data = assert_cast<const ColumnUInt64 &>(*current_offsets_column).getData();
+
+                if (offsets_data != current_offsets_data)
+                    throw Exception(ErrorCodes::LOGICAL_ERROR,
+                        "Found non-equal columns with offsets (sizes: {} and {}) for stream {}",
+                        offsets_data.size(), current_offsets_data.size(), stream_name);
+            #endif
+            }
+        }, available_column->type, res_columns[i]);
+    }
+
+    return offsets_columns;
 }

 void fillMissingColumns(
    Columns & res_columns,
    size_t num_rows,
    const NamesAndTypesList & requested_columns,
+    const NamesAndTypesList & available_columns,
+    const NameSet & partially_read_columns,
    StorageMetadataPtr metadata_snapshot)
 {
    size_t num_columns = requested_columns.size();
@ -218,65 +246,79 @@ void fillMissingColumns(
            "Invalid number of columns passed to fillMissingColumns. Expected {}, got {}",
            num_columns, res_columns.size());

-    /// For a missing column of a nested data structure we must create not a column of empty
-    /// arrays, but a column of arrays of correct length.
+    /// For a missing column of a nested data structure
+    /// we must create not a column of empty arrays,
+    /// but a column of arrays of correct length.

    /// First, collect offset columns for all arrays in the block.
+    auto offsets_columns = collectOffsetsColumns(available_columns, res_columns);

-    std::unordered_map<String, ColumnPtr> offset_columns;
+    /// Insert default values only for columns without default expressions.
    auto requested_column = requested_columns.begin();
    for (size_t i = 0; i < num_columns; ++i, ++requested_column)
-    {
-        if (res_columns[i] == nullptr)
-            continue;
-
-        if (const auto * array = typeid_cast<const ColumnArray *>(res_columns[i].get()))
-        {
-            String offsets_name = Nested::extractTableName(requested_column->name);
-            auto & offsets_column = offset_columns[offsets_name];
-
-            /// If for some reason multiple offsets columns are present for the same nested data structure,
-            /// choose the one that is not empty.
-            if (!offsets_column || offsets_column->empty())
-                offsets_column = array->getOffsetsPtr();
-        }
-    }
-
-    /// insert default values only for columns without default expressions
-    requested_column = requested_columns.begin();
-    for (size_t i = 0; i < num_columns; ++i, ++requested_column)
    {
        const auto & [name, type] = *requested_column;

-        if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i]))
+        if (res_columns[i] && partially_read_columns.contains(name))
            res_columns[i] = nullptr;

-        if (res_columns[i] == nullptr)
+        if (res_columns[i])
+            continue;
+
+        if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name))
+            continue;
+
+        std::vector<ColumnPtr> current_offsets;
+        size_t num_dimensions = 0;
+
+        const auto * array_type = typeid_cast<const DataTypeArray *>(type.get());
+        if (array_type && !offsets_columns.empty())
        {
-            if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name))
-                continue;
+            num_dimensions = getNumberOfDimensions(*array_type);
+            current_offsets.resize(num_dimensions);

-            String offsets_name = Nested::extractTableName(name);
-            auto offset_it = offset_columns.find(offsets_name);
-            const auto * array_type = typeid_cast<const DataTypeArray *>(type.get());
-            if (offset_it != offset_columns.end() && array_type)
+            auto serialization = IDataType::getSerialization(*requested_column);
+            serialization->enumerateStreams([&](const auto & subpath)
            {
-                const auto & nested_type = array_type->getNestedType();
-                ColumnPtr offsets_column = offset_it->second;
-                size_t nested_rows = typeid_cast<const ColumnUInt64 &>(*offsets_column).getData().back();
+                if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes)
+                    return;

-                ColumnPtr nested_column =
-                    nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst();
+                size_t level = ISerialization::getArrayLevel(subpath);
+                assert(level < num_dimensions);

-                res_columns[i] = ColumnArray::create(nested_column, offsets_column);
-            }
-            else
+                auto stream_name = ISerialization::getFileNameForStream(*requested_column, subpath);
+                auto it = offsets_columns.find(stream_name);
+                if (it != offsets_columns.end())
+                    current_offsets[level] = it->second;
+            });
+
+            for (size_t j = 0; j < num_dimensions; ++j)
            {
-                /// We must turn a constant column into a full column because the interpreter could infer
-                /// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
-                res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst();
+                if (!current_offsets[j])
+                {
+                    current_offsets.resize(j);
+                    break;
+                }
            }
        }
+
+        if (!current_offsets.empty())
+        {
+            size_t num_empty_dimensions = num_dimensions - current_offsets.size();
+            auto scalar_type = createArrayOfType(getBaseTypeOfArray(type), num_empty_dimensions);
+
+            size_t data_size = assert_cast<const ColumnUInt64 &>(*current_offsets.back()).getData().back();
+            res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst();
+
+            for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it)
+                res_columns[i] = ColumnArray::create(res_columns[i], *it);
+        }
+        else
+        {
+            /// We must turn a constant column into a full column because the interpreter could infer
+            /// that it is constant everywhere but in some blocks (from other parts) it can be a full column.
+            res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst();
+        }
    }
 }

--- a/src/Interpreters/inplaceBlockConversions.h
+++ b/src/Interpreters/inplaceBlockConversions.h
@ -1,5 +1,6 @@
 #pragma once

+#include <Core/Names.h>
 #include <Interpreters/Context_fwd.h>
 #include <Common/COW.h>

@ -43,6 +44,8 @@ void fillMissingColumns(
    Columns & res_columns,
    size_t num_rows,
    const NamesAndTypesList & requested_columns,
+    const NamesAndTypesList & available_columns,
+    const NameSet & partially_read_columns,
    StorageMetadataPtr metadata_snapshot);

 }
--- a/src/Parsers/ParserSelectQuery.cpp
+++ b/src/Parsers/ParserSelectQuery.cpp
@ -224,8 +224,6 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
            select_query->group_by_with_rollup = true;
        else if (s_cube.ignore(pos, expected))
            select_query->group_by_with_cube = true;
-        else if (s_grouping_sets.ignore(pos, expected))
-            select_query->group_by_with_grouping_sets = true;
        else if (s_totals.ignore(pos, expected))
            select_query->group_by_with_totals = true;
        else
--- a/src/Processors/QueryPlan/AggregatingStep.cpp
+++ b/src/Processors/QueryPlan/AggregatingStep.cpp
@ -251,14 +251,17 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                outputs.push_back(grouping_node);

                const auto & missing_columns = grouping_sets_params[set_counter].missing_keys;
+                const auto & used_keys = grouping_sets_params[set_counter].used_keys;

                auto to_nullable_function = FunctionFactory::instance().get("toNullable", nullptr);
                for (size_t i = 0; i < output_header.columns(); ++i)
                {
                    auto & col = output_header.getByPosition(i);
-                    const auto it = std::find_if(
+                    const auto missing_it = std::find_if(
                        missing_columns.begin(), missing_columns.end(), [&](const auto & missing_col) { return missing_col == col.name; });
-                    if (it != missing_columns.end())
+                    const auto used_it = std::find_if(
+                        used_keys.begin(), used_keys.end(), [&](const auto & used_col) { return used_col == col.name; });
+                    if (missing_it != missing_columns.end())
                    {
                        auto column_with_default = col.column->cloneEmpty();
                        col.type->insertDefaultInto(*column_with_default);
@ -270,7 +273,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                    else
                    {
                        const auto * column_node = dag->getOutputs()[header.getPositionByName(col.name)];
-                        if (group_by_use_nulls && column_node->result_type->canBeInsideNullable())
+                        if (used_it != used_keys.end() && group_by_use_nulls && column_node->result_type->canBeInsideNullable())
                            outputs.push_back(&dag->addFunction(to_nullable_function, { column_node }, col.name));
                        else
                            outputs.push_back(column_node);
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool(
        sum_marks,
        min_marks_for_concurrent_read,
        std::move(parts_with_range),
-        data,
        storage_snapshot,
        prewhere_info,
        required_columns,
--- a/src/Storages/ColumnsDescription.cpp
+++ b/src/Storages/ColumnsDescription.cpp
@ -780,7 +780,7 @@ void ColumnsDescription::addSubcolumns(const String & name_in_storage, const Dat
                "Cannot add subcolumn {}: column with this name already exists", subcolumn.name);

        subcolumns.get<0>().insert(std::move(subcolumn));
-    }, {type_in_storage->getDefaultSerialization(), type_in_storage, nullptr, nullptr});
+    }, ISerialization::SubstreamData(type_in_storage->getDefaultSerialization()).withType(type_in_storage));
 }

 void ColumnsDescription::removeSubcolumns(const String & name_in_storage)
--- a/src/Storages/MergeTree/AlterConversions.h
+++ b/src/Storages/MergeTree/AlterConversions.h
@ -0,0 +1,24 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+
+namespace DB
+{
+
+/// Alter conversions which should be applied on the fly for a part. Built from
+/// the most recent mutation commands for the part. Currently only rename_map is
+/// stored here (from the ALTER_RENAME command), because for all other types of
+/// alters we can deduce the conversions for a part from the difference between
+/// part->getColumns() and storage->getColumns().
+struct AlterConversions
+{
+    /// Rename map new_name -> old_name
+    std::unordered_map<std::string, std::string> rename_map;
+
+    bool isColumnRenamed(const std::string & new_name) const { return rename_map.count(new_name) > 0; }
+    std::string getColumnOldName(const std::string & new_name) const { return rename_map.at(new_name); }
+};
+
+}
--- a/Show More
+++ b/Show More