Merge branch 'master' into no-hardlinks-while-making-backup-of-mergetree-in-atomic-db

2024-11-21 15:12:02 +00:00 · 2022-09-09 14:24:44 +03:00 · 2022-09-09 14:24:44 +03:00 · 48927ba0ac
commit 48927ba0ac
parent 10629a66e5 a713c5ac27
117 changed files with 2554 additions and 629 deletions
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -923,6 +923,53 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1011,6 +1058,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -935,6 +935,51 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush, FastTest, StyleCheck]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1023,6 +1068,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -143,6 +143,8 @@ include (cmake/add_warning.cmake)
 if (COMPILER_CLANG)
    # generate ranges for fast "addr2line" search
    if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
+        # NOTE: clang has a bug: it does not emit .debug_aranges with ThinLTO,
+        # so a custom ld.lld wrapper is shipped in docker images.
        set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges")
    endif ()

--- a/README.md
+++ b/README.md
@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a
 * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any.

 ## Upcoming events
-* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in-person meetup hosted by our friends at Barracuda in the Bay Area.
--- a/cmake/cpu_features.cmake
+++ b/cmake/cpu_features.cmake
@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0)
 option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0)
 option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0)

+# X86: Allow compilation for a SSE2-only target machine. Done by a special build in CI for embedded or very old hardware.
+option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0)
+if (NO_SSE3_OR_HIGHER)
+    SET(ENABLE_SSSE3 0)
+    SET(ENABLE_SSE41 0)
+    SET(ENABLE_SSE42 0)
+    SET(ENABLE_PCLMULQDQ 0)
+    SET(ENABLE_POPCNT 0)
+    SET(ENABLE_AVX 0)
+    SET(ENABLE_AVX2 0)
+    SET(ENABLE_AVX512 0)
+    SET(ENABLE_AVX512_VBMI 0)
+    SET(ENABLE_BMI 0)
+    SET(ENABLE_AVX2_FOR_SPEC_OP 0)
+    SET(ENABLE_AVX512_FOR_SPEC_OP 0)
+endif()
+
 option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)

 if (ARCH_NATIVE)
--- a/cmake/ld.lld.in
+++ b/cmake/ld.lld.in
@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# This is a workaround for a bug in llvm/clang,
+# which does not produce .debug_aranges with LTO
+#
+# NOTE: this is a temporary solution that should be removed once [1] is
+# resolved.
+#
+#   [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8
+
+# NOTE: only -flto=thin is supported.
+# NOTE: it is not possible to check whether -gdwarf-aranges was passed initially or not.
+if [[ "$*" =~ -plugin-opt=thinlto ]]; then
+    exec "@LLD_PATH@" -mllvm -generate-arange-section "$@"
+else
+    exec "@LLD_PATH@" "$@"
+fi
--- a/cmake/split_debug_symbols.cmake
+++ b/cmake/split_debug_symbols.cmake
@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols)
       COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin"
       COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
       # Splits debug symbols into separate file, leaves the binary untouched:
-       COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
+       COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check:
       COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@ -94,8 +94,13 @@ if (LINKER_NAME)
        if (NOT LLD_PATH)
            message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.")
        endif ()
-        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}")
-        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}")
+
+        # This is a temporary quirk to emit .debug_aranges with ThinLTO
+        set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld")
+        configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY)
+
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
+        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
    else ()
        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
--- a/contrib/vectorscan-cmake/CMakeLists.txt
+++ b/contrib/vectorscan-cmake/CMakeLists.txt
@ -1,6 +1,6 @@
 # We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan.

-if (ARCH_AMD64)
+if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER)
    option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES})
 endif()

--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -130,6 +130,7 @@ def parse_env_variables(
    ARM_SUFFIX = "-aarch64"
    FREEBSD_SUFFIX = "-freebsd"
    PPC_SUFFIX = "-ppc64le"
+    AMD64_SSE2_SUFFIX = "-amd64sse2"

    result = []
    result.append("OUTPUT_DIR=/output")
@ -141,6 +142,7 @@ def parse_env_variables(
    is_cross_arm = compiler.endswith(ARM_SUFFIX)
    is_cross_ppc = compiler.endswith(PPC_SUFFIX)
    is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX)
+    is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX)

    if is_cross_darwin:
        cc = compiler[: -len(DARWIN_SUFFIX)]
@ -186,6 +188,10 @@ def parse_env_variables(
        cmake_flags.append(
            "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake"
        )
+    elif is_amd64_sse2:
+        cc = compiler[: -len(AMD64_SSE2_SUFFIX)]
+        result.append("DEB_ARCH=amd64")
+        cmake_flags.append("-DNO_SSE3_OR_HIGHER=1")
    else:
        cc = compiler
        result.append("DEB_ARCH=amd64")
@ -339,6 +345,7 @@ if __name__ == "__main__":
            "clang-14-darwin-aarch64",
            "clang-14-aarch64",
            "clang-14-ppc64le",
+            "clang-14-amd64sse2",
            "clang-14-freebsd",
            "gcc-11",
        ),
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@ -140,6 +140,6 @@ hash cmake

 ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.

-They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request.
+Binaries are built for stable and LTS releases, and also for every commit to `master` and for each pull request.

 To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
--- a/docs/en/getting-started/example-datasets/nypd_complaint_data.md
+++ b/docs/en/getting-started/example-datasets/nypd_complaint_data.md
@ -0,0 +1,654 @@
+---
+slug: /en/getting-started/example-datasets/nypd_complaint_data
+sidebar_label: NYPD Complaint Data
+description: "Ingest and query Tab Separated Value data in 5 steps"
+title: NYPD Complaint Data
+---
+
+Tab separated value, or TSV, files are common and may include field headings as the first line of the file. ClickHouse can ingest TSVs, and also can query TSVs without ingesting the files.  This guide covers both of these cases. If you need to query or ingest CSV files, the same techniques work, simply substitute `TSV` with `CSV` in your format arguments.
+
+While working through this guide you will:
+- **Investigate**: Query the structure and content of the TSV file.
+- **Determine the target ClickHouse schema**: Choose proper data types and map the existing data to those types.
+- **Create a ClickHouse table**.
+- **Preprocess and stream** the data to ClickHouse.
+- **Run some queries** against ClickHouse.
+
+The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly.
+
+**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243)  
+**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page
+
+## Prerequisites
+- Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**.
+- Install [ClickHouse server and client](../../getting-started/install.md).
+- [Launch](../../getting-started/install.md#launch) ClickHouse server, and connect with `clickhouse-client`
+
+### A note about the commands described in this guide
+There are two types of commands in this guide:
+- Some of the commands are querying the TSV files, these are run at the command prompt.
+- The rest of the commands are querying ClickHouse, and these are run in the `clickhouse-client` or Play UI.
+
+:::note
+The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed.
+:::
+
+## Familiarize yourself with the TSV file
+
+Before starting to work with the ClickHouse database, familiarize yourself with the data.
+
+### Look at the fields in the source TSV file
+
+This is an example of a command to query a TSV file, but don't run it yet.
+```sh
+clickhouse-local --query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')"
+```
+
+Sample response
+```response
+CMPLNT_NUM                  Nullable(Float64)					
+ADDR_PCT_CD                 Nullable(Float64)					
+BORO_NM                     Nullable(String)					
+CMPLNT_FR_DT                Nullable(String)					
+CMPLNT_FR_TM                Nullable(String)					
+```
+
+:::tip
+Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples.  This is not always the case.  Because ClickHouse is routinely used with datasets containing billions of records there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric.  By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
+you can get a better idea of the content.
+
+Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
+:::
+
+Run this command at your command prompt.  You will be using `clickhouse-local` to query the data in the TSV file you downloaded.
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" 
+```
+
+Result:
+```response
+CMPLNT_NUM        Nullable(String)					
+ADDR_PCT_CD       Nullable(Float64)					
+BORO_NM           Nullable(String)					
+CMPLNT_FR_DT      Nullable(String)					
+CMPLNT_FR_TM      Nullable(String)					
+CMPLNT_TO_DT      Nullable(String)					
+CMPLNT_TO_TM      Nullable(String)					
+CRM_ATPT_CPTD_CD  Nullable(String)					
+HADEVELOPT        Nullable(String)					
+HOUSING_PSA       Nullable(Float64)					
+JURISDICTION_CODE Nullable(Float64)					
+JURIS_DESC        Nullable(String)					
+KY_CD             Nullable(Float64)					
+LAW_CAT_CD        Nullable(String)					
+LOC_OF_OCCUR_DESC Nullable(String)					
+OFNS_DESC         Nullable(String)					
+PARKS_NM          Nullable(String)					
+PATROL_BORO       Nullable(String)					
+PD_CD             Nullable(Float64)					
+PD_DESC           Nullable(String)					
+PREM_TYP_DESC     Nullable(String)					
+RPT_DT            Nullable(String)					
+STATION_NAME      Nullable(String)					
+SUSP_AGE_GROUP    Nullable(String)					
+SUSP_RACE         Nullable(String)					
+SUSP_SEX          Nullable(String)					
+TRANSIT_DISTRICT  Nullable(Float64)					
+VIC_AGE_GROUP     Nullable(String)					
+VIC_RACE          Nullable(String)					
+VIC_SEX           Nullable(String)					
+X_COORD_CD        Nullable(Float64)					
+Y_COORD_CD        Nullable(Float64)					
+Latitude          Nullable(Float64)					
+Longitude         Nullable(Float64)					
+Lat_Lon           Tuple(Nullable(Float64), Nullable(Float64))					
+New Georeferenced Column Nullable(String)
+```
+
+At this point you should check that the columns in the TSV file match the names and types specified in the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243).  The data types are not very specific, all numeric fields are set to `Nullable(Float64)`, and all other fields are `Nullable(String)`.  When you create a ClickHouse table to store the data you can specify more appropriate and performant types.
+
+### Determine the proper schema
+
+In order to figure out what types should be used for the fields it is necessary to know what the data looks like. For example, the field `JURISDICTION_CODE` is a numeric: should it be a `UInt8`, or an `Enum`, or is `Float64` appropriate?
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select JURISDICTION_CODE, count() FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ GROUP BY JURISDICTION_CODE
+ ORDER BY JURISDICTION_CODE
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─JURISDICTION_CODE─┬─count()─┐
+│                 0 │  188875 │
+│                 1 │    4799 │
+│                 2 │   13833 │
+│                 3 │     656 │
+│                 4 │      51 │
+│                 6 │       5 │
+│                 7 │       2 │
+│                 9 │      13 │
+│                11 │      14 │
+│                12 │       5 │
+│                13 │       2 │
+│                14 │      70 │
+│                15 │      20 │
+│                72 │     159 │
+│                87 │       9 │
+│                88 │      75 │
+│                97 │     405 │
+└───────────────────┴─────────┘
+```
+
+The query response shows that the `JURISDICTION_CODE` fits well in a `UInt8`.
+
+Similarly, look at some of the `String` fields and see if they are well suited to being `DateTime` or [`LowCardinality(String)`](../../sql-reference/data-types/lowcardinality.md) fields.
+
+For example, the field `PARKS_NM` is described as "Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)".  The names of parks in New York City may be a good candidate for a `LowCardinality(String)`:
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select count(distinct PARKS_NM) FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─uniqExact(PARKS_NM)─┐
+│                 319 │
+└─────────────────────┘
+```
+
+Have a look at some of the park names:
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select distinct PARKS_NM FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ LIMIT 10
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─PARKS_NM───────────────────┐
+│ (null)                     │
+│ ASSER LEVY PARK            │
+│ JAMES J WALKER PARK        │
+│ BELT PARKWAY/SHORE PARKWAY │
+│ PROSPECT PARK              │
+│ MONTEFIORE SQUARE          │
+│ SUTTON PLACE PARK          │
+│ JOYCE KILMER PARK          │
+│ ALLEY ATHLETIC PLAYGROUND  │
+│ ASTORIA PARK               │
+└────────────────────────────┘
+```
+
+The dataset in use at the time of writing has only a few hundred distinct parks and playgrounds in the `PARKS_NM` column.  This is a small number based on the [LowCardinality](../../sql-reference/data-types/lowcardinality.md#lowcardinality-dscr) recommendation to stay below 10,000 distinct strings in a `LowCardinality(String)` field.
+
+### DateTime fields
+Based on the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) there are date and time fields for the start and end of the reported event.  Looking at the min and max of the `CMPLNT_FR_DT` and `CMPLNT_TO_DT` gives an idea of whether or not the fields are always populated:
+
+```sh title="CMPLNT_FR_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_DT), max(CMPLNT_FR_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_DT)─┬─max(CMPLNT_FR_DT)─┐
+│ 01/01/1973        │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_DT), max(CMPLNT_TO_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_DT)─┬─max(CMPLNT_TO_DT)─┐
+│                   │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_FR_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_TM), max(CMPLNT_FR_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_TM)─┬─max(CMPLNT_FR_TM)─┐
+│ 00:00:00          │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_TM), max(CMPLNT_TO_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_TM)─┬─max(CMPLNT_TO_TM)─┐
+│ (null)            │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+## Make a plan
+
+Based on the above investigation:
+- `JURISDICTION_CODE` should be cast as `UInt8`.
+- `PARKS_NM` should be cast to `LowCardinality(String)`
+- `CMPLNT_FR_DT` and `CMPLNT_FR_TM` are always populated (possibly with a default time of `00:00:00`)
+- `CMPLNT_TO_DT` and `CMPLNT_TO_TM` may be empty
+- Dates and times are stored in separate fields in the source
+- Dates are `mm/dd/yyyy` format
+- Times are `hh:mm:ss` format
+- Dates and times can be concatenated into DateTime types
+- There are some dates before January 1st 1970, which means we need a 64 bit DateTime
+
+:::note
+There are many more changes to be made to the types, they all can be determined by following the same investigation steps.  Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions.  The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics.
+:::
+
+## Concatenate the date and time fields
+
+To concatenate the date and time fields `CMPLNT_FR_DT` and `CMPLNT_FR_TM` into a single `String` that can be cast to a `DateTime`, select the two fields joined by the concatenation operator: `CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM`.  The `CMPLNT_TO_DT` and `CMPLNT_TO_TM` fields are handled similarly.
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM AS complaint_begin FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+LIMIT 10
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─complaint_begin─────┐
+│ 07/29/2010 00:01:00 │
+│ 12/01/2011 12:00:00 │
+│ 04/01/2017 15:00:00 │
+│ 03/26/2018 17:20:00 │
+│ 01/01/2019 00:00:00 │
+│ 06/14/2019 00:00:00 │
+│ 11/29/2021 20:00:00 │
+│ 12/04/2021 00:35:00 │
+│ 12/05/2021 12:50:00 │
+│ 12/07/2021 20:30:00 │
+└─────────────────────┘
+```
+
+## Convert the date and time String to a DateTime64 type
+
+Earlier in the guide we discovered that there are dates in the TSV file before January 1st 1970, which means that we need a 64 bit DateTime type for the dates.  The dates also need to be converted from `MM/DD/YYYY` to `YYYY/MM/DD` format.  Both of these can be done with [`parseDateTime64BestEffort()`](../../sql-reference/functions/type-conversion-functions.md#parsedatetime64besteffort).
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+      (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+select parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin,
+       parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end
+FROM file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ORDER BY complaint_begin ASC
+LIMIT 25
+FORMAT PrettyCompact"
+```
+
+The `WITH` clause above contains the concatenation from the previous step, and the two expressions in the `select` list parse the strings into `DateTime64`.  As the complaint end time is not guaranteed to exist, `parseDateTime64BestEffortOrNull` is used.
+
+Result:
+```response
+┌─────────complaint_begin─┬───────────complaint_end─┐
+│ 1925-01-01 10:00:00.000 │ 2021-02-12 09:30:00.000 │
+│ 1925-01-01 11:37:00.000 │ 2022-01-16 11:49:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2021-12-31 00:00:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2022-02-02 22:00:00.000 │
+│ 1925-01-01 19:00:00.000 │ 2022-04-14 05:00:00.000 │
+│ 1955-09-01 19:55:00.000 │ 2022-08-01 00:45:00.000 │
+│ 1972-03-17 11:40:00.000 │ 2022-03-17 11:43:00.000 │
+│ 1972-05-23 22:00:00.000 │ 2022-05-24 09:00:00.000 │
+│ 1972-05-30 23:37:00.000 │ 2022-05-30 23:50:00.000 │
+│ 1972-07-04 02:17:00.000 │                    ᴺᵁᴸᴸ │
+│ 1973-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1975-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1976-11-05 00:01:00.000 │ 1988-10-05 23:59:00.000 │
+│ 1977-01-01 00:00:00.000 │ 1977-01-01 23:59:00.000 │
+│ 1977-12-20 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-01-01 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-08-14 00:00:00.000 │ 1987-08-13 23:59:00.000 │
+│ 1983-01-07 00:00:00.000 │ 1990-01-06 00:00:00.000 │
+│ 1984-01-01 00:01:00.000 │ 1984-12-31 23:59:00.000 │
+│ 1985-01-01 12:00:00.000 │ 1987-12-31 15:00:00.000 │
+│ 1985-01-11 09:00:00.000 │ 1985-12-31 12:00:00.000 │
+│ 1986-03-16 00:05:00.000 │ 2022-03-16 00:45:00.000 │
+│ 1987-01-07 00:00:00.000 │ 1987-01-09 00:00:00.000 │
+│ 1988-04-03 18:30:00.000 │ 2022-08-03 09:45:00.000 │
+│ 1988-07-29 12:00:00.000 │ 1990-07-27 22:00:00.000 │
+└─────────────────────────┴─────────────────────────┘
+```
+:::note
+The dates shown as `1925` above are from errors in the data.  There are several records in the original data with dates in the years `1019` - `1022` that should be `2019` - `2022`.  They are being stored as Jan 1st 1925 as that is the earliest date with a 64 bit DateTime.
+:::
+
+## Create a table
+
+The decisions made above on the data types used for the columns are reflected in the table schema
+below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table.  At least one
+of `ORDER BY` or `PRIMARY KEY` must be specified.  Here are some guidelines on deciding on the 
+columns to include in `ORDER BY`, and more information is in the *Next Steps* section at the end
+of this document.
+
+### Order By and Primary Key clauses
+
+- The `ORDER BY` tuple should include fields that are used in query filters
+- To maximize compression on disk the `ORDER BY` tuple should be ordered by ascending cardinality
+- If it exists, the `PRIMARY KEY` tuple must be a subset of the `ORDER BY` tuple
+- If only `ORDER BY` is specified, then the same tuple will be used as `PRIMARY KEY`
+- The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple
+- The `PRIMARY KEY` index is kept in main memory
+
+Looking at the dataset and the questions that might be answered by querying it we might
+decide that we would look at the types of crimes reported over time in the five boroughs of
+New York City.  These fields might be then included in the `ORDER BY`:
+
+| Column      | Description (from the data dictionary)                 |
+| ----------- | ---------------------------------------------------    |
+| OFNS_DESC   | Description of offense corresponding with key code     |
+| RPT_DT      | Date event was reported to police                      |
+| BORO_NM     | The name of the borough in which the incident occurred |
+
+
+Querying the TSV file for the cardinality of the three candidate columns:
+
+```bash
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC,
+        formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT,
+        formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM
+  FROM
+  file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+  FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐
+│ 60.00                 │ 306.00             │ 6.00                │
+└───────────────────────┴────────────────────┴─────────────────────┘
+```
+Ordering by cardinality, the `ORDER BY` becomes:
+
+```
+ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT )
+```
+:::note
+The table below will use more easily read column names; the above names will be mapped to
+```
+ORDER BY ( borough, offense_description, date_reported )
+```
+:::
+
+Putting together the changes to data types and the `ORDER BY` tuple gives this table structure:
+
+```sql
+CREATE TABLE NYPD_Complaint ( 
+    complaint_number     String,
+    precinct             UInt8,
+    borough              LowCardinality(String),
+    complaint_begin      DateTime64(0,'America/New_York'),
+    complaint_end        DateTime64(0,'America/New_York'),
+    was_crime_completed  String,
+    housing_authority    String,
+    housing_level_code   UInt32,
+    jurisdiction_code    UInt8, 
+    jurisdiction         LowCardinality(String),
+    offense_code         UInt8,
+    offense_level        LowCardinality(String),
+    location_descriptor  LowCardinality(String),
+    offense_description  LowCardinality(String),
+    park_name            LowCardinality(String),
+    patrol_borough       LowCardinality(String),
+    PD_CD                UInt16,
+    PD_DESC              String,
+    location_type        LowCardinality(String),
+    date_reported        Date,
+    transit_station      LowCardinality(String),
+    suspect_age_group    LowCardinality(String),
+    suspect_race         LowCardinality(String),
+    suspect_sex          LowCardinality(String),
+    transit_district     UInt8,
+    victim_age_group     LowCardinality(String),
+    victim_race          LowCardinality(String),
+    victim_sex           LowCardinality(String),
+    NY_x_coordinate      UInt32,
+    NY_y_coordinate      UInt32,
+    Latitude             Float64,
+    Longitude            Float64
+) ENGINE = MergeTree
+  ORDER BY ( borough, offense_description, date_reported )
+```
+
+### Finding the primary key of a table
+
+The ClickHouse `system` database, specifically `system.tables`, has all of the information about the table you
+just created.  This query shows the `ORDER BY` (sorting key), and the `PRIMARY KEY`:
+```sql
+SELECT
+    partition_key,
+    sorting_key,
+    primary_key,
+    table
+FROM system.tables
+WHERE table = 'NYPD_Complaint'
+FORMAT Vertical
+```
+
+Response
+```response
+Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01
+
+Row 1:
+──────
+partition_key: 
+sorting_key:   borough, offense_description, date_reported
+primary_key:   borough, offense_description, date_reported
+table:         NYPD_Complaint
+
+1 row in set. Elapsed: 0.001 sec.
+```
+
+## Preprocess and Import Data {#preprocess-import-data}
+
+We will use the `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
+
+### `clickhouse-local` arguments used
+
+:::tip
+`table='input'` appears in the arguments to clickhouse-local below.  clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table.  By default the table is named `table`.  In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`.
+:::
+  
+```bash
+cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \
+  | clickhouse-local --table='input' --input-format='TSVWithNames' \
+  --input_format_max_rows_to_read_for_schema_inference=2000 \
+  --query "
+    WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+     (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+    SELECT
+      CMPLNT_NUM                                  AS complaint_number,
+      ADDR_PCT_CD                                 AS precinct,
+      BORO_NM                                     AS borough,
+      parseDateTime64BestEffort(CMPLNT_START)     AS complaint_begin,
+      parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end,
+      CRM_ATPT_CPTD_CD                            AS was_crime_completed,
+      HADEVELOPT                                  AS housing_authority_development,
+      HOUSING_PSA                                 AS housing_level_code,
+      JURISDICTION_CODE                           AS jurisdiction_code, 
+      JURIS_DESC                                  AS jurisdiction,
+      KY_CD                                       AS offense_code,
+      LAW_CAT_CD                                  AS offense_level,
+      LOC_OF_OCCUR_DESC                           AS location_descriptor,
+      OFNS_DESC                                   AS offense_description, 
+      PARKS_NM                                    AS park_name,
+      PATROL_BORO                                 AS patrol_borough,
+      PD_CD,
+      PD_DESC,
+      PREM_TYP_DESC                               AS location_type,
+      toDate(parseDateTimeBestEffort(RPT_DT))     AS date_reported,
+      STATION_NAME                                AS transit_station,
+      SUSP_AGE_GROUP                              AS suspect_age_group,
+      SUSP_RACE                                   AS suspect_race,
+      SUSP_SEX                                    AS suspect_sex,
+      TRANSIT_DISTRICT                            AS transit_district,
+      VIC_AGE_GROUP                               AS victim_age_group,   
+      VIC_RACE                                    AS victim_race,
+      VIC_SEX                                     AS victim_sex,
+      X_COORD_CD                                  AS NY_x_coordinate,
+      Y_COORD_CD                                  AS NY_y_coordinate,
+      Latitude,
+      Longitude
+    FROM input" \
+  | clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV'
+```  
+
+## Validate the Data {#validate-data}
+
+:::note
+The dataset changes once or more per year, so your counts may not match what is in this document.
+:::
+
+Query:
+
+```sql
+SELECT count()
+FROM NYPD_Complaint
+```
+
+Result:
+
+```text
+┌─count()─┐
+│  208993 │
+└─────────┘
+
+1 row in set. Elapsed: 0.001 sec. 
+```
+
+The size of the dataset in ClickHouse is just 12% of the original TSV file. Compare the size of the original TSV file with the size of the table:
+
+Query:
+
+```sql
+SELECT formatReadableSize(total_bytes)
+FROM system.tables
+WHERE name = 'NYPD_Complaint'
+```
+
+Result:
+```text
+┌─formatReadableSize(total_bytes)─┐
+│ 8.63 MiB                        │
+└─────────────────────────────────┘
+```
+
+
+## Run Some Queries {#run-queries}
+
+### Query 1. Compare the number of complaints by month
+
+Query:
+
+```sql
+SELECT
+    dateName('month', date_reported) AS month,
+    count() AS complaints,
+    bar(complaints, 0, 50000, 80)
+FROM NYPD_Complaint
+GROUP BY month
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 7fbd4244-b32a-4acf-b1f3-c3aa198e74d9
+
+┌─month─────┬─complaints─┬─bar(count(), 0, 50000, 80)───────────────────────────────┐
+│ March     │      34536 │ ███████████████████████████████████████████████████████▎ │
+│ May       │      34250 │ ██████████████████████████████████████████████████████▋  │
+│ April     │      32541 │ ████████████████████████████████████████████████████     │
+│ January   │      30806 │ █████████████████████████████████████████████████▎       │
+│ February  │      28118 │ ████████████████████████████████████████████▊            │
+│ November  │       7474 │ ███████████▊                                             │
+│ December  │       7223 │ ███████████▌                                             │
+│ October   │       7070 │ ███████████▎                                             │
+│ September │       6910 │ ███████████                                              │
+│ August    │       6801 │ ██████████▊                                              │
+│ June      │       6779 │ ██████████▋                                              │
+│ July      │       6485 │ ██████████▍                                              │
+└───────────┴────────────┴──────────────────────────────────────────────────────────┘
+
+12 rows in set. Elapsed: 0.006 sec. Processed 208.99 thousand rows, 417.99 KB (37.48 million rows/s., 74.96 MB/s.)
+```
+
+### Query 2. Compare total number of complaints by Borough
+
+Query:
+
+```sql
+SELECT
+    borough,
+    count() AS complaints,
+    bar(complaints, 0, 125000, 60)
+FROM NYPD_Complaint
+GROUP BY borough
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d
+
+┌─borough───────┬─complaints─┬─bar(count(), 0, 125000, 60)──┐
+│ BROOKLYN      │      57947 │ ███████████████████████████▋ │
+│ MANHATTAN     │      53025 │ █████████████████████████▍   │
+│ QUEENS        │      44875 │ █████████████████████▌       │
+│ BRONX         │      44260 │ █████████████████████▏       │
+│ STATEN ISLAND │       8503 │ ████                         │
+│ (null)        │        383 │ ▏                            │
+└───────────────┴────────────┴──────────────────────────────┘
+
+6 rows in set. Elapsed: 0.008 sec. Processed 208.99 thousand rows, 209.43 KB (27.14 million rows/s., 27.20 MB/s.)
+```
+
+## Next Steps
+
+[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@ -59,7 +59,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).

@ -106,7 +106,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 Then run these commands to install packages:

@ -221,7 +221,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui
    curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse
    ```

-Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
+Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `sudo clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.

 Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data.

--- a/docs/en/operations/backup.md
+++ b/docs/en/operations/backup.md
@ -2,10 +2,9 @@
 slug: /en/operations/backup
 sidebar_position: 49
 sidebar_label: Data backup and restore
+title: Data backup and restore
 ---

-# Data backup and restore
-
 While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

 In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
--- a/docs/en/operations/caches.md
+++ b/docs/en/operations/caches.md
@ -20,6 +20,7 @@ Additional cache types:
 - [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
 - [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
 - Schema inference cache.
+- [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks.

 Indirectly used:

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1176,8 +1176,9 @@ Enables the quorum writes.

 -   If `insert_quorum < 2`, the quorum writes are disabled.
 -   If `insert_quorum >= 2`, the quorum writes are enabled.
+-   If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number.

-Default value: 0.
+Default value: 0 - disabled.

 Quorum writes

@ -1259,7 +1260,7 @@ Possible values:

 Default value: 1.

-By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). 
+By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
 For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
 For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@ -112,6 +112,119 @@ Example of disk configuration:
 </clickhouse>
 ```

+## Using local cache {#using-local-cache}
+
+It is possible to configure local cache over disks in storage configuration starting from version 22.3. For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc. Cache uses `LRU` cache policy.
+
+Example of configuration for versions 22.8 and later:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+            </s3>
+            <cache>
+                <type>cache</type>
+                <disk>s3</disk>
+                <path>/s3_cache/</path>
+                <max_size>10000000</max_size>
+            </cache>
+        </disks>
+    </storage_configuration>
+</clickhouse>
+```
+
+Example of configuration for versions earlier than 22.8:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+                <data_cache_enabled>1</data_cache_enabled>
+                <data_cache_size>10000000</data_cache_size>
+            </s3>
+        </disks>
+    </storage_configuration>
+</clickhouse>
+```
+
+Cache **configuration settings**:
+
+- `path` - path to the directory with cache. Default: None, this setting is obligatory.
+
+- `max_size` - maximum size of the cache in bytes. When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory.
+
+- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled).
+
+- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
+
+- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, i.e. the data is cached at the first attempt to read it.
+
+- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`.
+
+- `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 MiB).
+
+- `max_elements` - a limit for a number of cache files. Default: `1048576`.
+
+Cache **query settings**:
+
+- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`.
+
+- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`.
+
+- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on.
+
+- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`.
+
+- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`.
+
+- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written at the cost of evicting previously downloaded (within current query) data, i.e. the second behaviour allows to preserve `least recently used` behaviour while keeping query cache limit.
+
+**Warning**
+Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported.
+
+Cache **system tables**:
+
+- `system.filesystem_cache` - system table which shows the current state of the cache.
+
+- `system.filesystem_cache_log` - system table which shows detailed cache usage per query. Requires `enable_filesystem_cache_log` setting to be `true`.
+
+Cache **commands**:
+
+- `SYSTEM DROP FILESYSTEM CACHE (<path>) (ON CLUSTER)`
+
+- `SHOW CACHES` - show list of caches which were configured on the server.
+
+- `DESCRIBE CACHE '<cache_name>'` - show cache configuration and some general statistics for a specific cache. Cache name can be taken from `SHOW CACHES` command.
+
+Cache current metrics:
+
+- `FilesystemCacheSize`
+
+- `FilesystemCacheElements`
+
+Cache asynchronous metrics:
+
+- `FilesystemCacheBytes`
+
+- `FilesystemCacheFiles`
+
+Cache profile events:
+
+- `CachedReadBufferReadFromSourceBytes`, `CachedReadBufferReadFromCacheBytes`
+
+- `CachedReadBufferReadFromSourceMicroseconds`, `CachedReadBufferReadFromCacheMicroseconds`
+
+- `CachedReadBufferCacheWriteBytes`, `CachedReadBufferCacheWriteMicroseconds`
+
+- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
+
 ## Storing Data on Web Server {#storing-data-on-webserver}

 There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
--- a/docs/en/operations/tips.md
+++ b/docs/en/operations/tips.md
@ -74,13 +74,16 @@ Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is ena

 ## File System {#file-system}

-Ext4 is the most reliable option. Set the mount options `noatime`.
-XFS should be avoided. It works mostly fine but there are some reports about lower performance.
+Ext4 is the most reliable option. Set the mount options `noatime`. XFS works well too.
 Most other file systems should also work fine.

+FAT-32 and exFAT are not supported due to lack of hard links.
+
 Do not use compressed filesystems, because ClickHouse does compression on its own and better.
 It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better.

+While ClickHouse can work over NFS, it is not the best idea.
+
 ## Linux Kernel {#linux-kernel}

 Don’t use an outdated Linux kernel.
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@ -640,7 +640,8 @@ Result:

 ## date\_diff

-Returns the difference between two dates or dates with time values.
+Returns the difference between two dates or dates with time values. 
+The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)).

 **Syntax**

@ -692,6 +693,25 @@ Result:
 └────────────────────────────────────────────────────────────────────────────────────────┘
 ```

+Query:
+
+``` sql
+SELECT
+    toDate('2022-01-01') AS e,
+    toDate('2021-12-29') AS s,
+    dateDiff('day', s, e) AS day_diff,
+    dateDiff('month', s, e) AS month__diff,
+    dateDiff('year', s, e) AS year_diff;
+```
+
+Result:
+
+``` text
+┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐
+│ 2022-01-01 │ 2021-12-29 │        3 │           1 │         1 │
+└────────────┴────────────┴──────────┴─────────────┴───────────┘
+```
+
 ## date\_sub

 Subtracts the time interval or date interval from the provided date or date with time.
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -42,6 +42,14 @@ endif ()
 # See `src/Common/TargetSpecific.h`
 option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)

+if (NO_SSE3_OR_HIGHER)
+    # Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a
+    # result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds
+    # because generic code is compiled with SSE 4.2 anyway. But it breaks SSE2-only builds. Therefore, disable the multitarget code
+    # machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE but that impacts performance too much.)
+    set(ENABLE_MULTITARGET_CODE OFF)
+endif()
+
 if (ENABLE_MULTITARGET_CODE)
    add_definitions(-DENABLE_MULTITARGET_CODE=1)
 else()
--- a/src/Client/ConnectionEstablisher.cpp
+++ b/src/Client/ConnectionEstablisher.cpp
@ -16,6 +16,7 @@ namespace ErrorCodes
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int NETWORK_ERROR;
    extern const int SOCKET_TIMEOUT;
+    extern const int DNS_ERROR;
 }

 ConnectionEstablisher::ConnectionEstablisher(
@ -90,6 +91,7 @@ void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
+            && e.code() != ErrorCodes::DNS_ERROR
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;

--- a/src/Common/Config/ConfigProcessor.cpp
+++ b/src/Common/Config/ConfigProcessor.cpp
@ -13,6 +13,7 @@
 #include <Poco/DOM/Text.h>
 #include <Poco/DOM/Attr.h>
 #include <Poco/DOM/Comment.h>
+#include <Poco/XML/XMLWriter.h>
 #include <Poco/Util/XMLConfiguration.h>
 #include <Common/ZooKeeper/ZooKeeperNodeCache.h>
 #include <Common/ZooKeeper/KeeperException.h>
@ -729,7 +730,11 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config,
            if (!preprocessed_path_parent.empty())
                fs::create_directories(preprocessed_path_parent);
        }
-        DOMWriter().writeNode(preprocessed_path, loaded_config.preprocessed_xml);
+        DOMWriter writer;
+        writer.setNewLine("\n");
+        writer.setIndent("    ");
+        writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT);
+        writer.writeNode(preprocessed_path, loaded_config.preprocessed_xml);
        LOG_DEBUG(log, "Saved preprocessed configuration to '{}'.", preprocessed_path);
    }
    catch (Poco::Exception & e)
--- a/src/Common/Config/YAMLParser.cpp
+++ b/src/Common/Config/YAMLParser.cpp
@ -26,114 +26,107 @@ namespace ErrorCodes
    extern const int CANNOT_PARSE_YAML;
 }

-/// A prefix symbol in yaml key
-/// We add attributes to nodes by using a prefix symbol in the key part.
-/// Currently we use @ as a prefix symbol. Note, that @ is reserved
-/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
-const char YAML_ATTRIBUTE_PREFIX = '@';
-
 namespace
 {
+    /// A prefix symbol in yaml key
+    /// We add attributes to nodes by using a prefix symbol in the key part.
+    /// Currently we use @ as a prefix symbol. Note, that @ is reserved
+    /// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
+    const char YAML_ATTRIBUTE_PREFIX = '@';

-Poco::AutoPtr<Poco::XML::Element> createCloneNode(Poco::XML::Element & original_node)
-{
-    Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
-    original_node.parentNode()->appendChild(clone_node);
-    return clone_node;
-}
-
-void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_element)
-{
-    auto * xml_document = parent_xml_element.ownerDocument();
-    switch (node.Type())
+    Poco::AutoPtr<Poco::XML::Element> cloneXMLNode(const Poco::XML::Element & original_node)
    {
-        case YAML::NodeType::Scalar:
-        {
-            std::string value = node.as<std::string>();
-            Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
-            parent_xml_element.appendChild(xml_value);
-            break;
-        }
+        Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
+        original_node.parentNode()->appendChild(clone_node);
+        return clone_node;
+    }

-        /// We process YAML Sequences as a
-        /// list of <key>value</key> tags with same key and different values.
-        /// For example, we translate this sequence
-        /// seq:
-        ///     - val1
-        ///     - val2
-        ///
-        /// into this:
-        /// <seq>val1</seq>
-        /// <seq>val2</seq>
-        case YAML::NodeType::Sequence:
+    void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_node)
+    {
+        auto * xml_document = parent_xml_node.ownerDocument();
+        switch (node.Type())
        {
-            for (const auto & child_node : node)
-                /// For sequences it depends how we want to process them.
-                /// Sequences of key-value pairs such as:
-                /// seq:
-                ///     - k1: val1
-                ///     - k2: val2
-                /// into xml like this:
-                /// <seq>
-                ///     <k1>val1</k1>
-                ///     <k2>val2</k2>
-                /// </seq>
-                ///
-                /// But, if the sequence is just a list, the root-node needs to be repeated, such as:
-                /// seq:
-                ///     - val1
-                ///     - val2
-                /// into xml like this:
-                /// <seq>val1</seq>
-                /// <seq>val2</seq>
-                ///
-                /// Therefore check what type the child is, for further processing.
-                /// Mixing types (values list or map) will lead to strange results but should not happen.
-                if (parent_xml_element.hasChildNodes() && !child_node.IsMap())
-                {
-                    /// Create a new parent node with same tag for each child node
-                    processNode(child_node, *createCloneNode(parent_xml_element));
-                }
-                else
-                {
-                    /// Map, so don't recreate the parent node but add directly
-                    processNode(child_node, parent_xml_element);
-                }
-            break;
-        }
-        case YAML::NodeType::Map:
-        {
-            for (const auto & key_value_pair : node)
+            case YAML::NodeType::Scalar:
            {
-                const auto & key_node = key_value_pair.first;
-                const auto & value_node = key_value_pair.second;
-                std::string key = key_node.as<std::string>();
-                bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
-                if (is_attribute)
-                {
-                    /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
-                    auto attribute_name = key.substr(1);
-                    std::string value = value_node.as<std::string>();
-                    parent_xml_element.setAttribute(attribute_name, value);
-                }
-                else
-                {
-                    Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
-                    parent_xml_element.appendChild(xml_key);
-                    processNode(value_node, *xml_key);
-                }
+                std::string value = node.as<std::string>();
+                Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
+                parent_xml_node.appendChild(xml_value);
+                break;
+            }
+
+            /// For sequences we repeat the parent xml node. For example,
+            /// seq:
+            ///     - val1
+            ///     - val2
+            /// is converted into the following xml:
+            /// <seq>val1</seq>
+            /// <seq>val2</seq>
+            ///
+            /// A sequence of mappings is converted in the same way:
+            /// seq:
+            ///     - k1: val1
+            ///       k2: val2
+            ///     - k3: val3
+            /// is converted into the following xml:
+            /// <seq><k1>val1</k1><k2>val2</k2></seq>
+            /// <seq><k3>val3</k3></seq>
+            case YAML::NodeType::Sequence:
+            {
+                size_t i = 0;
+                for (auto it = node.begin(); it != node.end(); ++it, ++i)
+                {
+                    const auto & child_node = *it;
+
+                    bool need_clone_parent_xml_node = (i > 0);
+
+                    if (need_clone_parent_xml_node)
+                    {
+                        /// Create a new parent node with same tag for each child node
+                        processNode(child_node, *cloneXMLNode(parent_xml_node));
+                    }
+                    else
+                    {
+                        /// First element of the sequence: reuse the existing parent node instead of cloning it
+                        processNode(child_node, parent_xml_node);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Map:
+            {
+                for (const auto & key_value_pair : node)
+                {
+                    const auto & key_node = key_value_pair.first;
+                    const auto & value_node = key_value_pair.second;
+                    std::string key = key_node.as<std::string>();
+                    bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
+                    if (is_attribute)
+                    {
+                        /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
+                        auto attribute_name = key.substr(1);
+                        std::string value = value_node.as<std::string>();
+                        parent_xml_node.setAttribute(attribute_name, value);
+                    }
+                    else
+                    {
+                        Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
+                        parent_xml_node.appendChild(xml_key);
+                        processNode(value_node, *xml_key);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Null: break;
+            case YAML::NodeType::Undefined:
+            {
+                throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
            }
-            break;
-        }
-        case YAML::NodeType::Null: break;
-        case YAML::NodeType::Undefined:
-        {
-            throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
        }
    }
 }

-}

 Poco::AutoPtr<Poco::XML::Document> YAMLParser::parse(const String& path)
 {
--- a/src/Common/Elf.cpp
+++ b/src/Common/Elf.cpp
@ -22,13 +22,13 @@ Elf::Elf(const std::string & path)
    /// Check if it's an elf.
    elf_size = in.buffer().size();
    if (elf_size < sizeof(ElfEhdr))
-        throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path);

    mapped = in.buffer().begin();
    header = reinterpret_cast<const ElfEhdr *>(mapped);

    if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0)
-        throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path);

    /// Get section header.
    ElfOff section_header_offset = header->e_shoff;
@ -37,7 +37,7 @@ Elf::Elf(const std::string & path)
    if (!section_header_offset
        || !section_header_num_entries
        || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size)
-        throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path);

    section_headers = reinterpret_cast<const ElfShdr *>(mapped + section_header_offset);

@ -48,11 +48,11 @@ Elf::Elf(const std::string & path)
    });

    if (!section_names_strtab)
-        throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path);

    ElfOff section_names_offset = section_names_strtab->header.sh_offset;
    if (section_names_offset >= elf_size)
-        throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path);

    section_names = reinterpret_cast<const char *>(mapped + section_names_offset);

@ -64,7 +64,7 @@ Elf::Elf(const std::string & path)
    if (!program_header_offset
        || !program_header_num_entries
        || program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size)
-        throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path);

    program_headers = reinterpret_cast<const ElfPhdr *>(mapped + program_header_offset);
 }
--- a/src/Common/FieldVisitorToString.cpp
+++ b/src/Common/FieldVisitorToString.cpp
@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const

 }

+String convertFieldToString(const Field & field)
+{
+    /// A String value is returned as is (without quotes); any other type is rendered by the visitor.
+    const bool is_string = field.getType() == Field::Types::Which::String;
+    return is_string ? field.get<String>() : applyVisitor(FieldVisitorToString(), field);
 }

+}
--- a/src/Common/FieldVisitorToString.h
+++ b/src/Common/FieldVisitorToString.h
@ -31,5 +31,8 @@ public:
    String operator() (const bool & x) const;
 };

-}
+/// Get value from field and convert it to string.
+/// Also remove quotes from strings.
+String convertFieldToString(const Field & field);

+}
--- a/src/Common/OpenTelemetryTraceContext.cpp
+++ b/src/Common/OpenTelemetryTraceContext.cpp
@ -88,7 +88,13 @@ void Span::addAttribute(std::exception_ptr e) noexcept

 SpanHolder::SpanHolder(std::string_view _operation_name)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    if (!current_thread_trace_context.isTraceEnabled())
+    {
+        return;
+    }
+
+    /// Use try-catch to make sure the ctor is exception safe.
+    try
    {
        this->trace_id = current_thread_trace_context.trace_id;
        this->parent_span_id = current_thread_trace_context.span_id;
@ -97,9 +103,19 @@ SpanHolder::SpanHolder(std::string_view _operation_name)
        this->start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        // set current span id to this
-        current_thread_trace_context.span_id = this->span_id;
+        /// Add new initialization here
    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the span won't be recorded.
+        this->trace_id = UUID();
+        return;
+    }
+
+    /// Set current span as parent of other spans created later on this thread.
+    current_thread_trace_context.span_id = this->span_id;
 }

 void SpanHolder::finish() noexcept
@ -216,7 +232,7 @@ const TracingContextOnThread & CurrentContext()
    return current_thread_trace_context;
 }

-void TracingContextOnThread::reset()
+void TracingContextOnThread::reset() noexcept
 {
    this->trace_id = UUID();
    this->span_id = 0;
@ -231,63 +247,75 @@ TracingContextHolder::TracingContextHolder(
    const Settings * settings_ptr,
    const std::weak_ptr<OpenTelemetrySpanLog> & _span_log)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    /// Use try-catch to make sure the ctor is exception safe.
+    /// If any exception is raised during the construction, the tracing is not enabled on current thread.
+    try
    {
-        ///
-        /// This is not the normal case,
-        /// it means that construction of current object is not at the start of current thread.
-        /// Usually this is due to:
-        ///    1. bad design
-        ///    2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
-        ///
-        /// In such case, we should use current context as parent of this new constructing object,
-        /// So this branch ensures this class can be instantiated multiple times on one same thread safely.
-        ///
-        this->is_context_owner = false;
-        this->root_span.trace_id = current_thread_trace_context.trace_id;
-        this->root_span.parent_span_id = current_thread_trace_context.span_id;
+        if (current_thread_trace_context.isTraceEnabled())
+        {
+            ///
+            /// This is not the normal case:
+            /// it means that this object is not being constructed at the start of the current thread.
+            /// Usually this is due to:
+            ///    1. bad design
+            ///    2. a correct design whose code has since changed, so that the point where this object is constructed is no longer where the thread starts executing
+            ///
+            /// In such a case, we should use the current context as the parent of the object being constructed,
+            /// so this branch ensures that this class can be safely instantiated multiple times on the same thread.
+            ///
+            this->is_context_owner = false;
+            this->root_span.trace_id = current_thread_trace_context.trace_id;
+            this->root_span.parent_span_id = current_thread_trace_context.span_id;
+            this->root_span.span_id = thread_local_rng();
+            this->root_span.operation_name = _operation_name;
+            this->root_span.start_time_us
+                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+
+            /// Set the root span as parent of other spans created on current thread
+            current_thread_trace_context.span_id = this->root_span.span_id;
+            return;
+        }
+
+        if (!_parent_trace_context.isTraceEnabled())
+        {
+            if (settings_ptr == nullptr)
+                /// Skip tracing context initialization on current thread
+                return;
+
+            // Start the trace with some configurable probability.
+            std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
+            if (!should_start_trace(thread_local_rng))
+                /// skip tracing context initialization on current thread
+                return;
+
+            while (_parent_trace_context.trace_id == UUID())
+            {
+                // Make sure the random generated trace_id is not 0 which is an invalid id.
+                _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
+                _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
+            }
+            _parent_trace_context.span_id = 0;
+        }
+
+        this->root_span.trace_id = _parent_trace_context.trace_id;
+        this->root_span.parent_span_id = _parent_trace_context.span_id;
        this->root_span.span_id = thread_local_rng();
        this->root_span.operation_name = _operation_name;
        this->root_span.start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        current_thread_trace_context.span_id = this->root_span.span_id;
+        /// Add new initialization here
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the tracing is not enabled.
+        this->root_span.trace_id = UUID();
        return;
    }

-    if (!_parent_trace_context.isTraceEnabled())
-    {
-        if (settings_ptr == nullptr)
-            /// skip tracing context initialization on current thread
-            return;
-
-        // start the trace ourselves, with some configurable probability.
-        std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
-        if (!should_start_trace(thread_local_rng))
-            /// skip tracing context initialization on current thread
-            return;
-
-        while (_parent_trace_context.trace_id == UUID())
-        {
-            // make sure the random generated trace_id is not 0 which is an invalid id
-            _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
-            _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
-        }
-        _parent_trace_context.span_id = 0;
-    }
-
-    this->root_span.trace_id = _parent_trace_context.trace_id;
-    this->root_span.parent_span_id = _parent_trace_context.span_id;
-    this->root_span.span_id = thread_local_rng();
-    this->root_span.operation_name = _operation_name;
-    this->root_span.start_time_us
-        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-
-    /// This object is created to initialize tracing context on a new thread,
-    /// it's helpful to record the thread_id so that we know the thread switching from the span log
-    this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
-
-    /// set up trace context on current thread
+    /// Set up trace context on current thread only when the root span is successfully initialized.
    current_thread_trace_context = _parent_trace_context;
    current_thread_trace_context.span_id = this->root_span.span_id;
    current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED;
@ -306,6 +334,18 @@ TracingContextHolder::~TracingContextHolder()
        auto shared_span_log = current_thread_trace_context.span_log.lock();
        if (shared_span_log)
        {
+            try
+            {
+                /// This object is created to initialize tracing context on a new thread,
+                /// it's helpful to record the thread_id so that we know the thread switching from the span log
+                this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
+            }
+            catch (...)
+            {
+                /// It's acceptable that the attribute is not recorded in case of any exception,
+                /// so the exception is ignored to try to log the span.
+            }
+
            this->root_span.finish_time_us
                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

--- a/src/Common/OpenTelemetryTraceContext.h
+++ b/src/Common/OpenTelemetryTraceContext.h
@ -74,7 +74,7 @@ struct TracingContextOnThread : TracingContext
        return *this;
    }

-    void reset();
+    void reset() noexcept;

    /// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table
    /// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak
--- a/src/Common/SymbolIndex.cpp
+++ b/src/Common/SymbolIndex.cpp
@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl
 It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler.
 It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes)
 it is split to separate binary and provided in clickhouse-common-static-dbg package.
-This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line.
+This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line.
 When you build ClickHouse by yourself, debug info is not split and present in a single huge binary.

 What ClickHouse is using to provide good stack traces?
@ -391,10 +391,22 @@ void collectSymbolsFromELF(
    std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem();
    local_debug_info_path += ".debug";
    std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
+    debug_info_path += ".debug";

-    if (std::filesystem::exists(local_debug_info_path))
+    /// NOTE: This is a workaround for the current packaging system.
+    ///
+    /// nfpm cannot copy a file only if it exists, so cmake creates an empty .debug file instead.
+    /// Loading an empty file would make Elf::Elf throw CANNOT_PARSE_ELF, so it is treated as missing.
+    /// Filesystem errors (e.g. no permissions) are treated the same way: the lookup is best effort.
+    auto exists_not_empty = [](const std::filesystem::path & path)
+    {
+        std::error_code ec;
+        return std::filesystem::exists(path, ec) && !std::filesystem::is_empty(path, ec) && !ec;
+    };
+
+    if (exists_not_empty(local_debug_info_path))
        object_name = local_debug_info_path;
-    else if (std::filesystem::exists(debug_info_path))
+    else if (exists_not_empty(debug_info_path))
        object_name = debug_info_path;
    else if (build_id.size() >= 2)
    {
@ -412,7 +424,7 @@ void collectSymbolsFromELF(

        std::filesystem::path build_id_debug_info_path(
            fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2)));
-        if (std::filesystem::exists(build_id_debug_info_path))
+        if (exists_not_empty(build_id_debug_info_path))
            object_name = build_id_debug_info_path;
        else
            object_name = canonical_path;
--- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp
@ -898,4 +898,25 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory()
    registerZooKeeperRequest<OpNum::FilteredList, ZooKeeperFilteredListRequest>(*this);
 }

+PathMatchResult matchPath(std::string_view path, std::string_view match_to)
+{
+    using enum PathMatchResult;
+
+    /// A single trailing slash is insignificant on either side.
+    if (path.ends_with('/'))
+        path.remove_suffix(1);
+    if (match_to.ends_with('/'))
+        match_to.remove_suffix(1);
+
+    /// `match_to` must be a complete prefix of `path`, otherwise the paths are unrelated.
+    if (!path.starts_with(match_to))
+        return NOT_MATCH;
+    if (path.size() == match_to.size())
+        return EXACT;
+
+    /// The remainder has to start a new path component: "/keeper1" is not a child of "/keeper".
+    const bool at_component_boundary = path[match_to.size()] == '/';
+    return at_component_boundary ? IS_CHILD : NOT_MATCH;
+}
+
 }
--- a/src/Common/ZooKeeper/ZooKeeperCommon.h
+++ b/src/Common/ZooKeeper/ZooKeeperCommon.h
@ -554,4 +554,13 @@ private:
    ZooKeeperRequestFactory();
 };

+enum class PathMatchResult
+{
+    NOT_MATCH, /// `match_to` is not a prefix of `path`, or the prefix ends in the middle of a path component
+    EXACT, /// both paths are equal (one trailing '/' on either of them is ignored)
+    IS_CHILD /// `path` is located under `match_to`
+};
+
+PathMatchResult matchPath(std::string_view path, std::string_view match_to);
+
 }
--- a/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp
+++ b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp
@ -0,0 +1,21 @
+#include <gtest/gtest.h>
+
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
+
+/// Checks how matchPath classifies a path against a base path; one trailing slash on either side is ignored.
+TEST(ZooKeeperTest, TestMatchPath)
+{
+    using namespace Coordination;
+
+    ASSERT_EQ(matchPath("/path/file", "/path"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/path/file", "/path/"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/path/file", "/"), PathMatchResult::IS_CHILD);
+    ASSERT_EQ(matchPath("/", "/"), PathMatchResult::EXACT);
+    ASSERT_EQ(matchPath("/path", "/path/"), PathMatchResult::EXACT);
+    ASSERT_EQ(matchPath("/path/", "/path"), PathMatchResult::EXACT);
+
+    /// Sharing a textual prefix is not enough: the match has to end on a path component boundary.
+    ASSERT_EQ(matchPath("/path1", "/path"), PathMatchResult::NOT_MATCH);
+    ASSERT_EQ(matchPath("/pat", "/path"), PathMatchResult::NOT_MATCH);
+    ASSERT_EQ(matchPath("/other/file", "/path"), PathMatchResult::NOT_MATCH);
+}
--- a/src/Common/tests/gtest_merge_configs.cpp
+++ b/src/Common/tests/gtest_merge_configs.cpp
@ -43,11 +43,8 @@ clickhouse:
    text_log:
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
@ -112,11 +109,8 @@ clickhouse:
    text_log :
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
--- a/src/Common/tests/gtest_yaml_parser.cpp
+++ b/src/Common/tests/gtest_yaml_parser.cpp
@ -13,40 +13,12 @@

 using namespace DB;

-TEST(Common, YamlParserInvalidFile)
+TEST(YamlParser, InvalidFile)
 {
    ASSERT_THROW(YAMLParser::parse("some-non-existing-file.yaml"), Exception);
 }

-TEST(Common, YamlParserProcessKeysList)
-{
-    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
-operator:
-    access_management: "1"
-    networks:
-      - ip: "10.1.6.168"
-      - ip: "::1"
-      - ip: "127.0.0.1"
-)YAML");
-    SCOPE_EXIT({ yaml_file->remove(); });
-
-    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
-    auto *p_node = xml->getNodeByPath("/clickhouse");
-    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
-<operator>
-<access_management>1</access_management>
-<networks>
-<ip>10.1.6.168</ip>
-<ip>::1</ip>
-<ip>127.0.0.1</ip>
-</networks>
-</operator>
-</clickhouse>
-)CONFIG");
-
-}
-
-TEST(Common, YamlParserProcessValuesList)
+TEST(YamlParser, ProcessValuesList)
 {
    auto yaml_file = getFileWithContents("values-list.yaml", R"YAML(
 rules:
@ -75,4 +47,141 @@ rules:
 )CONFIG");

 }
+
+/// A sequence of scalars under a key (`ip`) is expected to become repeated <ip> elements inside <networks>.
+TEST(YamlParser, ProcessKeysList)
+{
+    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
+operator:
+    access_management: 1
+    networks:
+        ip:
+          - 10.1.6.168
+          - ::1
+          - 127.0.0.1
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<operator>
+<access_management>1</access_management>
+<networks>
+<ip>10.1.6.168</ip>
+<ip>::1</ip>
+<ip>127.0.0.1</ip>
+</networks>
+</operator>
+</clickhouse>
+)CONFIG");
+}
+
+/// Every item of `seq` is expected to become its own <seq> element; scalar "@"-prefixed keys become its attributes.
+TEST(YamlParser, ProcessListAttributes)
+{
+    auto yaml_file = getFileWithContents("list_attributes.yaml", R"YAML(
+seq:
+  - "@attr1": x
+  - k1: val1
+    k2: val2
+    "@attr2": y
+  - k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<seq attr1="x"></seq>
+<seq attr2="y">
+<k1>val1</k1>
+<k2>val2</k2>
+</seq>
+<seq attr3="z">
+<k3>val3</k3>
+</seq>
+</clickhouse>
+)CONFIG");
+}
+
+/// Scalar "@"-prefixed keys of a map are expected to become attributes of <map>, wherever they appear among the other keys.
+TEST(YamlParser, ProcessMapAttributes)
+{
+    auto yaml_file = getFileWithContents("map_attributes.yaml", R"YAML(
+map:
+    "@attr1": x
+    k1: val1
+    k2: val2
+    "@attr2": y
+    k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<map attr1="x" attr2="y" attr3="z">
+<k1>val1</k1>
+<k2>val2</k2>
+<k3>val3</k3>
+</map>
+</clickhouse>
+)CONFIG");
+}
+
+/// Nested sequences of maps (shards with replicas) are expected to become repeated <shard> and <replica> elements.
+TEST(YamlParser, ClusterDef)
+{
+    auto yaml_file = getFileWithContents("cluster_def.yaml", R"YAML(
+test_cluster:
+    shard:
+        - internal_replication: false
+          replica:
+              - host: 127.0.0.1
+                port: 9000
+              - host: 127.0.0.2
+                port: 9000
+        - internal_replication: true
+          replica:
+              - host: 127.0.0.3
+                port: 9000
+              - host: 127.0.0.4
+                port: 9000
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<test_cluster>
+<shard>
+<internal_replication>false</internal_replication>
+<replica>
+<host>127.0.0.1</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.2</host>
+<port>9000</port>
+</replica>
+</shard>
+<shard>
+<internal_replication>true</internal_replication>
+<replica>
+<host>127.0.0.3</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.4</host>
+<port>9000</port>
+</replica>
+</shard>
+</test_cluster>
+</clickhouse>
+)CONFIG");
+}
+
 #endif
--- a/src/Coordination/KeeperSnapshotManager.cpp
+++ b/src/Coordination/KeeperSnapshotManager.cpp
@ -13,8 +13,10 @@
 #include <filesystem>
 #include <memory>
 #include <Common/logger_useful.h>
-#include "Coordination/KeeperContext.h"
+#include <Coordination/KeeperContext.h>
 #include <Coordination/KeeperConstants.h>
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
+

 namespace DB
 {
@ -146,33 +148,6 @@ namespace
    }
 }

-namespace
-{
-
-enum class PathMatchResult
-{
-    NOT_MATCH,
-    EXACT,
-    IS_CHILD
-};
-
-PathMatchResult matchPath(const std::string_view path, const std::string_view match_to)
-{
-    using enum PathMatchResult;
-
-    auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end());
-
-    if (second_it != match_to.end())
-        return NOT_MATCH;
-
-    if (first_it == path.end())
-        return EXACT;
-
-    return *first_it == '/' ? IS_CHILD : NOT_MATCH;
-}
-
-}
-
 void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
 {
    writeBinary(static_cast<uint8_t>(snapshot.version), out);
@ -217,7 +192,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
        const auto & path = it->key;

        // write only the root system path because of digest
-        if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD)
+        if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD)
        {
            ++it;
            continue;
@ -365,8 +340,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
        KeeperStorage::Node node{};
        readNode(node, in, current_version, storage.acl_map);

-        using enum PathMatchResult;
-        auto match_result = matchPath(path, keeper_system_path);
+        using enum Coordination::PathMatchResult;
+        auto match_result = Coordination::matchPath(path, keeper_system_path);

        const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path);
        if (match_result == IS_CHILD)
--- a/src/Coordination/KeeperStorage.cpp
+++ b/src/Coordination/KeeperStorage.cpp
@ -879,7 +879,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr
            path_created += seq_num_str.str();
        }

-        if (path_created.starts_with(keeper_system_path))
+        if (Coordination::matchPath(path_created, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created);

@ -1049,7 +1049,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr

        std::vector<KeeperStorage::Delta> new_deltas;

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path);

@ -1203,7 +1203,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce

        std::vector<KeeperStorage::Delta> new_deltas;

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);

@ -1472,7 +1472,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr
    {
        Coordination::ZooKeeperSetACLRequest & request = dynamic_cast<Coordination::ZooKeeperSetACLRequest &>(*zk_request);

-        if (request.path.starts_with(keeper_system_path))
+        if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH)
        {
            auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path);

--- a/src/Coordination/tests/gtest_coordination.cpp
+++ b/src/Coordination/tests/gtest_coordination.cpp
@ -2141,6 +2141,38 @@ TEST_P(CoordinationTest, TestCurrentApiVersion)
    EXPECT_EQ(keeper_version, static_cast<uint8_t>(current_keeper_api_version));
 }

+TEST_P(CoordinationTest, TestSystemNodeModify)
+{
+    using namespace Coordination;
+    int64_t zxid{0};
+
+    // Use the RUNNING phase: on INIT we abort when a system path is modified
+    keeper_context->server_state = KeeperContext::Phase::RUNNING;
+    KeeperStorage storage{500, "", keeper_context};
+    const auto assert_create = [&](const std::string_view path, const auto expected_code)
+    {
+        auto request = std::make_shared<ZooKeeperCreateRequest>();
+        request->path = path;
+        storage.preprocessRequest(request, 0, 0, zxid);
+        auto responses = storage.processRequest(request, 0, zxid);
+        ASSERT_FALSE(responses.empty());
+
+        const auto & response = responses[0];
+        ASSERT_EQ(response.response->error, expected_code) << "Unexpected error for path " << path;
+
+        ++zxid;
+    };
+
+    assert_create("/keeper", Error::ZBADARGUMENTS);
+    assert_create("/keeper/with_child", Error::ZBADARGUMENTS);
+    assert_create(DB::keeper_api_version_path, Error::ZBADARGUMENTS);
+    // Paths that only share a textual prefix with "/keeper" are expected to be created as ordinary nodes
+    assert_create("/keeper_map", Error::ZOK);
+    assert_create("/keeper1", Error::ZOK);
+    assert_create("/keepe", Error::ZOK);
+    assert_create("/keeper1/test", Error::ZOK);
+}
+
 INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite,
    CoordinationTest,
    ::testing::ValuesIn(std::initializer_list<CompressionParam>{
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -213,7 +213,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    \
    M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \
    \
-    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \
+    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \
    M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \
    M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \
    M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \
--- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
+++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
@ -44,15 +44,6 @@ struct AttributeConfiguration

 using AttributeNameToConfiguration = std::unordered_map<std::string, AttributeConfiguration>;

-/// Get value from field and convert it to string.
-/// Also remove quotes from strings.
-String getFieldAsString(const Field & field)
-{
-    if (field.getType() == Field::Types::Which::String)
-        return field.get<String>();
-    return applyVisitor(FieldVisitorToString(), field);
-}
-
 String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr)
 {
    if (!dict_attr->expression)
@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att
    /// EXPRESSION PROPERTY should be expression or string
    String expression_str;
    if (const auto * literal = dict_attr->expression->as<ASTLiteral>(); literal && literal->value.getType() == Field::Types::String)
-        expression_str = getFieldAsString(literal->value);
+        expression_str = convertFieldToString(literal->value);
    else
        expression_str = queryToString(dict_attr->expression);

@ -275,7 +266,7 @@ void buildSingleAttribute(
    AutoPtr<Element> null_value_element(doc->createElement("null_value"));
    String null_value_str;
    if (dict_attr->default_value)
-        null_value_str = getFieldAsString(dict_attr->default_value->as<ASTLiteral>()->value);
+        null_value_str = convertFieldToString(dict_attr->default_value->as<ASTLiteral>()->value);
    AutoPtr<Text> null_value(doc->createTextNode(null_value_str));
    null_value_element->appendChild(null_value);
    attribute_element->appendChild(null_value_element);
@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
        }
        else if (const auto * literal = pair->second->as<const ASTLiteral>())
        {
-            AutoPtr<Text> value(doc->createTextNode(getFieldAsString(literal->value)));
+            AutoPtr<Text> value(doc->createTextNode(convertFieldToString(literal->value)));
            current_xml_element->appendChild(value);
        }
        else if (const auto * list = pair->second->as<const ASTExpressionList>())
@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
            Field value;
            result->get(0, value);

-            AutoPtr<Text> text_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> text_value(doc->createTextNode(convertFieldToString(value)));
            current_xml_element->appendChild(text_value);
        }
        else
@ -519,7 +510,7 @@ void buildSourceConfiguration(
        {
            AutoPtr<Element> setting_change_element(doc->createElement(name));
            settings_element->appendChild(setting_change_element);
-            AutoPtr<Text> setting_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> setting_value(doc->createTextNode(convertFieldToString(value)));
            setting_change_element->appendChild(setting_value);
        }
    }
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@ -239,7 +239,16 @@ public:
    }

    /// For one local path there might be multiple remote paths in case of Log family engines.
-    using LocalPathWithObjectStoragePaths = std::pair<String, StoredObjects>;
+    struct LocalPathWithObjectStoragePaths
+     {
+         /// Path on the local side that the objects below belong to.
+         std::string local_path;
+         /// Prefix shared by the objects; DiskObjectStorage::getRemotePathsRecursive fills it
+         /// with metadata_storage->getObjectStorageRootPath().
+         std::string common_prefix_for_objects;
+         /// Objects stored for local_path (there may be several, see the comment above the struct).
+         StoredObjects objects;
+
+         /// Copies both strings and takes ownership of objects_ (it is moved from).
+         LocalPathWithObjectStoragePaths(
+             const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_)
+             : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_)) {}
+     };

    virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
    {
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
    {
        try
        {
-            paths_map.emplace_back(local_path, getStorageObjects(local_path));
+            paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path));
        }
        catch (const Exception & e)
        {
@ -282,7 +282,10 @@ String DiskObjectStorage::getUniqueId(const String & path) const
 bool DiskObjectStorage::checkUniqueId(const String & id) const
 {
    if (!id.starts_with(object_storage_root_path))
+    {
+        LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}", id, object_storage_root_path);
        return false;
+    }

    auto object = StoredObject::create(*object_storage, id, {}, {}, true);
    return object_storage->exists(object);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
@ -68,6 +68,14 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf)
    }
 }

+/// Makes this metadata describe a single object: appends (relative_path, bytes_size) to
+/// storage_objects and sets total_size, ref_count and read_only from the arguments.
+/// NOTE(review): total_size is overwritten, not incremented, while storage_objects is only
+/// appended to - presumably this is meant for metadata that has no objects yet; confirm with callers.
+void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_)
+{
+    storage_objects.emplace_back(relative_path, bytes_size);
+    total_size = bytes_size;
+    ref_count = ref_count_;
+    read_only = read_only_;
+}
+
 void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
 {
    ReadBufferFromString buf(data);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
@ -50,6 +50,7 @@ public:

    void deserialize(ReadBuffer & buf);
    void deserializeFromString(const std::string & data);
+    /// Initializes the metadata from a single object of bytes_size bytes at relative_path
+    /// (see the definition in DiskObjectStorageMetadata.cpp).
+    void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_);

    void serialize(WriteBuffer & buf, bool sync) const;
    std::string serializeToString() const;
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@ -56,7 +56,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
    if (!response.IsSuccess())
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType())), err.GetErrorType());
    }
 }

@ -70,7 +70,7 @@ void throwIfUnexpectedError(const Aws::Utils::Outcome<Result, Error> & response,
    if (!response.IsSuccess() && (!if_exists || !isNotFoundError(response.GetError().GetErrorType())))
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
    }
 }

--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@ -34,6 +34,7 @@ namespace ErrorCodes
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int SEEK_POSITION_OUT_OF_BOUND;
    extern const int LOGICAL_ERROR;
+    extern const int CANNOT_ALLOCATE_MEMORY;
 }


@ -136,6 +137,23 @@ bool ReadBufferFromS3::nextImpl()
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);

+            if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
+            {
+                /// It doesn't make sense to retry Access Denied or No Such Key
+                if (!s3_exception->isRetryableError())
+                {
+                    tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
+                    throw;
+                }
+            }
+
+            /// It doesn't make sense to retry allocator errors
+            if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY)
+            {
+                tryLogCurrentException(log);
+                throw;
+            }
+
            LOG_DEBUG(
                log,
                "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}",
@ -306,7 +324,10 @@ std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
        return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), buffer_size);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+    {
+        const auto & error = outcome.GetError();
+        throw S3Exception(error.GetMessage(), error.GetErrorType());
+    }
 }

 SeekableReadBufferPtr ReadBufferS3Factory::getReader()
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@ -35,6 +35,26 @@

 #    include <fstream>

+namespace DB
+{
+
+bool S3Exception::isRetryableError() const
+{
+    /// AWS error codes for which a retry is considered pointless.
+    /// This list is quite conservative, add more codes if you wish.
+    static constexpr Aws::S3::S3Errors unretryable_errors[] = {
+        Aws::S3::S3Errors::NO_SUCH_KEY,
+        Aws::S3::S3Errors::ACCESS_DENIED,
+        Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID,
+        Aws::S3::S3Errors::INVALID_SIGNATURE,
+        Aws::S3::S3Errors::NO_SUCH_UPLOAD,
+        Aws::S3::S3Errors::NO_SUCH_BUCKET,
+    };
+
+    /// Any code that is not explicitly listed above is reported as retryable.
+    for (const auto unretryable_error : unretryable_errors)
+    {
+        if (unretryable_error == code)
+            return false;
+    }
+
+    return true;
+}
+
+}
+
 namespace
 {

--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@ -7,23 +7,62 @@
 #include <base/types.h>
 #include <aws/core/Aws.h>
 #include <aws/core/client/ClientConfiguration.h>
+#include <aws/s3/S3Errors.h>
 #include <IO/S3/PocoHTTPClient.h>
 #include <Poco/URI.h>

+#include <Common/Exception.h>
+
 namespace Aws::S3
 {
    class S3Client;
 }

+
 namespace DB
 {
-    class RemoteHostFilter;
-    struct HttpHeader;
-    using HeaderCollection = std::vector<HttpHeader>;
+namespace ErrorCodes
+{
+    extern const int S3_ERROR;
 }

+class RemoteHostFilter;
+struct HttpHeader;
+using HeaderCollection = std::vector<HttpHeader>;
+
+/// Exception thrown for failed S3 requests. The ClickHouse error code is always
+/// ErrorCodes::S3_ERROR; the original AWS SDK error code is kept alongside it.
+class S3Exception : public Exception
+{
+public:
+    /// Builds the message with fmt::format, like the logging functions.
+    /// Here the AWS error code is the FIRST argument.
+    template <typename... FormatArgs>
+    S3Exception(Aws::S3::S3Errors code_, fmt::format_string<FormatArgs...> format_str, FormatArgs &&... format_args)
+        : Exception(fmt::format(format_str, std::forward<FormatArgs>(format_args)...), ErrorCodes::S3_ERROR)
+        , code(code_)
+    {
+    }
+
+    /// Uses an already prepared message.
+    /// Here the AWS error code is the LAST argument.
+    S3Exception(const std::string & msg, Aws::S3::S3Errors code_)
+        : Exception(msg, ErrorCodes::S3_ERROR)
+        , code(code_)
+    {
+    }
+
+    /// AWS SDK error code this exception was constructed with.
+    Aws::S3::S3Errors getS3ErrorCode() const { return code; }
+
+    /// Defined in S3Common.cpp.
+    bool isRetryableError() const;
+
+private:
+    const Aws::S3::S3Errors code;
+};
+}
+
+
 namespace DB::S3
 {
+
 class ClientFactory
 {
 public:
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@ -8,6 +8,7 @@

 #include <IO/WriteBufferFromS3.h>
 #include <IO/WriteHelpers.h>
+#include <IO/S3Common.h>
 #include <Interpreters/Context.h>

 #include <aws/s3/S3Client.h>
@ -173,7 +174,9 @@ void WriteBufferFromS3::finalizeImpl()
        auto response = client_ptr->HeadObject(request);

        if (!response.IsSuccess())
-            throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket);
+            throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
+        else
+            LOG_TRACE(log, "Object {} exists after upload", key);
    }
 }

@ -197,7 +200,7 @@ void WriteBufferFromS3::createMultipartUpload()
        LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
 }

 void WriteBufferFromS3::writePart()
@ -309,7 +312,7 @@ void WriteBufferFromS3::processUploadRequest(UploadPartTask & task)
        LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size());
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());

    total_parts_uploaded++;
 }
@ -343,9 +346,10 @@ void WriteBufferFromS3::completeMultipartUpload()
        LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size());
    else
    {
-        throw Exception(ErrorCodes::S3_ERROR, "{} Tags:{}",
-            outcome.GetError().GetMessage(),
-            fmt::join(tags.begin(), tags.end(), " "));
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Tags: {}",
+            outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " "));
    }
 }

@ -430,7 +434,10 @@ void WriteBufferFromS3::processPutRequest(const PutObjectTask & task)
    if (outcome.IsSuccess())
        LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool);
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}",
+            outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool);
 }

 void WriteBufferFromS3::waitForReadyBackGroundTasks()
--- a/src/Interpreters/InterpreterDeleteQuery.cpp
+++ b/src/Interpreters/InterpreterDeleteQuery.cpp
@ -21,7 +21,6 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int BAD_ARGUMENTS;
    extern const int TABLE_IS_READ_ONLY;
    extern const int SUPPORT_IS_DISABLED;
 }
@ -34,11 +33,6 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex

 BlockIO InterpreterDeleteQuery::execute()
 {
-    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
-    {
-        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
-    }
-
    FunctionNameNormalizer().visit(query_ptr.get());
    const ASTDeleteQuery & delete_query = query_ptr->as<ASTDeleteQuery &>();
    auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary);
@ -49,10 +43,6 @@ BlockIO InterpreterDeleteQuery::execute()

    /// First check table storage for validations.
    StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
-    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
-    if (!merge_tree)
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree family tables are supported");
-
    checkStorageSupportsTransactionsIfNeeded(table, getContext());
    if (table->isStaticStorage())
        throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only");
@ -69,6 +59,27 @@ BlockIO InterpreterDeleteQuery::execute()
    auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout);
    auto metadata_snapshot = table->getInMemoryMetadataPtr();

+    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
+    if (!merge_tree)
+    {
+        /// Convert to MutationCommand
+        MutationCommands mutation_commands;
+        MutationCommand mut_command;
+
+        mut_command.type = MutationCommand::Type::DELETE;
+        mut_command.predicate = delete_query.predicate;
+
+        mutation_commands.emplace_back(mut_command);
+
+        table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
+        MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
+        table->mutate(mutation_commands, getContext());
+        return {};
+    }
+
+    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
+
    /// Convert to MutationCommand
    MutationCommands mutation_commands;
    MutationCommand mut_command;
--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@ -226,7 +226,7 @@ bool isStorageTouchedByMutations(
    ASTPtr select_query = prepareQueryAffectedAST(commands, storage, context_copy);

    /// Interpreter must be alive, when we use result of execute() method.
-    /// For some reason it may copy context and and give it into ExpressionTransform
+    /// For some reason it may copy context and give it into ExpressionTransform
    /// after that we will use context from destroyed stack frame in our stream.
    InterpreterSelectQuery interpreter(
        select_query, context_copy, storage, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections());
@ -288,13 +288,17 @@ MutationsInterpreter::MutationsInterpreter(
    const StorageMetadataPtr & metadata_snapshot_,
    MutationCommands commands_,
    ContextPtr context_,
-    bool can_execute_)
+    bool can_execute_,
+    bool return_all_columns_,
+    bool return_deleted_rows_)
    : storage(std::move(storage_))
    , metadata_snapshot(metadata_snapshot_)
    , commands(std::move(commands_))
    , context(Context::createCopy(context_))
    , can_execute(can_execute_)
    , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections())
+    , return_all_columns(return_all_columns_)
+    , return_deleted_rows(return_deleted_rows_)
 {
    mutation_ast = prepare(!can_execute);
 }
@ -472,14 +476,21 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run)
    /// First, break a sequence of commands into stages.
    for (auto & command : commands)
    {
+        // we can return deleted rows only if it's the only present command
+        assert(command.type == MutationCommand::DELETE || !return_deleted_rows);
+
        if (command.type == MutationCommand::DELETE)
        {
            mutation_kind.set(MutationKind::MUTATE_OTHER);
            if (stages.empty() || !stages.back().column_to_updated.empty())
                stages.emplace_back(context);

-            auto negated_predicate = makeASTFunction("isZeroOrNull", getPartitionAndPredicateExpressionForMutationCommand(command));
-            stages.back().filters.push_back(negated_predicate);
+            auto predicate  = getPartitionAndPredicateExpressionForMutationCommand(command);
+
+            if (!return_deleted_rows)
+                predicate = makeASTFunction("isZeroOrNull", predicate);
+
+            stages.back().filters.push_back(predicate);
        }
        else if (command.type == MutationCommand::UPDATE)
        {
@ -789,7 +800,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector<Stage> &
    /// Next, for each stage calculate columns changed by this and previous stages.
    for (size_t i = 0; i < prepared_stages.size(); ++i)
    {
-        if (!prepared_stages[i].filters.empty())
+        if (return_all_columns || !prepared_stages[i].filters.empty())
        {
            for (const auto & column : all_columns)
                prepared_stages[i].output_columns.insert(column.name);
--- a/src/Interpreters/MutationsInterpreter.h
+++ b/src/Interpreters/MutationsInterpreter.h
@ -43,7 +43,9 @@ public:
        const StorageMetadataPtr & metadata_snapshot_,
        MutationCommands commands_,
        ContextPtr context_,
-        bool can_execute_);
+        bool can_execute_,
+        bool return_all_columns_ = false,
+        bool return_deleted_rows_ = false);

    void validate();

@ -156,6 +158,12 @@ private:

    /// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
    ColumnDependencies dependencies;
+
+    // whether all columns should be returned, not just updated
+    bool return_all_columns;
+
+    // whether we should return deleted or nondeleted rows on DELETE mutation
+    bool return_deleted_rows;
 };

 }
--- a/src/Parsers/ParserSelectQuery.cpp
+++ b/src/Parsers/ParserSelectQuery.cpp
@ -224,8 +224,6 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
            select_query->group_by_with_rollup = true;
        else if (s_cube.ignore(pos, expected))
            select_query->group_by_with_cube = true;
-        else if (s_grouping_sets.ignore(pos, expected))
-            select_query->group_by_with_grouping_sets = true;
        else if (s_totals.ignore(pos, expected))
            select_query->group_by_with_totals = true;
        else
--- a/src/Processors/QueryPlan/AggregatingStep.cpp
+++ b/src/Processors/QueryPlan/AggregatingStep.cpp
@ -251,14 +251,17 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                outputs.push_back(grouping_node);

                const auto & missing_columns = grouping_sets_params[set_counter].missing_keys;
+                const auto & used_keys = grouping_sets_params[set_counter].used_keys;

                auto to_nullable_function = FunctionFactory::instance().get("toNullable", nullptr);
                for (size_t i = 0; i < output_header.columns(); ++i)
                {
                    auto & col = output_header.getByPosition(i);
-                    const auto it = std::find_if(
+                    const auto missing_it = std::find_if(
                        missing_columns.begin(), missing_columns.end(), [&](const auto & missing_col) { return missing_col == col.name; });
-                    if (it != missing_columns.end())
+                    const auto used_it = std::find_if(
+                        used_keys.begin(), used_keys.end(), [&](const auto & used_col) { return used_col == col.name; });
+                    if (missing_it != missing_columns.end())
                    {
                        auto column_with_default = col.column->cloneEmpty();
                        col.type->insertDefaultInto(*column_with_default);
@ -270,7 +273,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                    else
                    {
                        const auto * column_node = dag->getOutputs()[header.getPositionByName(col.name)];
-                        if (group_by_use_nulls && column_node->result_type->canBeInsideNullable())
+                        if (used_it != used_keys.end() && group_by_use_nulls && column_node->result_type->canBeInsideNullable())
                            outputs.push_back(&dag->addFunction(to_nullable_function, { column_node }, col.name));
                        else
                            outputs.push_back(column_node);
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool(
        sum_marks,
        min_marks_for_concurrent_read,
        std::move(parts_with_range),
-        data,
        storage_snapshot,
        prewhere_info,
        required_columns,
--- a/src/Storages/MergeTree/AlterConversions.h
+++ b/src/Storages/MergeTree/AlterConversions.h
@ -0,0 +1,24 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+
+namespace DB
+{
+
+/// Alter conversions which should be applied on the fly for a part. They are built
+/// from the most recent mutation commands for the part. Currently only rename_map
+/// is kept here (it comes from the ALTER_RENAME command), because for all other
+/// types of alters the conversions for a part can be deduced from the difference
+/// between part->getColumns() and storage->getColumns().
+struct AlterConversions
+{
+    /// Maps new column name -> old column name.
+    std::unordered_map<std::string, std::string> rename_map;
+
+    /// True when rename_map has an entry whose key is new_name.
+    bool isColumnRenamed(const std::string & new_name) const
+    {
+        return rename_map.find(new_name) != rename_map.end();
+    }
+
+    /// Old name stored for new_name. std::unordered_map::at throws std::out_of_range
+    /// when there is no such entry.
+    std::string getColumnOldName(const std::string & new_name) const
+    {
+        return rename_map.at(new_name);
+    }
+};
+
+}
--- a/src/Storages/MergeTree/DataPartsExchange.cpp
+++ b/src/Storages/MergeTree/DataPartsExchange.cpp
@ -399,7 +399,7 @@ MergeTreeData::DataPartPtr Service::findPart(const String & name)
    throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in table", name);
 }

-MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
+MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
    const StorageMetadataPtr & metadata_snapshot,
    ContextPtr context,
    const String & part_name,
@ -420,6 +420,11 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
    if (blocker.isCancelled())
        throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);

+    const auto data_settings = data.getSettings();
+
+    if (data_settings->allow_remote_fs_zero_copy_replication && !try_zero_copy)
+        LOG_WARNING(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name);
+
    /// It should be "tmp-fetch_" and not "tmp_fetch_", because we can fetch part to detached/,
    /// but detached part name prefix should not contain underscore.
    static const String TMP_PREFIX = "tmp-fetch_";
@ -429,7 +434,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(

    /// Validation of the input that may come from malicious replica.
    auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version);
-    const auto data_settings = data.getSettings();

    Poco::URI uri;
    uri.setScheme(interserver_scheme);
@ -465,6 +469,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
            capability.push_back(toString(disk->getDataSourceDescription().type));
        }
    }
+
    if (!capability.empty())
    {
        ::sort(capability.begin(), capability.end());
@ -474,6 +479,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
    }
    else
    {
+        if (data_settings->allow_remote_fs_zero_copy_replication)
+            LOG_WARNING(log, "Cannot select any zero-copy disk for {}", part_name);
+
        try_zero_copy = false;
    }

@ -585,7 +593,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
            temporary_directory_lock = {};

            /// Try again but without zero-copy
-            return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
+            return fetchSelectedPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
                user, password, interserver_scheme, throttler, to_detached, tmp_prefix, nullptr, false, disk);
        }
    }
--- a/src/Storages/MergeTree/DataPartsExchange.h
+++ b/src/Storages/MergeTree/DataPartsExchange.h
@ -66,7 +66,7 @@ public:
    explicit Fetcher(StorageReplicatedMergeTree & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {}

    /// Downloads a part to tmp_directory. If to_detached - downloads to the `detached` directory.
-    MergeTreeData::MutableDataPartPtr fetchPart(
+    MergeTreeData::MutableDataPartPtr fetchSelectedPart(
        const StorageMetadataPtr & metadata_snapshot,
        ContextPtr context,
        const String & part_name,
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@ -532,25 +532,6 @@ void IMergeTreeDataPart::removeIfNeeded()
            LOG_TRACE(storage.log, "Removed part from old location {}", path);
        }
    }
-    catch (const Exception & ex)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
-
-        /// In this case we want to avoid assertions, because such errors are unavoidable in setup
-        /// with zero-copy replication.
-        if (const auto * keeper_exception = dynamic_cast<const Coordination::Exception *>(&ex))
-        {
-            if (Coordination::isHardwareError(keeper_exception->code))
-                return;
-        }
-
-        /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
-        /// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
-        /// then all future attempts to execute part producing operation will fail with "directory already exists".
-        assert(!is_temp);
-        assert(state != MergeTreeDataPartState::DeleteOnDestroy);
-        assert(state != MergeTreeDataPartState::Temporary);
-    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
@ -558,11 +539,6 @@ void IMergeTreeDataPart::removeIfNeeded()
        /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
        /// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
        /// then all future attempts to execute part producing operation will fail with "directory already exists".
-        ///
-        /// For remote disks this issue is really frequent, so we don't about server here
-        assert(!is_temp);
-        assert(state != MergeTreeDataPartState::DeleteOnDestroy);
-        assert(state != MergeTreeDataPartState::Temporary);
    }
 }

@ -1433,7 +1409,10 @@ std::pair<bool, NameSet> IMergeTreeDataPart::canRemovePart() const
 {
    /// NOTE: It's needed for zero-copy replication
    if (force_keep_shared_data)
+    {
+        LOG_DEBUG(storage.log, "Blobs for part {} cannot be removed because it's forced to be keeped", name);
        return std::make_pair(false, NameSet{});
+    }

    return storage.unlockSharedData(*this);
 }
@ -1457,6 +1436,12 @@ void IMergeTreeDataPart::remove() const

    auto [can_remove, files_not_to_remove] = canRemovePart();

+    if (!can_remove)
+        LOG_TRACE(storage.log, "Blobs of part {} cannot be removed", name);
+
+    if (!files_not_to_remove.empty())
+        LOG_TRACE(storage.log, "Some blobs ({}) of part {} cannot be removed", fmt::join(files_not_to_remove, ", "), name);
+
    if (!isStoredOnDisk())
        return;

--- a/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
@ -0,0 +1,68 @@
+#pragma once
+#include <Interpreters/Context.h>
+#include <Storages/MergeTree/AlterConversions.h>
+#include <Core/NamesAndTypes.h>
+
+namespace DB
+{
+
+class IDataPartStorage;
+using DataPartStoragePtr = std::shared_ptr<IDataPartStorage>;
+class MergeTreeIndexGranularity;
+struct MergeTreeDataPartChecksums;
+struct MergeTreeIndexGranularityInfo;
+class ISerialization;
+using SerializationPtr = std::shared_ptr<const ISerialization>;
+
+/**
+ * A class which contains all information about a data part that is required
+ * in order to use MergeTreeDataPartReader's.
+ * It is a separate interface and not a simple struct because
+ * otherwise it will need to copy all the information which might not
+ * be even used (for example, a MergeTreeIndexGranularity object is quite heavy).
+ *
+ * IMergeTreeReader takes a shared pointer to this interface instead of a data part pointer.
+ */
+class IMergeTreeDataPartInfoForReader : public WithContext
+{
+public:
+    /// The context is handed over to the WithContext base.
+    explicit IMergeTreeDataPartInfoForReader(ContextPtr context_) : WithContext(context_) {}
+
+    virtual ~IMergeTreeDataPartInfoForReader() = default;
+
+    virtual bool isCompactPart() const = 0;
+
+    /// IMergeTreeReader uses this to decide whether Nested columns are converted to subcolumns.
+    virtual bool isWidePart() const = 0;
+
+    virtual bool isInMemoryPart() const = 0;
+
+    virtual bool isProjectionPart() const = 0;
+
+    /// IMergeTreeReader calls getFullPath() on the result when adding diagnostics to exceptions.
+    virtual const DataPartStoragePtr & getDataPartStorage() const = 0;
+
+    virtual const NamesAndTypesList & getColumns() const = 0;
+
+    virtual std::optional<size_t> getColumnPosition(const String & column_name) const = 0;
+
+    virtual String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const = 0;
+
+    virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
+
+    /// Returned by value; IMergeTreeReader stores the result in its alter_conversions member.
+    virtual AlterConversions getAlterConversions() const = 0;
+
+    virtual size_t getMarksCount() const = 0;
+
+    /// NOTE(review): judging by the name, yields 0 when the file is absent - confirm in implementations.
+    virtual size_t getFileSizeOrZero(const std::string & file_name) const = 0;
+
+    virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0;
+
+    virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0;
+
+    virtual SerializationPtr getSerialization(const NameAndTypePair & column) const = 0;
+
+    /// IMergeTreeReader::getSerializationInPart looks columns up in the returned map by storage name.
+    virtual const SerializationInfoByName & getSerializationInfos() const = 0;
+
+    /// NOTE(review): presumably reports the underlying part as broken - confirm in implementations.
+    virtual void reportBroken() = 0;
+};
+
+using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr<IMergeTreeDataPartInfoForReader>;
+
+}
--- a/src/Storages/MergeTree/IMergeTreeReader.cpp
+++ b/src/Storages/MergeTree/IMergeTreeReader.cpp
@ -23,7 +23,7 @@ namespace ErrorCodes


 IMergeTreeReader::IMergeTreeReader(
-    const MergeTreeData::DataPartPtr & data_part_,
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    const NamesAndTypesList & columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
@ -31,19 +31,18 @@ IMergeTreeReader::IMergeTreeReader(
    const MarkRanges & all_mark_ranges_,
    const MergeTreeReaderSettings & settings_,
    const ValueSizeMap & avg_value_size_hints_)
-    : data_part(data_part_)
+    : data_part_info_for_read(data_part_info_for_read_)
    , avg_value_size_hints(avg_value_size_hints_)
    , uncompressed_cache(uncompressed_cache_)
    , mark_cache(mark_cache_)
    , settings(settings_)
-    , storage(data_part_->storage)
    , metadata_snapshot(metadata_snapshot_)
    , all_mark_ranges(all_mark_ranges_)
-    , alter_conversions(storage.getAlterConversionsForPart(data_part))
+    , alter_conversions(data_part_info_for_read->getAlterConversions())
    /// For wide parts convert plain arrays of Nested to subcolumns
    /// to allow to use shared offset column from cache.
-    , requested_columns(isWidePart(data_part) ? Nested::convertToSubcolumns(columns_) : columns_)
-    , part_columns(isWidePart(data_part) ? Nested::collect(data_part->getColumns()) : data_part->getColumns())
+    , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_)
+    , part_columns(data_part_info_for_read->isWidePart() ? Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns())
 {
    columns_to_read.reserve(requested_columns.size());
    serializations.reserve(requested_columns.size());
@ -71,7 +70,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -99,13 +98,13 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
        }

        auto dag = DB::evaluateMissingDefaults(
-                additional_columns, requested_columns, metadata_snapshot->getColumns(), storage.getContext());
+                additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext());
        if (dag)
        {
            dag->addMaterializingOutputActions();
            auto actions = std::make_shared<
                ExpressionActions>(std::move(dag),
-                ExpressionActionsSettings::fromSettings(storage.getContext()->getSettingsRef()));
+                ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef()));
            actions->execute(additional_columns);
        }

@ -117,7 +116,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -151,7 +150,7 @@ SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair
    if (!column_in_part)
        return IDataType::getSerialization(required_column);

-    const auto & infos = data_part->getSerializationInfos();
+    const auto & infos = data_part_info_for_read->getSerializationInfos();
    if (auto it = infos.find(column_in_part->getNameInStorage()); it != infos.end())
        return IDataType::getSerialization(*column_in_part, *it->second);

@ -187,7 +186,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
            copy_block.insert({res_columns[pos], getColumnInPart(*name_and_type).type, name_and_type->name});
        }

-        DB::performRequiredConversions(copy_block, requested_columns, storage.getContext());
+        DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext());

        /// Move columns from block.
        name_and_type = requested_columns.begin();
@ -197,7 +196,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -205,11 +204,11 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
 IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const
 {
    String table_name = Nested::extractTableName(column_name);
-    for (const auto & part_column : data_part->getColumns())
+    for (const auto & part_column : data_part_info_for_read->getColumns())
    {
        if (typeid_cast<const DataTypeArray *>(part_column.type.get()))
        {
-            auto position = data_part->getColumnPosition(part_column.getNameInStorage());
+            auto position = data_part_info_for_read->getColumnPosition(part_column.getNameInStorage());
            if (position && Nested::extractTableName(part_column.name) == table_name)
                return position;
        }
--- a/src/Storages/MergeTree/IMergeTreeReader.h
+++ b/src/Storages/MergeTree/IMergeTreeReader.h
@ -4,6 +4,8 @@
 #include <Common/HashTable/HashMap.h>
 #include <Storages/MergeTree/MergeTreeReaderStream.h>
 #include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
+#include <Storages/MergeTree/IMergeTreeDataPart.h>
+#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>

 namespace DB
 {
@ -20,7 +22,7 @@ public:
    using DeserializeBinaryBulkStateMap = std::map<std::string, ISerialization::DeserializeBinaryBulkStatePtr>;

    IMergeTreeReader(
-        const MergeTreeData::DataPartPtr & data_part_,
+        MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
        const NamesAndTypesList & columns_,
        const StorageMetadataPtr & metadata_snapshot_,
        UncompressedCache * uncompressed_cache_,
@ -57,7 +59,7 @@ public:

    size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; }

-    MergeTreeData::DataPartPtr data_part;
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read;

 protected:
    /// Returns actual column name in part, which can differ from table metadata.
@ -86,7 +88,6 @@ protected:

    MergeTreeReaderSettings settings;

-    const MergeTreeData & storage;
    StorageMetadataPtr metadata_snapshot;
    MarkRanges all_mark_ranges;

@ -95,7 +96,7 @@ protected:

 private:
    /// Alter conversions, which must be applied on fly if required
-    MergeTreeData::AlterConversions alter_conversions;
+    AlterConversions alter_conversions;

    /// Columns that are requested to read.
    NamesAndTypesList requested_columns;
--- a/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h
+++ b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h
@ -0,0 +1,55 @@
+#pragma once
+#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
+#include <Storages/MergeTree/MergeTreeData.h>
+
+
+namespace DB
+{
+
+/// Implementation of IMergeTreeDataPartInfoForReader on top of a MergeTreeData::DataPartPtr.
+/// It keeps no state of its own besides the part pointer: every accessor forwards either
+/// to the wrapped part or to the storage that the part references.
+class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader
+{
+public:
+    /// The context passed to the base class is taken from the storage of the given part.
+    explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_)
+        : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext())
+        , data_part(data_part_)
+    {}
+
+    bool isCompactPart() const override { return DB::isCompactPart(data_part); }
+
+    bool isWidePart() const override { return DB::isWidePart(data_part); }
+
+    bool isInMemoryPart() const override { return DB::isInMemoryPart(data_part); }
+
+    bool isProjectionPart() const override { return data_part->isProjectionPart(); }
+
+    const DataPartStoragePtr & getDataPartStorage() const override { return data_part->data_part_storage; }
+
+    const NamesAndTypesList & getColumns() const override { return data_part->getColumns(); }
+
+    std::optional<size_t> getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); }
+
+    /// Conversions are computed by the storage for this particular part on every call.
+    AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); }
+
+    String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); }
+
+    const MergeTreeDataPartChecksums & getChecksums() const override { return data_part->checksums; }
+
+    /// Hands the wrapped part over to the storage's reportBrokenPart().
+    void reportBroken() override { data_part->storage.reportBrokenPart(data_part); }
+
+    size_t getMarksCount() const override { return data_part->getMarksCount(); }
+
+    size_t getFileSizeOrZero(const std::string & file_name) const override { return data_part->getFileSizeOrZero(file_name); }
+
+    const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const override { return data_part->index_granularity_info; }
+
+    const MergeTreeIndexGranularity & getIndexGranularity() const override { return data_part->index_granularity; }
+
+    const SerializationInfoByName & getSerializationInfos() const override { return data_part->getSerializationInfos(); }
+
+    /// Only the name of the column is used for the lookup in the part.
+    SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); }
+
+private:
+    /// The part that all accessors above forward to.
+    MergeTreeData::DataPartPtr data_part;
+};
+
+}
--- a/src/Storages/MergeTree/MarkRange.cpp
+++ b/src/Storages/MergeTree/MarkRange.cpp
@ -36,4 +36,16 @@ size_t getLastMark(const MarkRanges & ranges)
    return current_task_last_mark;
 }

+/// Formats mark ranges as a comma-separated list of "(begin, end)" pairs, e.g. "(0, 3), (5, 8)".
+/// Returns an empty string when there are no ranges.
+std::string toString(const MarkRanges & ranges)
+{
+    std::string result;
+    for (const auto & mark_range : ranges)
+    {
+        /// Separator goes before every element except the first one.
+        if (!result.empty())
+            result += ", ";
+        result += "(" + std::to_string(mark_range.begin) + ", " + std::to_string(mark_range.end) + ")";
+    }
+    return result;
+}
+
 }
--- a/src/Storages/MergeTree/MarkRange.h
+++ b/src/Storages/MergeTree/MarkRange.h
@ -32,4 +32,6 @@ using MarkRanges = std::deque<MarkRange>;
 */
 size_t getLastMark(const MarkRanges & ranges);

+std::string toString(const MarkRanges & ranges);
+
 }
--- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp
+++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp
@ -43,6 +43,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(
    , storage(storage_)
    , storage_snapshot(storage_snapshot_)
    , prewhere_info(prewhere_info_)
+    , prewhere_actions(getPrewhereActions(prewhere_info, actions_settings))
    , max_block_size_rows(max_block_size_rows_)
    , preferred_block_size_bytes(preferred_block_size_bytes_)
    , preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_)
@ -72,7 +73,12 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(
                header_without_virtual_columns.erase(*it);
        }
    }
+}

+
+std::unique_ptr<PrewhereExprInfo> MergeTreeBaseSelectProcessor::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings)
+{
+    std::unique_ptr<PrewhereExprInfo> prewhere_actions;
    if (prewhere_info)
    {
        prewhere_actions = std::make_unique<PrewhereExprInfo>();
@ -100,6 +106,8 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor(

        prewhere_actions->steps.emplace_back(std::move(prewhere_step));
    }
+
+    return prewhere_actions;
 }


@ -262,45 +270,62 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart(

 void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task)
 {
-    MergeTreeRangeReader* prev_reader = nullptr;
+    return initializeRangeReadersImpl(
+        current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(),
+        reader.get(), current_task.data_part->hasLightweightDelete(), reader_settings,
+        pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names);
+}
+
+void MergeTreeBaseSelectProcessor::initializeRangeReadersImpl(
+    MergeTreeRangeReader & range_reader, std::deque<MergeTreeRangeReader> & pre_range_readers,
+    PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions,
+    IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings,
+    const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
+    const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names)
+{
+    MergeTreeRangeReader * prev_reader = nullptr;
    bool last_reader = false;
    size_t pre_readers_shift = 0;

    /// Add filtering step with lightweight delete mask
-    if (reader_settings.apply_deleted_mask && current_task.data_part->hasLightweightDelete())
+    if (reader_settings.apply_deleted_mask && has_lightweight_delete)
    {
-        current_task.pre_range_readers.push_back(
-            MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names));
-        prev_reader = &current_task.pre_range_readers.back();
+        MergeTreeRangeReader pre_range_reader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names);
+        pre_range_readers.push_back(std::move(pre_range_reader));
+        prev_reader = &pre_range_readers.back();
        pre_readers_shift++;
    }

    if (prewhere_info)
    {
        if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size())
-            throw Exception(ErrorCodes::LOGICAL_ERROR,
-                            "PREWHERE steps count mismatch, actions: {}, readers: {}",
-                            prewhere_actions->steps.size(), pre_reader_for_step.size());
+        {
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR,
+                "PREWHERE steps count mismatch, actions: {}, readers: {}",
+                prewhere_actions->steps.size(), pre_reader_for_step.size());
+        }

        for (size_t i = 0; i < prewhere_actions->steps.size(); ++i)
        {
            last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size());
-            current_task.pre_range_readers.push_back(
-                MergeTreeRangeReader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names));

-            prev_reader = &current_task.pre_range_readers.back();
+            MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names);
+
+            pre_range_readers.push_back(std::move(current_reader));
+            prev_reader = &pre_range_readers.back();
        }
    }

    if (!last_reader)
    {
-        current_task.range_reader = MergeTreeRangeReader(reader.get(), prev_reader, nullptr, true, non_const_virtual_column_names);
+        range_reader = MergeTreeRangeReader(reader, prev_reader, nullptr, true, non_const_virtual_column_names);
    }
    else
    {
        /// If all columns are read by pre_range_readers than move last pre_range_reader into range_reader
-        current_task.range_reader = std::move(current_task.pre_range_readers.back());
-        current_task.pre_range_readers.pop_back();
+        range_reader = std::move(pre_range_readers.back());
+        pre_range_readers.pop_back();
    }
 }

--- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h
+++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h
@ -89,6 +89,20 @@ protected:
    static void
    injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, const DataTypePtr & partition_value_type, const Names & virtual_columns);

+    static std::unique_ptr<PrewhereExprInfo> getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings);
+
+    /// Builds the chain of range readers and stores it in `range_reader` / `pre_range_readers`.
+    /// The chain consists of an optional lightweight-delete filter reader (added when
+    /// `reader_settings.apply_deleted_mask` and `has_lightweight_delete` are both set), then one
+    /// reader per step of `prewhere_actions` (only if `prewhere_info` is set), then the reader for
+    /// the remaining columns. Each reader in the chain receives a pointer to the previous one.
+    /// If `reader` has no columns to read, the last PREWHERE reader is moved into `range_reader`.
+    /// Throws LOGICAL_ERROR if the number of steps does not match the size of `pre_reader_for_step`.
+    static void initializeRangeReadersImpl(
+         MergeTreeRangeReader & range_reader,
+         std::deque<MergeTreeRangeReader> & pre_range_readers,
+         PrewhereInfoPtr prewhere_info,
+         const PrewhereExprInfo * prewhere_actions,
+         IMergeTreeReader * reader,
+         bool has_lightweight_delete,
+         const MergeTreeReaderSettings & reader_settings,
+         const std::vector<std::unique_ptr<IMergeTreeReader>> & pre_reader_for_step,
+         const PrewhereExprStep & lightweight_delete_filter_step,
+         const Names & non_const_virtual_column_names);
+
    /// Sets up data readers for each step of prewhere and where
    void initializeMergeTreeReadersForPart(
        MergeTreeData::DataPartPtr & data_part,
--- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp
+++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp
@ -1,5 +1,6 @@
 #include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
 #include <Storages/MergeTree/MergeTreeData.h>
+#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
 #include <DataTypes/NestedUtils.h>
 #include <Core/NamesAndTypes.h>
 #include <Common/checkStackSize.h>
@ -28,8 +29,8 @@ namespace
 bool injectRequiredColumnsRecursively(
    const String & column_name,
    const StorageSnapshotPtr & storage_snapshot,
-    const MergeTreeData::AlterConversions & alter_conversions,
-    const MergeTreeData::DataPartPtr & part,
+    const AlterConversions & alter_conversions,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    const GetColumnsOptions & options,
    Names & columns,
    NameSet & required_columns,
@ -47,7 +48,7 @@ bool injectRequiredColumnsRecursively(
        if (alter_conversions.isColumnRenamed(column_name_in_part))
            column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part);

-        auto column_in_part = part->getColumns().tryGetByName(column_name_in_part);
+        auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part);

        if (column_in_part
            && (!column_in_storage->isSubcolumn()
@ -78,7 +79,7 @@ bool injectRequiredColumnsRecursively(
    bool result = false;
    for (const auto & identifier : identifiers)
        result |= injectRequiredColumnsRecursively(
-            identifier, storage_snapshot, alter_conversions, part,
+            identifier, storage_snapshot, alter_conversions, data_part_info_for_reader,
            options, columns, required_columns, injected_columns);

    return result;
@ -87,9 +88,8 @@ bool injectRequiredColumnsRecursively(
 }

 NameSet injectRequiredColumns(
-    const MergeTreeData & storage,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    const StorageSnapshotPtr & storage_snapshot,
-    const MergeTreeData::DataPartPtr & part,
    bool with_subcolumns,
    Names & columns)
 {
@ -97,9 +97,9 @@ NameSet injectRequiredColumns(
    NameSet injected_columns;

    bool have_at_least_one_physical_column = false;
-    MergeTreeData::AlterConversions alter_conversions;
-    if (!part->isProjectionPart())
-        alter_conversions = storage.getAlterConversionsForPart(part);
+    AlterConversions alter_conversions;
+    if (!data_part_info_for_reader.isProjectionPart())
+        alter_conversions = data_part_info_for_reader.getAlterConversions();

    auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical)
        .withExtendedObjects()
@ -115,7 +115,7 @@ NameSet injectRequiredColumns(

        have_at_least_one_physical_column |= injectRequiredColumnsRecursively(
            columns[i], storage_snapshot, alter_conversions,
-            part, options, columns, required_columns, injected_columns);
+            data_part_info_for_reader, options, columns, required_columns, injected_columns);
    }

    /** Add a column of the minimum size.
@ -124,7 +124,7 @@ NameSet injectRequiredColumns(
        */
    if (!have_at_least_one_physical_column)
    {
-        const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(with_subcolumns);
+        const auto minimum_size_column_name = data_part_info_for_reader.getColumnNameWithMinimumCompressedSize(with_subcolumns);
        columns.push_back(minimum_size_column_name);
        /// correctly report added column
        injected_columns.insert(columns.back());
@ -135,13 +135,22 @@ NameSet injectRequiredColumns(


 MergeTreeReadTask::MergeTreeReadTask(
-    const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_,
-    const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_,
+    const MergeTreeData::DataPartPtr & data_part_,
+    const MarkRanges & mark_ranges_,
+    size_t part_index_in_query_,
+    const Names & ordered_names_,
+    const NameSet & column_name_set_,
+    const MergeTreeReadTaskColumns & task_columns_,
    bool remove_prewhere_column_,
    MergeTreeBlockSizePredictorPtr && size_predictor_)
-    : data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_},
-    ordered_names{ordered_names_}, column_name_set{column_name_set_}, task_columns{task_columns_},
-    remove_prewhere_column{remove_prewhere_column_}, size_predictor{std::move(size_predictor_)}
+    : data_part{data_part_}
+    , mark_ranges{mark_ranges_}
+    , part_index_in_query{part_index_in_query_}
+    , ordered_names{ordered_names_}
+    , column_name_set{column_name_set_}
+    , task_columns{task_columns_}
+    , remove_prewhere_column{remove_prewhere_column_}
+    , size_predictor{std::move(size_predictor_)}
 {
 }

@ -270,9 +279,8 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum


 MergeTreeReadTaskColumns getReadTaskColumns(
-    const MergeTreeData & storage,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    const StorageSnapshotPtr & storage_snapshot,
-    const MergeTreeData::DataPartPtr & data_part,
    const Names & required_columns,
    const Names & system_columns,
    const PrewhereInfoPtr & prewhere_info,
@ -284,13 +292,13 @@ MergeTreeReadTaskColumns getReadTaskColumns(
    /// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part
    for (const auto & name : system_columns)
    {
-        if (data_part->getColumns().contains(name))
+        if (data_part_info_for_reader.getColumns().contains(name))
            column_names.push_back(name);
    }

    /// inject columns required for defaults evaluation
    injectRequiredColumns(
-        storage, storage_snapshot, data_part, with_subcolumns, column_names);
+        data_part_info_for_reader, storage_snapshot, with_subcolumns, column_names);

    MergeTreeReadTaskColumns result;
    auto options = GetColumnsOptions(GetColumnsOptions::All)
@ -316,7 +324,7 @@ MergeTreeReadTaskColumns getReadTaskColumns(
        Names all_pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames();

        const auto injected_pre_columns = injectRequiredColumns(
-            storage, storage_snapshot, data_part, with_subcolumns, all_pre_column_names);
+             data_part_info_for_reader, storage_snapshot, with_subcolumns, all_pre_column_names);

        for (const auto & name : all_pre_column_names)
        {
--- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h
+++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h
@ -12,6 +12,7 @@ namespace DB
 class MergeTreeData;
 struct MergeTreeReadTask;
 struct MergeTreeBlockSizePredictor;
+class IMergeTreeDataPartInfoForReader;

 using MergeTreeReadTaskPtr = std::unique_ptr<MergeTreeReadTask>;
 using MergeTreeBlockSizePredictorPtr = std::shared_ptr<MergeTreeBlockSizePredictor>;
@ -23,9 +24,8 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptr<MergeTreeBlockSizePredict
  * Adds them to the `columns`.
  */
 NameSet injectRequiredColumns(
-    const MergeTreeData & storage,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    const StorageSnapshotPtr & storage_snapshot,
-    const MergeTreeData::DataPartPtr & part,
    bool with_subcolumns,
    Names & columns);

@ -68,16 +68,19 @@ struct MergeTreeReadTask
    bool isFinished() const { return mark_ranges.empty() && range_reader.isCurrentRangeFinished(); }

    MergeTreeReadTask(
-        const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_,
-        const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_,
+        const MergeTreeData::DataPartPtr & data_part_,
+        const MarkRanges & mark_ranges_,
+        size_t part_index_in_query_,
+        const Names & ordered_names_,
+        const NameSet & column_name_set_,
+        const MergeTreeReadTaskColumns & task_columns_,
        bool remove_prewhere_column_,
        MergeTreeBlockSizePredictorPtr && size_predictor_);
 };

 MergeTreeReadTaskColumns getReadTaskColumns(
-    const MergeTreeData & storage,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    const StorageSnapshotPtr & storage_snapshot,
-    const MergeTreeData::DataPartPtr & data_part,
    const Names & required_columns,
    const Names & system_columns,
    const PrewhereInfoPtr & prewhere_info,
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@ -1047,12 +1047,12 @@ void MergeTreeData::loadDataPartsFromDisk(
                throw;

            broken = true;
-            tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while loading part {} on path {}", part->name, part_path));
+            tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
        }
        catch (...)
        {
            broken = true;
-            tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while loading part {} on path {}", part->name, part_path));
+            tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
        }

        /// Ignore broken parts that can appear as a result of hard server restart.
@ -1066,7 +1066,7 @@ void MergeTreeData::loadDataPartsFromDisk(
            }
            catch (...)
            {
-                tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while calculating part size {} on path {}", part->name, part_path));
+                tryLogCurrentException(log, fmt::format("while calculating part size {} on path {}", part->name, part_path));
            }

            std::string part_size_str = "failed to calculate size";
@ -1902,7 +1902,9 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool
 void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_to_remove, NameSet * part_names_succeed)
 {
    const auto settings = getSettings();
-    if (parts_to_remove.size() > 1 && settings->max_part_removal_threads > 1 && parts_to_remove.size() > settings->concurrent_part_removal_threshold)
+    if (parts_to_remove.size() > 1
+        && settings->max_part_removal_threads > 1
+        && parts_to_remove.size() > settings->concurrent_part_removal_threshold)
    {
        /// Parallel parts removal.
        size_t num_threads = std::min<size_t>(settings->max_part_removal_threads, parts_to_remove.size());
@ -1917,7 +1919,7 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t
                if (thread_group)
                    CurrentThread::attachToIfDetached(thread_group);

-                LOG_DEBUG(log, "Removing part from filesystem {}", part->name);
+                LOG_DEBUG(log, "Removing part from filesystem {} (concurrently)", part->name);
                part->remove();
                if (part_names_succeed)
                {
@ -5069,6 +5071,8 @@ void MergeTreeData::Transaction::rollbackPartsToTemporaryState()
 void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part, DataPartStorageBuilderPtr builder)
 {
    precommitted_parts.insert(part);
+    if (asInMemoryPart(part))
+        has_in_memory_parts = true;
    part_builders.push_back(builder);
 }

@ -5091,6 +5095,12 @@ void MergeTreeData::Transaction::rollback()
    clear();
 }

+/// Forgets the precommitted parts and resets the in-memory-parts flag that addPart() raises.
+/// part_builders and locked_parts are not touched here.
+void MergeTreeData::Transaction::clear()
+{
+    precommitted_parts.clear();
+    has_in_memory_parts = false;
+}
+
 MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData::DataPartsLock * acquired_parts_lock)
 {
    DataPartsVector total_covered_parts;
@ -5098,20 +5108,30 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:
    if (!isEmpty())
    {
        auto settings = data.getSettings();
-        MergeTreeData::WriteAheadLogPtr wal;
        auto parts_lock = acquired_parts_lock ? MergeTreeData::DataPartsLock() : data.lockParts();
        auto * owing_parts_lock = acquired_parts_lock ? acquired_parts_lock : &parts_lock;

        for (auto & builder : part_builders)
            builder->commit();

-        if (txn)
+        bool commit_to_wal = has_in_memory_parts && settings->in_memory_parts_enable_wal;
+        if (txn || commit_to_wal)
        {
+            MergeTreeData::WriteAheadLogPtr wal;
+            if (commit_to_wal)
+                wal = data.getWriteAheadLog();
+
            for (const DataPartPtr & part : precommitted_parts)
            {
-                DataPartPtr covering_part;
-                DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
-                MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
+                if (txn)
+                {
+                    DataPartPtr covering_part;
+                    DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
+                    MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
+                }
+
+                /// `wal` is assigned only when commit_to_wal is set. With a transaction and the WAL disabled
+                /// this loop still runs, so in-memory parts must not be written through the null pointer.
+                if (auto part_in_memory = asInMemoryPart(part); commit_to_wal && part_in_memory)
+                    wal->addPart(part_in_memory);
            }
        }

@ -5128,15 +5148,6 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:

            for (const DataPartPtr & part : precommitted_parts)
            {
-                auto part_in_memory = asInMemoryPart(part);
-                if (part_in_memory && settings->in_memory_parts_enable_wal)
-                {
-                    if (!wal)
-                        wal = data.getWriteAheadLog();
-
-                    wal->addPart(part_in_memory);
-                }
-
                DataPartPtr covering_part;
                DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
                if (covering_part)
@ -6717,7 +6728,7 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S
    return true;
 }

-MergeTreeData::AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const
+AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const
 {
    MutationCommands commands = getFirstAlterMutationCommandsForPart(part);

--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@ -24,6 +24,7 @@
 #include <Storages/MergeTree/ZeroCopyLock.h>
 #include <Storages/MergeTree/TemporaryParts.h>
 #include <Storages/IndicesDescription.h>
+#include <Storages/MergeTree/AlterConversions.h>
 #include <Storages/DataDestinationType.h>
 #include <Storages/extractKeyExpressionList.h>
 #include <Storages/PartitionCommands.h>
@ -167,20 +168,6 @@ public:

    STRONG_TYPEDEF(String, PartitionID)

-    /// Alter conversions which should be applied on-fly for part. Build from of
-    /// the most recent mutation commands for part. Now we have only rename_map
-    /// here (from ALTER_RENAME) command, because for all other type of alters
-    /// we can deduce conversions for part from difference between
-    /// part->getColumns() and storage->getColumns().
-    struct AlterConversions
-    {
-        /// Rename map new_name -> old_name
-        std::unordered_map<String, String> rename_map;
-
-        bool isColumnRenamed(const String & new_name) const { return rename_map.contains(new_name); }
-        String getColumnOldName(const String & new_name) const { return rename_map.at(new_name); }
-    };
-
    struct LessDataPart
    {
        using is_transparent = void;
@ -290,8 +277,9 @@ public:
        DataParts precommitted_parts;
        std::vector<DataPartStorageBuilderPtr> part_builders;
        DataParts locked_parts;
+        bool has_in_memory_parts = false;

-        void clear() { precommitted_parts.clear(); }
+        void clear();
    };

    using TransactionUniquePtr = std::unique_ptr<Transaction>;
--- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp
@ -2,6 +2,7 @@
 #include <DataTypes/NestedUtils.h>
 #include <Storages/MergeTree/MergeTreeReaderCompact.h>
 #include <Storages/MergeTree/MergeTreeDataPartWriterCompact.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>


 namespace DB
@ -45,9 +46,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader(
    const ValueSizeMap & avg_value_size_hints,
    const ReadBufferFromFileBase::ProfileCallback & profile_callback) const
 {
-    auto ptr = std::static_pointer_cast<const MergeTreeDataPartCompact>(shared_from_this());
+    auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
    return std::make_unique<MergeTreeReaderCompact>(
-        ptr, columns_to_read, metadata_snapshot, uncompressed_cache,
+        read_info, columns_to_read, metadata_snapshot, uncompressed_cache,
        mark_cache, mark_ranges, reader_settings,
        avg_value_size_hints, profile_callback);
 }
@ -90,39 +91,44 @@ void MergeTreeDataPartCompact::calculateEachColumnSizes(ColumnSizeByName & /*eac
        total_size.marks += mrk_checksum->second.file_size;
 }

-void MergeTreeDataPartCompact::loadIndexGranularity()
+void MergeTreeDataPartCompact::loadIndexGranularityImpl(
+    MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_,
+    const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_)
 {
-    //String full_path = getRelativePath();
-
-    if (columns.empty())
-        throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
-
-    if (!index_granularity_info.is_adaptive)
+    if (!index_granularity_info_.is_adaptive)
        throw Exception("MergeTreeDataPartCompact cannot be created with non-adaptive granulary.", ErrorCodes::NOT_IMPLEMENTED);

-    auto marks_file_path = index_granularity_info.getMarksFilePath("data");
-    if (!data_part_storage->exists(marks_file_path))
+    auto marks_file_path = index_granularity_info_.getMarksFilePath("data");
+    if (!data_part_storage_->exists(marks_file_path))
        throw Exception(
            ErrorCodes::NO_FILE_IN_DATA_PART,
            "Marks file '{}' doesn't exist",
-            std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
+            std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));

-    size_t marks_file_size = data_part_storage->getFileSize(marks_file_path);
+    size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path);

-    auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
+    auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
    while (!buffer->eof())
    {
        /// Skip offsets for columns
-        buffer->seek(columns.size() * sizeof(MarkInCompressedFile), SEEK_CUR);
+        buffer->seek(columns_.size() * sizeof(MarkInCompressedFile), SEEK_CUR);
        size_t granularity;
        readIntBinary(granularity, *buffer);
-        index_granularity.appendMark(granularity);
+        index_granularity_.appendMark(granularity);
    }

-    if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes(columns.size()) != marks_file_size)
+    if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes(columns_.size()) != marks_file_size)
        throw Exception("Cannot read all marks from file " + marks_file_path, ErrorCodes::CANNOT_READ_ALL_DATA);

-    index_granularity.setInitialized();
+    index_granularity_.setInitialized();
+}
+
+/// Loads the index granularity of this part from its marks file.
+/// The column list is validated here because loadIndexGranularityImpl uses the column count
+/// to compute how many bytes to skip per mark; the actual reading is delegated to it.
+/// Throws NO_FILE_IN_DATA_PART if the part has no columns.
+void MergeTreeDataPartCompact::loadIndexGranularity()
+{
+    if (columns.empty())
+        throw Exception(ErrorCodes::NO_FILE_IN_DATA_PART, "No columns in part {}", name);
+
+    loadIndexGranularityImpl(index_granularity, index_granularity_info, columns, data_part_storage);
 }

 bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const
--- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h
+++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h
@ -65,6 +65,11 @@ public:

    ~MergeTreeDataPartCompact() override;

+protected:
+    /// Reads the granularity of every mark from the part's "data" marks file in data_part_storage_
+    /// and appends it to index_granularity_, which is marked as initialized on success.
+    /// Static: works only on its arguments, not on the members of a part instance.
+    /// Throws NOT_IMPLEMENTED if index_granularity_info_ is not adaptive, NO_FILE_IN_DATA_PART if
+    /// the marks file does not exist, and CANNOT_READ_ALL_DATA if the number of marks read does
+    /// not match the marks file size for columns_.size() columns.
+    static void loadIndexGranularityImpl(
+        MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_,
+        const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_);
+
 private:
    void checkConsistency(bool require_part_metadata) const override;

--- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp
@ -3,6 +3,7 @@
 #include <Storages/MergeTree/MergedBlockOutputStream.h>
 #include <Storages/MergeTree/MergeTreeDataPartWriterInMemory.h>
 #include <Storages/MergeTree/IMergeTreeReader.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <DataTypes/NestedUtils.h>
 #include <Interpreters/Context.h>
 #include <Poco/Logger.h>
@ -48,9 +49,10 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader(
    const ValueSizeMap & /* avg_value_size_hints */,
    const ReadBufferFromFileBase::ProfileCallback & /* profile_callback */) const
 {
+    auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
    auto ptr = std::static_pointer_cast<const MergeTreeDataPartInMemory>(shared_from_this());
    return std::make_unique<MergeTreeReaderInMemory>(
-        ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings);
+        read_info, ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings);
 }

 IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter(
--- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp
@ -2,6 +2,7 @@
 #include <Storages/MergeTree/MergeTreeReaderWide.h>
 #include <Storages/MergeTree/MergeTreeDataPartWriterWide.h>
 #include <Storages/MergeTree/IMergeTreeDataPartWriter.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <DataTypes/NestedUtils.h>
 #include <Core/NamesAndTypes.h>

@ -47,9 +48,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader(
    const ValueSizeMap & avg_value_size_hints,
    const ReadBufferFromFileBase::ProfileCallback & profile_callback) const
 {
-    auto ptr = std::static_pointer_cast<const MergeTreeDataPartWide>(shared_from_this());
+    auto read_info = std::make_shared<LoadedMergeTreeDataPartInfoForReader>(shared_from_this());
    return std::make_unique<MergeTreeReaderWide>(
-        ptr, columns_to_read,
+        read_info, columns_to_read,
        metadata_snapshot, uncompressed_cache,
        mark_cache, mark_ranges, reader_settings,
        avg_value_size_hints, profile_callback);
@ -103,46 +104,52 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl(
    return size;
 }

-void MergeTreeDataPartWide::loadIndexGranularity()
+void MergeTreeDataPartWide::loadIndexGranularityImpl(
+    MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_,
+    const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name)
 {
-    index_granularity_info.changeGranularityIfRequired(data_part_storage);
-
-
-    if (columns.empty())
-        throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART);
+    index_granularity_info_.changeGranularityIfRequired(data_part_storage_);

    /// We can use any column, it doesn't matter
-    std::string marks_file_path = index_granularity_info.getMarksFilePath(getFileNameForColumn(columns.front()));
-    if (!data_part_storage->exists(marks_file_path))
+    std::string marks_file_path = index_granularity_info_.getMarksFilePath(any_column_file_name);
+    if (!data_part_storage_->exists(marks_file_path))
        throw Exception(
            ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist",
-            std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
+            std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));

-    size_t marks_file_size = data_part_storage->getFileSize(marks_file_path);
+    size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path);

-    if (!index_granularity_info.is_adaptive)
+    if (!index_granularity_info_.is_adaptive)
    {
-        size_t marks_count = marks_file_size / index_granularity_info.getMarkSizeInBytes();
-        index_granularity.resizeWithFixedGranularity(marks_count, index_granularity_info.fixed_index_granularity); /// all the same
+        size_t marks_count = marks_file_size / index_granularity_info_.getMarkSizeInBytes();
+        index_granularity_.resizeWithFixedGranularity(marks_count, index_granularity_info_.fixed_index_granularity); /// all the same
    }
    else
    {
-        auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
+        auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt);
        while (!buffer->eof())
        {
            buffer->seek(sizeof(size_t) * 2, SEEK_CUR); /// skip offset_in_compressed file and offset_in_decompressed_block
            size_t granularity;
            readIntBinary(granularity, *buffer);
-            index_granularity.appendMark(granularity);
+            index_granularity_.appendMark(granularity);
        }

-        if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes() != marks_file_size)
+        if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes() != marks_file_size)
            throw Exception(
                ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all marks from file {}",
-                std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path));
+                std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path));
    }

-    index_granularity.setInitialized();
+    index_granularity_.setInitialized();
+}
+
+/// Loads the index granularity of this part from the marks file of its first column.
+/// The column list is validated here because columns.front() is used to pick that marks file;
+/// the actual reading is delegated to loadIndexGranularityImpl.
+/// Throws NO_FILE_IN_DATA_PART if the part has no columns.
+void MergeTreeDataPartWide::loadIndexGranularity()
+{
+    if (columns.empty())
+        throw Exception(ErrorCodes::NO_FILE_IN_DATA_PART, "No columns in part {}", name);
+
+    loadIndexGranularityImpl(index_granularity, index_granularity_info, data_part_storage, getFileNameForColumn(columns.front()));
 }

 bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const
--- a/src/Storages/MergeTree/MergeTreeDataPartWide.h
+++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h
@ -61,6 +61,11 @@ public:

    bool hasColumnFiles(const NameAndTypePair & column) const override;

+protected:
+    static void loadIndexGranularityImpl(
+        MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_,
+        const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name);
+
 private:
    void checkConsistency(bool require_part_metadata) const override;

--- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h
+++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h
@ -29,6 +29,8 @@ public:

    MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_);

+    MergeTreeIndexGranularityInfo(MergeTreeDataPartType type_, bool is_adaptive_, size_t index_granularity_, size_t index_granularity_bytes_);
+
    void changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage);

    String getMarksFilePath(const String & path_prefix) const
--- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp
+++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp
@ -83,7 +83,7 @@ MergeTreeRangeReader::DelayedStream::DelayedStream(
        : current_mark(from_mark), current_offset(0), num_delayed_rows(0)
        , current_task_last_mark(current_task_last_mark_)
        , merge_tree_reader(merge_tree_reader_)
-        , index_granularity(&(merge_tree_reader->data_part->index_granularity))
+        , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
        , continue_reading(false), is_finished(false)
 {
 }
@ -181,7 +181,7 @@ MergeTreeRangeReader::Stream::Stream(
        : current_mark(from_mark), offset_after_current_mark(0)
        , last_mark(to_mark)
        , merge_tree_reader(merge_tree_reader_)
-        , index_granularity(&(merge_tree_reader->data_part->index_granularity))
+        , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
        , current_mark_index_granularity(index_granularity->getMarkRows(from_mark))
        , stream(from_mark, current_task_last_mark, merge_tree_reader)
 {
@ -652,7 +652,7 @@ MergeTreeRangeReader::MergeTreeRangeReader(
    bool last_reader_in_chain_,
    const Names & non_const_virtual_column_names_)
    : merge_tree_reader(merge_tree_reader_)
-    , index_granularity(&(merge_tree_reader->data_part->index_granularity))
+    , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity()))
    , prev_reader(prev_reader_)
    , prewhere_info(prewhere_info_)
    , last_reader_in_chain(last_reader_in_chain_)
@ -946,7 +946,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t
    result.addRows(stream.finalize(result.columns));

    /// Last granule may be incomplete.
-    result.adjustLastGranule();
+    if (!result.rowsPerGranule().empty())
+        result.adjustLastGranule();

    for (const auto & column_name : non_const_virtual_column_names)
    {
--- a/src/Storages/MergeTree/MergeTreeReadPool.cpp
+++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp
@ -1,5 +1,6 @@
 #include <Storages/MergeTree/MergeTreeReadPool.h>
 #include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <Common/formatReadable.h>
 #include <base/range.h>

@ -22,7 +23,6 @@ MergeTreeReadPool::MergeTreeReadPool(
    size_t sum_marks_,
    size_t min_marks_for_concurrent_read_,
    RangesInDataParts && parts_,
-    const MergeTreeData & data_,
    const StorageSnapshotPtr & storage_snapshot_,
    const PrewhereInfoPtr & prewhere_info_,
    const Names & column_names_,
@ -32,7 +32,6 @@ MergeTreeReadPool::MergeTreeReadPool(
    bool do_not_steal_tasks_)
    : backoff_settings{backoff_settings_}
    , backoff_state{threads_}
-    , data{data_}
    , storage_snapshot{storage_snapshot_}
    , column_names{column_names_}
    , virtual_column_names{virtual_column_names_}
@ -214,7 +213,7 @@ std::vector<size_t> MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts &
        per_part_sum_marks.push_back(sum_marks);

        auto task_columns = getReadTaskColumns(
-            data, storage_snapshot, part.data_part,
+            LoadedMergeTreeDataPartInfoForReader(part.data_part), storage_snapshot,
            column_names, virtual_column_names, prewhere_info, /*with_subcolumns=*/ true);

        auto size_predictor = !predict_block_size_bytes ? nullptr
--- a/src/Storages/MergeTree/MergeTreeReadPool.h
+++ b/src/Storages/MergeTree/MergeTreeReadPool.h
@ -70,11 +70,16 @@ private:

 public:
    MergeTreeReadPool(
-        size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_,
-        RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_,
+        size_t threads_,
+        size_t sum_marks_,
+        size_t min_marks_for_concurrent_read_,
+        RangesInDataParts && parts_,
+        const StorageSnapshotPtr & storage_snapshot_,
        const PrewhereInfoPtr & prewhere_info_,
-        const Names & column_names_, const Names & virtual_column_names_,
-        const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_,
+        const Names & column_names_,
+        const Names & virtual_column_names_,
+        const BackoffSettings & backoff_settings_,
+        size_t preferred_block_size_bytes_,
        bool do_not_steal_tasks_ = false);

    MergeTreeReadTaskPtr getTask(size_t min_marks_to_read, size_t thread, const Names & ordered_names);
@ -94,7 +99,6 @@ private:
        size_t threads, size_t sum_marks, std::vector<size_t> per_part_sum_marks,
        const RangesInDataParts & parts, size_t min_marks_for_concurrent_read);

-    const MergeTreeData & data;
    StorageSnapshotPtr storage_snapshot;
    const Names column_names;
    const Names virtual_column_names;
--- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp
@ -15,7 +15,7 @@ namespace ErrorCodes


 MergeTreeReaderCompact::MergeTreeReaderCompact(
-    DataPartCompactPtr data_part_,
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    NamesAndTypesList columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
@ -26,7 +26,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
    const ReadBufferFromFileBase::ProfileCallback & profile_callback_,
    clockid_t clock_type_)
    : IMergeTreeReader(
-        data_part_,
+        data_part_info_for_read_,
        columns_,
        metadata_snapshot_,
        uncompressed_cache_,
@ -35,14 +35,14 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
        settings_,
        avg_value_size_hints_)
    , marks_loader(
-          data_part->data_part_storage,
+          data_part_info_for_read_->getDataPartStorage(),
          mark_cache,
-          data_part->index_granularity_info.getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME),
-          data_part->getMarksCount(),
-          data_part->index_granularity_info,
+          data_part_info_for_read_->getIndexGranularityInfo().getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME),
+          data_part_info_for_read_->getMarksCount(),
+          data_part_info_for_read_->getIndexGranularityInfo(),
          settings.save_marks_in_cache,
          settings.read_settings,
-          data_part->getColumns().size())
+          data_part_info_for_read_->getColumns().size())
 {
    try
    {
@ -64,7 +64,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
                    continue;
            }

-            auto position = data_part->getColumnPosition(column_to_read.getNameInStorage());
+            auto position = data_part_info_for_read->getColumnPosition(column_to_read.getNameInStorage());
            if (!position && typeid_cast<const DataTypeArray *>(column_to_read.type.get()))
            {
                /// If array of Nested column is missing in part,
@ -77,7 +77,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
        }

        /// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data.
-        auto buffer_size = getReadBufferSize(data_part, marks_loader, column_positions, all_mark_ranges);
+        auto buffer_size = getReadBufferSize(*data_part_info_for_read, marks_loader, column_positions, all_mark_ranges);
        if (buffer_size)
            settings.read_settings = settings.read_settings.adjustBufferSize(buffer_size);

@ -88,10 +88,10 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
        if (uncompressed_cache)
        {
            auto buffer = std::make_unique<CachedCompressedReadBuffer>(
-                std::string(fs::path(data_part->data_part_storage->getFullPath()) / path),
+                std::string(fs::path(data_part_info_for_read->getDataPartStorage()->getFullPath()) / path),
                [this, path]()
                {
-                    return data_part->data_part_storage->readFile(
+                    return data_part_info_for_read->getDataPartStorage()->readFile(
                        path,
                        settings.read_settings,
                        std::nullopt, std::nullopt);
@ -113,7 +113,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
        {
            auto buffer =
                std::make_unique<CompressedReadBufferFromFile>(
-                    data_part->data_part_storage->readFile(
+                    data_part_info_for_read->getDataPartStorage()->readFile(
                        path,
                        settings.read_settings,
                        std::nullopt, std::nullopt),
@ -132,7 +132,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
    }
    catch (...)
    {
-        storage.reportBrokenPart(data_part);
+        data_part_info_for_read->reportBroken();
        throw;
    }
 }
@ -156,7 +156,7 @@ size_t MergeTreeReaderCompact::readRows(

    while (read_rows < max_rows_to_read)
    {
-        size_t rows_to_read = data_part->index_granularity.getMarkRows(from_mark);
+        size_t rows_to_read = data_part_info_for_read->getIndexGranularity().getMarkRows(from_mark);

        for (size_t pos = 0; pos < num_columns; ++pos)
        {
@ -179,7 +179,7 @@ size_t MergeTreeReaderCompact::readRows(
            catch (Exception & e)
            {
                if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
-                    storage.reportBrokenPart(data_part);
+                    data_part_info_for_read->reportBroken();

                /// Better diagnostics.
                e.addMessage("(while reading column " + columns_to_read[pos].name + ")");
@ -187,7 +187,7 @@ size_t MergeTreeReaderCompact::readRows(
            }
            catch (...)
            {
-                storage.reportBrokenPart(data_part);
+                data_part_info_for_read->reportBroken();
                throw;
            }
        }
@ -279,7 +279,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index)
 void MergeTreeReaderCompact::adjustUpperBound(size_t last_mark)
 {
    size_t right_offset = 0;
-    if (last_mark < data_part->getMarksCount()) /// Otherwise read until the end of file
+    if (last_mark < data_part_info_for_read->getMarksCount()) /// Otherwise read until the end of file
        right_offset = marks_loader.getMark(last_mark).offset_in_compressed_file;

    if (right_offset == 0)
@ -307,7 +307,7 @@ bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_posi
        return false;
    const auto & [last_mark, last_column] = *last_read_granule;
    return (mark == last_mark && column_position == last_column + 1)
-        || (mark == last_mark + 1 && column_position == 0 && last_column == data_part->getColumns().size() - 1);
+        || (mark == last_mark + 1 && column_position == 0 && last_column == data_part_info_for_read->getColumns().size() - 1);
 }

 namespace
@ -359,16 +359,16 @@ private:
 }

 size_t MergeTreeReaderCompact::getReadBufferSize(
-    const DataPartPtr & part,
+    const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
    MergeTreeMarksLoader & marks_loader,
    const ColumnPositions & column_positions,
    const MarkRanges & mark_ranges)
 {
    size_t buffer_size = 0;
    size_t columns_num = column_positions.size();
-    size_t file_size = part->getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION);
+    size_t file_size = data_part_info_for_reader.getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION);

-    MarksCounter counter(part->getMarksCount(), part->getColumns().size());
+    MarksCounter counter(data_part_info_for_reader.getMarksCount(), data_part_info_for_reader.getColumns().size());

    for (const auto & mark_range : mark_ranges)
    {
--- a/src/Storages/MergeTree/MergeTreeReaderCompact.h
+++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h
@ -19,7 +19,7 @@ class MergeTreeReaderCompact : public IMergeTreeReader
 {
 public:
    MergeTreeReaderCompact(
-        DataPartCompactPtr data_part_,
+        MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
        NamesAndTypesList columns_,
        const StorageMetadataPtr & metadata_snapshot_,
        UncompressedCache * uncompressed_cache_,
@ -67,7 +67,7 @@ private:
    /// Returns maximal value of granule size in compressed file from @mark_ranges.
    /// This value is used as size of read buffer.
    static size_t getReadBufferSize(
-        const DataPartPtr & part,
+        const IMergeTreeDataPartInfoForReader & data_part_info_for_reader,
        MergeTreeMarksLoader & marks_loader,
        const ColumnPositions & column_positions,
        const MarkRanges & mark_ranges);
--- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp
@ -16,13 +16,14 @@ namespace ErrorCodes


 MergeTreeReaderInMemory::MergeTreeReaderInMemory(
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    DataPartInMemoryPtr data_part_,
    NamesAndTypesList columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    MarkRanges mark_ranges_,
    MergeTreeReaderSettings settings_)
    : IMergeTreeReader(
-        data_part_,
+        data_part_info_for_read_,
        columns_,
        metadata_snapshot_,
        nullptr,
@ -48,7 +49,7 @@ size_t MergeTreeReaderInMemory::readRows(
    if (!continue_reading)
        total_rows_read = 0;

-    size_t total_marks = data_part->index_granularity.getMarksCount();
+    size_t total_marks = data_part_info_for_read->getIndexGranularity().getMarksCount();
    if (from_mark >= total_marks)
        throw Exception("Mark " + toString(from_mark) + " is out of bound. Max mark: "
            + toString(total_marks), ErrorCodes::ARGUMENT_OUT_OF_BOUND);
--- a/src/Storages/MergeTree/MergeTreeReaderInMemory.h
+++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.h
@ -15,6 +15,7 @@ class MergeTreeReaderInMemory : public IMergeTreeReader
 {
 public:
    MergeTreeReaderInMemory(
+        MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
        DataPartInMemoryPtr data_part_,
        NamesAndTypesList columns_,
        const StorageMetadataPtr & metadata_snapshot_,
--- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp
@ -26,7 +26,7 @@ namespace ErrorCodes
 }

 MergeTreeReaderWide::MergeTreeReaderWide(
-    DataPartWidePtr data_part_,
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    NamesAndTypesList columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
@ -37,7 +37,7 @ MergeTreeReaderWide::MergeTreeReaderWide(
    const ReadBufferFromFileBase::ProfileCallback & profile_callback_,
    clockid_t clock_type_)
    : IMergeTreeReader(
-        data_part_,
+        data_part_info_for_read_,
        columns_,
        metadata_snapshot_,
        uncompressed_cache_,
@ -53,7 +53,7 @@ MergeTreeReaderWide::MergeTreeReaderWide(
    }
    catch (...)
    {
-        storage.reportBrokenPart(data_part);
+        data_part_info_for_read->reportBroken();
        throw;
    }
 }
@ -73,7 +73,7 @@ size_t MergeTreeReaderWide::readRows(
        std::unordered_map<String, ISerialization::SubstreamsCache> caches;

        std::unordered_set<std::string> prefetched_streams;
-        if (data_part->data_part_storage->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch)
+        if (data_part_info_for_read->getDataPartStorage()->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch)
        {
            /// Request reading of data in advance,
            /// so if reading can be asynchronous, it will also be performed in parallel for all columns.
@ -136,17 +136,17 @@ size_t MergeTreeReaderWide::readRows(
    catch (Exception & e)
    {
        if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED)
-            storage.reportBrokenPart(data_part);
+            data_part_info_for_read->reportBroken();

        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + " "
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + " "
                     "from mark " + toString(from_mark) + " "
                     "with max_rows_to_read = " + toString(max_rows_to_read) + ")");
        throw;
    }
    catch (...)
    {
-        storage.reportBrokenPart(data_part);
+        data_part_info_for_read->reportBroken();

        throw;
    }
@ -167,7 +167,7 @@ void MergeTreeReaderWide::addStreams(
        if (streams.contains(stream_name))
            return;

-        bool data_file_exists = data_part->checksums.files.contains(stream_name + DATA_FILE_EXTENSION);
+        bool data_file_exists = data_part_info_for_read->getChecksums().files.contains(stream_name + DATA_FILE_EXTENSION);

        /** If data file is missing then we will not try to open it.
          * It is necessary since it allows to add new column to structure of the table without creating new files for old parts.
@ -178,10 +178,10 @@ void MergeTreeReaderWide::addStreams(
        bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys;

        streams.emplace(stream_name, std::make_unique<MergeTreeReaderStream>(
-            data_part->data_part_storage, stream_name, DATA_FILE_EXTENSION,
-            data_part->getMarksCount(), all_mark_ranges, settings, mark_cache,
-            uncompressed_cache, data_part->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION),
-            &data_part->index_granularity_info,
+            data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION,
+            data_part_info_for_read->getMarksCount(), all_mark_ranges, settings, mark_cache,
+            uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION),
+            &data_part_info_for_read->getIndexGranularityInfo(),
            profile_callback, clock_type, is_lc_dict));
    };

--- a/src/Storages/MergeTree/MergeTreeReaderWide.h
+++ b/src/Storages/MergeTree/MergeTreeReaderWide.h
@ -15,7 +15,7 @@ class MergeTreeReaderWide : public IMergeTreeReader
 {
 public:
    MergeTreeReaderWide(
-        DataPartWidePtr data_part_,
+        MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
        NamesAndTypesList columns_,
        const StorageMetadataPtr & metadata_snapshot_,
        UncompressedCache * uncompressed_cache_,
--- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp
+++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp
@ -1,6 +1,7 @@
 #include <Storages/MergeTree/MergeTreeSelectProcessor.h>
 #include <Storages/MergeTree/MergeTreeBaseSelectProcessor.h>
 #include <Storages/MergeTree/IMergeTreeReader.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <Interpreters/Context.h>


@ -51,7 +52,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor(
 void MergeTreeSelectProcessor::initializeReaders()
 {
    task_columns = getReadTaskColumns(
-        storage, storage_snapshot, data_part,
+        LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot,
        required_columns, virt_column_names, prewhere_info, /*with_subcolumns=*/ true);

    /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter
--- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp
+++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp
@ -1,5 +1,6 @@
 #include <Storages/MergeTree/MergeTreeSequentialSource.h>
 #include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
+#include <Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h>
 #include <Processors/Transforms/FilterTransform.h>
 #include <QueryPipeline/Pipe.h>
 #include <Interpreters/Context.h>
@ -102,7 +103,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource(
    addTotalRowsApprox(data_part->rows_count);

    /// Add columns because we don't want to read empty blocks
-    injectRequiredColumns(storage, storage_snapshot, data_part, /*with_subcolumns=*/ false, columns_to_read);
+    injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read);

    NamesAndTypesList columns_for_reader;
    if (take_column_types_from_storage)
--- a/src/Storages/MergeTree/MergeTreeSink.cpp
+++ b/src/Storages/MergeTree/MergeTreeSink.cpp
@ -23,6 +23,7 @@ MergeTreeSink::MergeTreeSink(
    , metadata_snapshot(metadata_snapshot_)
    , max_parts_per_block(max_parts_per_block_)
    , context(context_)
+    , storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context))
 {
 }

@ -54,7 +55,6 @@ struct MergeTreeSink::DelayedChunk
 void MergeTreeSink::consume(Chunk chunk)
 {
    auto block = getHeader().cloneWithColumns(chunk.detachColumns());
-    auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context);

    storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block);
    auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context);
--- a/src/Storages/MergeTree/MergeTreeSink.h
+++ b/src/Storages/MergeTree/MergeTreeSink.h
@ -9,6 +9,8 @@ namespace DB

 class Block;
 class StorageMergeTree;
+struct StorageSnapshot;
+using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;


 class MergeTreeSink : public SinkToStorage
@ -32,6 +34,7 @@ private:
    StorageMetadataPtr metadata_snapshot;
    size_t max_parts_per_block;
    ContextPtr context;
+    StorageSnapshotPtr storage_snapshot;
    uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token

    /// We can delay processing for previous chunk and start writing a new one.
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@ -41,23 +41,29 @@ struct ReplicatedMergeTreeSink::DelayedChunk
        String block_id;
    };

+    DelayedChunk() = default;
+    explicit DelayedChunk(size_t replicas_num_) : replicas_num(replicas_num_) {}
+
+    size_t replicas_num = 0;
+
    std::vector<Partition> partitions;
 };

 ReplicatedMergeTreeSink::ReplicatedMergeTreeSink(
    StorageReplicatedMergeTree & storage_,
    const StorageMetadataPtr & metadata_snapshot_,
-    size_t quorum_,
+    size_t quorum_size,
    size_t quorum_timeout_ms_,
    size_t max_parts_per_block_,
    bool quorum_parallel_,
    bool deduplicate_,
+    bool majority_quorum,
    ContextPtr context_,
    bool is_attach_)
    : SinkToStorage(metadata_snapshot_->getSampleBlock())
    , storage(storage_)
    , metadata_snapshot(metadata_snapshot_)
-    , quorum(quorum_)
+    , required_quorum_size(majority_quorum ? std::nullopt : std::make_optional<size_t>(quorum_size))
    , quorum_timeout_ms(quorum_timeout_ms_)
    , max_parts_per_block(max_parts_per_block_)
    , is_attach(is_attach_)
@ -65,15 +71,15 @@ ReplicatedMergeTreeSink::ReplicatedMergeTreeSink(
    , deduplicate(deduplicate_)
    , log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)"))
    , context(context_)
+    , storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context))
 {
    /// The quorum value `1` has the same meaning as if it is disabled.
-    if (quorum == 1)
-        quorum = 0;
+    if (required_quorum_size == 1)
+        required_quorum_size = 0;
 }

 ReplicatedMergeTreeSink::~ReplicatedMergeTreeSink() = default;

-
 /// Allow to verify that the session in ZooKeeper is still alive.
 static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
 {
@ -84,9 +90,11 @@ static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
        throw Exception("ZooKeeper session has been expired.", ErrorCodes::NO_ZOOKEEPER);
 }

-
-void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper)
+size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper)
 {
+    if (!isQuorumEnabled())
+        return 0;
+
    quorum_info.status_path = storage.zookeeper_path + "/quorum/status";

    Strings replicas = zookeeper->getChildren(fs::path(storage.zookeeper_path) / "replicas");
@ -104,9 +112,12 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo
        if (status.get().error == Coordination::Error::ZOK)
            ++active_replicas;

-    if (active_replicas < quorum)
-        throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}).",
-                        active_replicas, quorum);
+    size_t replicas_number = replicas.size();
+    size_t quorum_size = getQuorumSize(replicas_number);
+
+    if (active_replicas < quorum_size)
+        throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}/{}).",
+                        active_replicas, quorum_size, replicas_number);

    /** Is there a quorum for the last part for which a quorum is needed?
        * Write of all the parts with the included quorum is linearly ordered.
@ -132,8 +143,9 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo
    quorum_info.is_active_node_value = is_active.data;
    quorum_info.is_active_node_version = is_active.stat.version;
    quorum_info.host_node_version = host.stat.version;
-}

+    return replicas_number;
+}

 void ReplicatedMergeTreeSink::consume(Chunk chunk)
 {
@ -147,10 +159,8 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
      * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node).
      * TODO Too complex logic, you can do better.
      */
-    if (quorum)
-        checkQuorumPrecondition(zookeeper);
+    size_t replicas_num = checkQuorumPrecondition(zookeeper);

-    auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context);
    storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block);
    auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context);

@ -193,11 +203,11 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
            }

            block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token);
-            LOG_DEBUG(log, "Wrote block with ID '{}', {} rows", block_id, current_block.block.rows());
+            LOG_DEBUG(log, "Wrote block with ID '{}', {} rows on {} replicas", block_id, current_block.block.rows(), replicas_num);
        }
        else
        {
-            LOG_DEBUG(log, "Wrote block with {} rows", current_block.block.rows());
+            LOG_DEBUG(log, "Wrote block with {} rows on {} replicas", current_block.block.rows(), replicas_num);
        }

        UInt64 elapsed_ns = watch.elapsed();
@ -211,7 +221,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
        if (streams > max_insert_delayed_streams_for_parallel_write)
        {
            finishDelayedChunk(zookeeper);
-            delayed_chunk = std::make_unique<ReplicatedMergeTreeSink::DelayedChunk>();
+            delayed_chunk = std::make_unique<ReplicatedMergeTreeSink::DelayedChunk>(replicas_num);
            delayed_chunk->partitions = std::move(partitions);
            finishDelayedChunk(zookeeper);

@ -254,7 +264,7 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe

        try
        {
-            commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder);
+            commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder, delayed_chunk->replicas_num);

            last_block_is_duplicate = last_block_is_duplicate || part->is_duplicate;

@ -273,7 +283,6 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe
    delayed_chunk.reset();
 }

-
 void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPtr & part)
 {
    /// NOTE: No delay in this case. That's Ok.
@ -281,15 +290,14 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);

-    if (quorum)
-        checkQuorumPrecondition(zookeeper);
+    size_t replicas_num = checkQuorumPrecondition(zookeeper);

    Stopwatch watch;

    try
    {
        part->version.setCreationTID(Tx::PrehistoricTID, nullptr);
-        commitPart(zookeeper, part, "", part->data_part_storage->getBuilder());
+        commitPart(zookeeper, part, "", part->data_part_storage->getBuilder(), replicas_num);
        PartLog::addNewPart(storage.getContext(), part, watch.elapsed());
    }
    catch (...)
@ -299,12 +307,12 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
    }
 }

-
 void ReplicatedMergeTreeSink::commitPart(
    zkutil::ZooKeeperPtr & zookeeper,
    MergeTreeData::MutableDataPartPtr & part,
    const String & block_id,
-    DataPartStorageBuilderPtr builder)
+    DataPartStorageBuilderPtr builder,
+    size_t replicas_num)
 {
    metadata_snapshot->check(part->getColumns());
    assertSessionIsNotExpired(zookeeper);
@ -367,7 +375,7 @@ void ReplicatedMergeTreeSink::commitPart(
            log_entry.source_replica = storage.replica_name;
            log_entry.new_part_name = part->name;
            /// TODO maybe add UUID here as well?
-            log_entry.quorum = quorum;
+            log_entry.quorum = getQuorumSize(replicas_num);
            log_entry.block_id = block_id;
            log_entry.new_part_type = part->getType();

@ -384,11 +392,11 @@ void ReplicatedMergeTreeSink::commitPart(
              *  but for it the quorum has not yet been reached.
              *  You can not do the next quorum record at this time.)
              */
-            if (quorum)
+            if (isQuorumEnabled())
            {
                ReplicatedMergeTreeQuorumEntry quorum_entry;
                quorum_entry.part_name = part->name;
-                quorum_entry.required_number_of_replicas = quorum;
+                quorum_entry.required_number_of_replicas = getQuorumSize(replicas_num);
                quorum_entry.replicas.insert(storage.replica_name);

                /** At this point, this node will contain information that the current replica received a part.
@ -436,7 +444,7 @@ void ReplicatedMergeTreeSink::commitPart(
            {
                part->is_duplicate = true;
                ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);
-                if (quorum)
+                if (isQuorumEnabled())
                {
                    LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it, but checking quorum.", block_id, existing_part_name);

@ -446,7 +454,7 @@ void ReplicatedMergeTreeSink::commitPart(
                    else
                        quorum_path = storage.zookeeper_path + "/quorum/status";

-                    waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value);
+                    waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value, replicas_num);
                }
                else
                {
@ -593,7 +601,7 @@ void ReplicatedMergeTreeSink::commitPart(
        break;
    }

-    if (quorum)
+    if (isQuorumEnabled())
    {
        if (is_already_existing_part)
        {
@ -605,7 +613,7 @@ void ReplicatedMergeTreeSink::commitPart(
                storage.updateQuorum(part->name, false);
        }

-        waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value);
+        waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value, replicas_num);
    }
 }

@ -627,10 +635,11 @@ void ReplicatedMergeTreeSink::waitForQuorum(
    zkutil::ZooKeeperPtr & zookeeper,
    const std::string & part_name,
    const std::string & quorum_path,
-    const std::string & is_active_node_value) const
+    const std::string & is_active_node_value,
+    size_t replicas_num) const
 {
    /// We are waiting for quorum to be satisfied.
-    LOG_TRACE(log, "Waiting for quorum");
+    LOG_TRACE(log, "Waiting for quorum '{}' for part {} on {} replicas", quorum_path, part_name, replicas_num);

    try
    {
@ -654,7 +663,7 @@ void ReplicatedMergeTreeSink::waitForQuorum(
            if (!event->tryWait(quorum_timeout_ms))
                throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED);

-            LOG_TRACE(log, "Quorum {} updated, will check quorum node still exists", quorum_path);
+            LOG_TRACE(log, "Quorum {} for part {} updated, will check quorum node still exists", quorum_path, part_name);
        }

        /// And what if it is possible that the current replica at this time has ceased to be active
@ -672,8 +681,23 @@ void ReplicatedMergeTreeSink::waitForQuorum(
            ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
    }

-    LOG_TRACE(log, "Quorum satisfied");
+    LOG_TRACE(log, "Quorum '{}' for part {} satisfied", quorum_path, part_name);
 }

+/// Number of replicas that must confirm an insert: 0 when quorum is disabled, otherwise the explicitly
+/// requested size or, when none is set (majority mode), more than half of `replicas_num`.
+size_t ReplicatedMergeTreeSink::getQuorumSize(size_t replicas_num) const
+{
+    if (!isQuorumEnabled())
+        return 0;
+
+    return required_quorum_size.value_or(replicas_num / 2 + 1);
+}
+
+/// Quorum is enabled in majority mode (no explicit size) or when the explicit size is greater than 1.
+bool ReplicatedMergeTreeSink::isQuorumEnabled() const
+{
+    return !required_quorum_size || *required_quorum_size > 1;
+}

 }
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h
@ -17,6 +17,8 @@ namespace DB
 {

 class StorageReplicatedMergeTree;
+struct StorageSnapshot;
+using StorageSnapshotPtr = std::shared_ptr<StorageSnapshot>;


 class ReplicatedMergeTreeSink : public SinkToStorage
@ -30,6 +32,7 @@ public:
        size_t max_parts_per_block_,
        bool quorum_parallel_,
        bool deduplicate_,
+        bool majority_quorum_,
        ContextPtr context_,
        // special flag to determine the ALTER TABLE ATTACH PART without the query context,
        // needed to set the special LogEntryType::ATTACH_PART
@ -66,24 +69,34 @@ private:
    };

    QuorumInfo quorum_info;
-    void checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper);
+
+    /// Checks active replicas.
+    /// Returns total number of replicas.
+    size_t checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper);

    /// Rename temporary part and commit to ZooKeeper.
    void commitPart(
        zkutil::ZooKeeperPtr & zookeeper,
        MergeTreeData::MutableDataPartPtr & part,
        const String & block_id,
-        DataPartStorageBuilderPtr part_builder);
+        DataPartStorageBuilderPtr part_builder,
+        size_t replicas_num);

    /// Wait for quorum to be satisfied on path (quorum_path) form part (part_name)
    /// Also checks that replica still alive.
    void waitForQuorum(
        zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name,
-        const std::string & quorum_path, const std::string & is_active_node_value) const;
+        const std::string & quorum_path, const std::string & is_active_node_value, size_t replicas_num) const;

    StorageReplicatedMergeTree & storage;
    StorageMetadataPtr metadata_snapshot;
-    size_t quorum;
+
+    /// Empty means use majority quorum.
+    std::optional<size_t> required_quorum_size;
+
+    size_t getQuorumSize(size_t replicas_num) const;
+    bool isQuorumEnabled() const;
+
    size_t quorum_timeout_ms;
    size_t max_parts_per_block;

@ -96,6 +109,8 @@ private:
    Poco::Logger * log;

    ContextPtr context;
+    StorageSnapshotPtr storage_snapshot;
+
    UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token

    /// We can delay processing for previous chunk and start writing a new one.
--- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
+++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp
@ -1,6 +1,7 @@
 #include <Storages/checkAndGetLiteralArgument.h>
 #include <Storages/RocksDB/StorageEmbeddedRocksDB.h>
 #include <Storages/RocksDB/EmbeddedRocksDBSink.h>
+#include <Storages/MutationCommands.h>

 #include <DataTypes/DataTypesNumber.h>

@ -10,11 +11,15 @@
 #include <Parsers/ASTCreateQuery.h>

 #include <QueryPipeline/Pipe.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Processors/ISource.h>

 #include <Interpreters/castColumn.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/TreeRewriter.h>
+#include <Interpreters/MutationsInterpreter.h>
+
+#include <Processors/Executors/PullingPipelineExecutor.h>

 #include <Poco/Logger.h>
 #include <Poco/Util/AbstractConfiguration.h>
@ -200,6 +205,92 @@ void StorageEmbeddedRocksDB::truncate(const ASTPtr &, const StorageMetadataPtr &
    initDB();
 }

+/// Accepts an empty command list or exactly one UPDATE / DELETE command; throws BAD_ARGUMENTS otherwise.
+void StorageEmbeddedRocksDB::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const
+{
+    if (commands.empty())
+        return;
+    if (commands.size() > 1)
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mutations cannot be combined for EmbeddedRocksDB");
+
+    const auto kind = commands.front().type;
+    if (!(kind == MutationCommand::Type::UPDATE || kind == MutationCommand::Type::DELETE))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only DELETE and UPDATE mutation supported for EmbeddedRocksDB");
+}
+
+/// Executes the mutation within this call. Expects at most one command, of type DELETE or UPDATE.
+void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPtr context_)
+{
+    if (commands.empty())
+        return;
+
+    assert(commands.size() == 1);
+
+    auto metadata_snapshot = getInMemoryMetadataPtr();
+    auto storage = getStorageID();
+    auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context_);
+
+    if (commands.front().type == MutationCommand::Type::DELETE)
+    {
+        /// Pull the blocks produced by the interpreter (return_deleted_rows_ is set) and erase their keys.
+        auto interpreter = std::make_unique<MutationsInterpreter>(
+            storage_ptr,
+            metadata_snapshot,
+            commands,
+            context_,
+            /*can_execute_*/ true,
+            /*return_all_columns_*/ true,
+            /*return_deleted_rows_*/ true);
+        auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute());
+        PullingPipelineExecutor executor(pipeline);
+
+        Block block;
+        while (executor.pull(block))
+        {
+            auto column_it = std::find_if(block.begin(), block.end(), [&](const auto & column) { return column.name == primary_key; });
+            assert(column_it != block.end());
+
+            auto column = column_it->column;
+            auto size = column->size();
+
+            /// Queue one Delete per serialized key, then apply the whole block with a single Write.
+            rocksdb::WriteBatch batch;
+            WriteBufferFromOwnString wb_key;
+            for (size_t i = 0; i < size; ++i)
+            {
+                wb_key.restart();
+
+                column_it->type->getDefaultSerialization()->serializeBinary(*column, i, wb_key);
+                auto status = batch.Delete(wb_key.str());
+                if (!status.ok())
+                    throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR);
+            }
+
+            auto status = rocksdb_ptr->Write(rocksdb::WriteOptions(), &batch);
+            if (!status.ok())
+                throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR);
+        }
+
+        return;
+    }
+
+    assert(commands.front().type == MutationCommand::Type::UPDATE);
+    if (commands.front().column_to_update_expression.contains(primary_key))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated");
+
+    auto interpreter = std::make_unique<MutationsInterpreter>(
+        storage_ptr, metadata_snapshot, commands, context_, /*can_execute_*/ true, /*return_all_columns*/ true);
+    auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute());
+    PullingPipelineExecutor executor(pipeline);
+
+    /// Feed every block produced by the interpreter to an EmbeddedRocksDBSink.
+    auto sink = std::make_shared<EmbeddedRocksDBSink>(*this, metadata_snapshot);
+
+    Block block;
+    while (executor.pull(block))
+        sink->consume(Chunk{block.getColumns(), block.rows()});
+}
+
 void StorageEmbeddedRocksDB::initDB()
 {
    rocksdb::Status status;
--- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h
+++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h
@ -51,6 +51,9 @@ public:
    SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override;
    void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override;

+    void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override;
+    void mutate(const MutationCommands &, ContextPtr) override;
+
    bool supportsParallelInsert() const override { return true; }
    bool supportsIndexForIn() const override { return true; }
    bool mayBenefitFromIndexForIn(
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@ -2180,7 +2180,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
            if (interserver_scheme != address.scheme)
                throw Exception("Interserver schemas are different '" + interserver_scheme + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::LOGICAL_ERROR);

-            part_desc->res_part = fetcher.fetchPart(
+            part_desc->res_part = fetcher.fetchSelectedPart(
                metadata_snapshot, getContext(), part_desc->found_new_part_name, source_replica_path,
                address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(),
                interserver_scheme, replicated_fetches_throttler, false, TMP_PREFIX + "fetch_");
@ -2299,7 +2299,7 @@ void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entr
                                + "' != '" + address.scheme + "', can't fetch part from " + address.host,
                                ErrorCodes::LOGICAL_ERROR);

-            return fetcher.fetchPart(
+            return fetcher.fetchSelectedPart(
                metadata_snapshot, getContext(), entry.new_part_name, source_replica_path,
                address.host, address.replication_port,
                timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme,
@ -3641,8 +3641,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_
        if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas)
        {
            /// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum.
-            LOG_TRACE(log, "Got {} replicas confirmed quorum {}, going to remove node",
-                      quorum_entry.replicas.size(), quorum_status_path);
+            LOG_TRACE(log, "Got {} (of {}) replicas confirmed quorum {}, going to remove node",
+                      quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path);

            Coordination::Requests ops;
            Coordination::Responses responses;
@ -3690,8 +3690,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_
        }
        else
        {
-            LOG_TRACE(log, "Quorum {} still not satisfied (have only {} replicas), updating node",
-                      quorum_status_path, quorum_entry.replicas.size());
+            LOG_TRACE(log, "Quorum {} still not satisfied (have only {} of {} replicas), updating node",
+                      quorum_status_path, quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas);
            /// We update the node, registering there one more replica.
            auto code = zookeeper->trySet(quorum_status_path, quorum_entry.toString(), stat.version);

@ -3831,9 +3831,10 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora

    LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);

+    auto settings_ptr = getSettings();
    TableLockHolder table_lock_holder;
    if (!to_detached)
-        table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
+        table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, settings_ptr->lock_acquire_timeout_for_background_operations);

    /// Logging
    Stopwatch stopwatch;
@ -3857,7 +3858,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
        covered_part_info.mutation = 0;
        auto source_part = getActiveContainingPart(covered_part_info);

-        if (source_part)
+        /// Fetch for zero-copy replication is cheap and straightforward, so we don't use local clone here
+        if (source_part && (!settings_ptr->allow_remote_fs_zero_copy_replication || !source_part->data_part_storage->supportZeroCopyReplication()))
        {
            auto source_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
                source_part->getColumns(), source_part->checksums);
@ -3897,7 +3899,6 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
                part_to_clone = source_part;
            }
        }
-
    }

    ReplicatedMergeTreeAddress address;
@ -3933,7 +3934,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
                    + "' != '" + address.scheme + "', can't fetch part from " + address.host,
                    ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH);

-            return fetcher.fetchPart(
+            return fetcher.fetchSelectedPart(
                metadata_snapshot,
                getContext(),
                part_name,
@ -4070,7 +4071,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
        currently_fetching_parts.erase(part_name);
    });

-    LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path);
+    LOG_DEBUG(log, "Fetching already known part {} from {}", part_name, source_replica_path);

    TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);

@ -4100,7 +4101,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
                + "' != '" + address.scheme + "', can't fetch part from " + address.host,
                ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH);

-        return fetcher.fetchPart(
+        return fetcher.fetchSelectedPart(
            metadata_snapshot, getContext(), part_name, source_replica_path,
            address.host, address.replication_port,
            timeouts, credentials->getUser(), credentials->getPassword(),
@ -4304,12 +4305,12 @@ ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg
            auto added_parts = part_with_quorum.added_parts;

            for (const auto & added_part : added_parts)
+            {
                if (!getActiveContainingPart(added_part.second))
-                    throw Exception(
-                        "Replica doesn't have part " + added_part.second
-                            + " which was successfully written to quorum of other replicas."
-                              " Send query to another replica or disable 'select_sequential_consistency' setting.",
-                        ErrorCodes::REPLICA_IS_NOT_IN_QUORUM);
+                    throw Exception(ErrorCodes::REPLICA_IS_NOT_IN_QUORUM,
+                        "Replica doesn't have part '{}' which was successfully written to quorum of other replicas. "
+                        "Send query to another replica or disable 'select_sequential_consistency' setting", added_part.second);
+            }

            for (const auto & max_block : part_with_quorum.getMaxInsertedBlocks())
                max_added_blocks[max_block.first] = max_block.second;
@ -4430,13 +4431,13 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con
    bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate;

    // TODO: should we also somehow pass list of columns to deduplicate on to the ReplicatedMergeTreeSink?
-    // TODO: insert_quorum = 'auto' would be supported in https://github.com/ClickHouse/ClickHouse/pull/39970, now it's same as 0.
    return std::make_shared<ReplicatedMergeTreeSink>(
        *this, metadata_snapshot, query_settings.insert_quorum.valueOr(0),
        query_settings.insert_quorum_timeout.totalMilliseconds(),
        query_settings.max_partitions_per_insert_block,
        query_settings.insert_quorum_parallel,
        deduplicate,
+        query_settings.insert_quorum.is_auto,
        local_context);
 }

@ -5125,7 +5126,7 @@ PartitionCommandsResultInfo StorageReplicatedMergeTree::attachPartition(
    MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts);

    /// TODO Allow to use quorum here.
-    ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, query_context,
+    ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, false, query_context,
        /*is_attach*/true);

    for (size_t i = 0; i < loaded_parts.size(); ++i)
@ -7538,21 +7539,42 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,

 std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
 {
-    if (!part.data_part_storage || !part.isStoredOnDisk())
+    auto settings = getSettings();
+    if (!settings->allow_remote_fs_zero_copy_replication)
        return std::make_pair(true, NameSet{});

-    if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication())
+    if (!part.data_part_storage)
+        LOG_WARNING(log, "Datapart storage for part {} (temp: {}) is not initialized", part.name, part.is_temp);
+
+    if (!part.data_part_storage || !part.isStoredOnDisk())
+    {
+        LOG_TRACE(log, "Part {} is not stored on disk, blobs can be removed", part.name);
        return std::make_pair(true, NameSet{});
+    }
+
+    if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication())
+    {
+        LOG_TRACE(log, "Part {} is not stored on zero-copy replicated disk, blobs can be removed", part.name);
+        return std::make_pair(true, NameSet{});
+    }

    /// If part is temporary refcount file may be absent
    if (part.data_part_storage->exists(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK))
    {
        auto ref_count = part.data_part_storage->getRefCount(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK);
        if (ref_count > 0) /// Keep part shard info for frozen backups
+        {
+            LOG_TRACE(log, "Part {} has more than zero local references ({}), blobs cannot be removed", part.name, ref_count);
            return std::make_pair(false, NameSet{});
+        }
+        else
+        {
+            LOG_TRACE(log, "Part {} local references is zero, will check blobs can be removed in zookeeper", part.name);
+        }
    }
    else
    {
+        LOG_TRACE(log, "Part {} looks temporary, because checksums file doesn't exist, blobs can be removed", part.name);
        /// Temporary part with some absent file cannot be locked in shared mode
        return std::make_pair(true, NameSet{});
    }
@ -7600,10 +7622,14 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(

        if (!children.empty())
        {
-            LOG_TRACE(logger, "Found {} ({}) zookeeper locks for {}", zookeeper_part_uniq_node, children.size(), fmt::join(children, ", "));
+            LOG_TRACE(logger, "Found {} ({}) zookeeper locks for {}", children.size(), fmt::join(children, ", "), zookeeper_part_uniq_node);
            part_has_no_more_locks = false;
            continue;
        }
+        else
+        {
+            LOG_TRACE(logger, "No more children left for {}, will try to remove the whole node", zookeeper_part_uniq_node);
+        }

        auto error_code = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node);

@ -7654,7 +7680,7 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
        }
        else
        {
-            LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) were concurrently created",
+            LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) exist",
                      zookeeper_part_node, part_name, children.size(), fmt::join(children, ", "));
        }
    }
@ -8394,7 +8420,7 @@ void StorageReplicatedMergeTree::restoreDataFromBackup(RestorerFromBackup & rest
 void StorageReplicatedMergeTree::attachRestoredParts(MutableDataPartsVector && parts)
 {
    auto metadata_snapshot = getInMemoryMetadataPtr();
-    auto sink = std::make_shared<ReplicatedMergeTreeSink>(*this, metadata_snapshot, 0, 0, 0, false, false, getContext(), /*is_attach*/true);
+    auto sink = std::make_shared<ReplicatedMergeTreeSink>(*this, metadata_snapshot, 0, 0, 0, false, false, false,  getContext(), /*is_attach*/true);
    for (auto part : parts)
        sink->writeExistingPart(part);
 }
--- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp
+++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp
@ -1,6 +1,7 @@
 #include "StorageSystemRemoteDataPaths.h"
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypeArray.h>
+#include <DataTypes/DataTypesNumber.h>
 #include <Interpreters/Cache/FileCache.h>
 #include <Interpreters/Cache/FileCacheFactory.h>
 #include <Columns/ColumnString.h>
@ -23,6 +24,8 @@ StorageSystemRemoteDataPaths::StorageSystemRemoteDataPaths(const StorageID & tab
        {"cache_base_path", std::make_shared<DataTypeString>()},
        {"local_path", std::make_shared<DataTypeString>()},
        {"remote_path", std::make_shared<DataTypeString>()},
+        {"size", std::make_shared<DataTypeUInt64>()},
+        {"common_prefix_for_blobs", std::make_shared<DataTypeString>()},
        {"cache_paths", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
    }));
    setInMemoryMetadata(storage_metadata);
@ -44,6 +47,8 @@ Pipe StorageSystemRemoteDataPaths::read(
    MutableColumnPtr col_cache_base_path = ColumnString::create();
    MutableColumnPtr col_local_path = ColumnString::create();
    MutableColumnPtr col_remote_path = ColumnString::create();
+    MutableColumnPtr col_size = ColumnUInt64::create();
+    MutableColumnPtr col_namespace = ColumnString::create();
    MutableColumnPtr col_cache_paths = ColumnArray::create(ColumnString::create());

    auto disks = context->getDisksMap();
@ -61,7 +66,7 @@ Pipe StorageSystemRemoteDataPaths::read(
            if (!cache_base_path.empty())
                cache = FileCacheFactory::instance().get(cache_base_path);

-            for (const auto & [local_path, storage_objects] : remote_paths_by_local_path)
+            for (const auto & [local_path, common_prefix_for_objects, storage_objects] : remote_paths_by_local_path)
            {
                for (const auto & object : storage_objects)
                {
@ -70,6 +75,8 @ Pipe StorageSystemRemoteDataPaths::read(
                    col_cache_base_path->insert(cache_base_path);
                    col_local_path->insert(local_path);
                    col_remote_path->insert(object.absolute_path);
+                    col_size->insert(object.bytes_size);
+                    col_namespace->insert(common_prefix_for_objects);

                    if (cache)
                    {
@ -91,6 +98,8 @@ Pipe StorageSystemRemoteDataPaths::read(
    res_columns.emplace_back(std::move(col_cache_base_path));
    res_columns.emplace_back(std::move(col_local_path));
    res_columns.emplace_back(std::move(col_remote_path));
+    res_columns.emplace_back(std::move(col_size));
+    res_columns.emplace_back(std::move(col_namespace));
    res_columns.emplace_back(std::move(col_cache_paths));

    UInt64 num_rows = res_columns.at(0)->size();
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@ -161,6 +161,16 @@ CI_CONFIG = {
            "tidy": "disable",
            "with_coverage": False,
        },
+        "binary_amd64sse2": {
+            "compiler": "clang-14-amd64sse2",
+            "build_type": "",
+            "sanitizer": "",
+            "package_type": "binary",
+            "static_binary_name": "amd64sse2",
+            "libraries": "static",
+            "tidy": "disable",
+            "with_coverage": False,
+        },
    },
    "builds_report_config": {
        "ClickHouse build check": [
@ -182,6 +192,7 @@ CI_CONFIG = {
            "binary_freebsd",
            "binary_darwin_aarch64",
            "binary_ppc64le",
+            "binary_amd64sse2",
        ],
    },
    "tests_config": {
--- a/tests/ci/run_check.py
+++ b/tests/ci/run_check.py
@ -87,14 +87,19 @@ def should_run_checks_for_pr(pr_info: PRInfo) -> Tuple[bool, str, str]:
    # Consider the labels and whether the user is trusted.
    print("Got labels", pr_info.labels)
    if FORCE_TESTS_LABEL in pr_info.labels:
+        print(f"Label '{FORCE_TESTS_LABEL}' set, forcing remaining checks")
        return True, f"Labeled '{FORCE_TESTS_LABEL}'", "pending"

    if DO_NOT_TEST_LABEL in pr_info.labels:
+        print(f"Label '{DO_NOT_TEST_LABEL}' set, skipping remaining checks")
        return False, f"Labeled '{DO_NOT_TEST_LABEL}'", "success"

    if CAN_BE_TESTED_LABEL not in pr_info.labels and not pr_is_by_trusted_user(
        pr_info.user_login, pr_info.user_orgs
    ):
+        print(
+            f"PRs by untrusted users need the '{CAN_BE_TESTED_LABEL}' label - please contact a member of the core team"
+        )
        return False, "Needs 'can be tested' label", "failure"

    if OK_SKIP_LABELS.intersection(pr_info.labels):
@ -219,7 +224,7 @@ if __name__ == "__main__":
    elif SUBMODULE_CHANGED_LABEL in pr_info.labels:
        pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL)

-    print(f"change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}")
+    print(f"Change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}")
    if pr_labels_to_add:
        post_labels(gh, pr_info, pr_labels_to_add)

--- a/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml
+++ b/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml
@ -1,18 +1,6 @@
-path:
-  - /var/lib/clickhouse
-  - "@replace": replace
-tmp_path:
-  - /var/lib/clickhouse/tmp/
-  - "@replace": replace
-user_files_path:
-  - /var/lib/clickhouse/user_files/
-  - "@replace": replace
-format_schema_path:
-  - /var/lib/clickhouse/format_schemas/
-  - "@replace": replace
-access_control_path:
-  - /var/lib/clickhouse/access/
-  - "@replace": replace
-top_level_domains_path:
-  - /var/lib/clickhouse/top_level_domains/
-  - "@replace": replace
+path: /var/lib/clickhouse
+tmp_path: /var/lib/clickhouse/tmp/
+user_files_path: /var/lib/clickhouse/user_files/
+format_schema_path: /var/lib/clickhouse/format_schemas/
+access_control_path: /var/lib/clickhouse/access/
+top_level_domains_path: /var/lib/clickhouse/top_level_domains/
--- a/tests/integration/test_config_xml_yaml_mix/configs/users.yaml
+++ b/tests/integration/test_config_xml_yaml_mix/configs/users.yaml
@ -6,7 +6,6 @@ users:
  default:
    password: ''
    networks:
-      "@replace": replace
      ip: '::/0'
    profile: default

--- a/Show More
+++ b/Show More