Merge branch 'master' into fix-odr-vialation

2024-11-21 15:12:02 +00:00 · 2022-09-09 00:42:14 +02:00 · 2022-09-09 00:42:14 +02:00 · 323fdeff0b
commit 323fdeff0b
parent 29b5d023ee 9e8e66c567
188 changed files with 3574 additions and 1252 deletions
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -923,6 +923,53 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 0 # otherwise we will have no info about contributors
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1011,6 +1058,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -935,6 +935,51 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
+  BuilderBinAmd64SSE2:
+    needs: [DockerHubPush, FastTest, StyleCheck]
+    runs-on: [self-hosted, builder]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/build_check
+          IMAGES_PATH=${{runner.temp}}/images_path
+          REPO_COPY=${{runner.temp}}/build_check/ClickHouse
+          CACHES_PATH=${{runner.temp}}/../ccaches
+          BUILD_NAME=binary_amd64sse2
+          EOF
+      - name: Download changed images
+        uses: actions/download-artifact@v2
+        with:
+          name: changed_images
+          path: ${{ env.IMAGES_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Build
+        run: |
+          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
+          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME"
+      - name: Upload build URLs to artifacts
+        if: ${{ success() || failure() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.BUILD_URLS }}
+          path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json
+      - name: Cleanup
+        if: always()
+        run: |
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH" "$CACHES_PATH"
 ############################################################################################
 ##################################### Docker images  #######################################
 ############################################################################################
@ -1023,6 +1068,7 @@ jobs:
      - BuilderBinFreeBSD
      # - BuilderBinGCC
      - BuilderBinPPC64
+      - BuilderBinAmd64SSE2
      - BuilderBinClangTidy
      - BuilderDebShared
    runs-on: [self-hosted, style-checker]
@ -1254,6 +1300,228 @@ jobs:
          # shellcheck disable=SC2046
          docker rm -f $(docker ps -a -q) ||:
          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Debug0:
+    needs: [BuilderDebDebug]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (debug, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=0
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Debug1:
+    needs: [BuilderDebDebug]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (debug, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=1
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Debug2:
+    needs: [BuilderDebDebug]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (debug, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=2
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Tsan0:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=0
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Tsan1:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=1
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
+  FunctionalStatelessTestS3Tsan2:
+    needs: [BuilderDebTsan]
+    runs-on: [self-hosted, func-tester]
+    steps:
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan
+          REPORTS_PATH=${{runner.temp}}/reports_dir
+          CHECK_NAME=Stateless tests (tsan, s3 storage)
+          REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse
+          KILL_TIMEOUT=10800
+          RUN_BY_HASH_NUM=2
+          RUN_BY_HASH_TOTAL=3
+          EOF
+      - name: Download json reports
+        uses: actions/download-artifact@v2
+        with:
+          path: ${{ env.REPORTS_PATH }}
+      - name: Clear repository
+        run: |
+          sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE"
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Functional test
+        run: |
+          sudo rm -fr "$TEMP_PATH"
+          mkdir -p "$TEMP_PATH"
+          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
+          cd "$REPO_COPY/tests/ci"
+          python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT"
+      - name: Cleanup
+        if: always()
+        run: |
+          # Leave $(...) unquoted on purpose: every container ID must reach docker
+          # as its own argument; quoting would pass all IDs as one bogus name.
+          # shellcheck disable=SC2046
+          docker kill $(docker ps -q) ||:
+          # shellcheck disable=SC2046
+          docker rm -f $(docker ps -a -q) ||:
+          sudo rm -fr "$TEMP_PATH"
  FunctionalStatelessTestAarch64:
    needs: [BuilderDebAarch64]
    runs-on: [self-hosted, func-tester-aarch64]
@ -3388,6 +3656,12 @@ jobs:
      - FunctionalStatefulTestMsan
      - FunctionalStatefulTestUBsan
      - FunctionalStatelessTestReleaseS3
+      - FunctionalStatelessTestS3Debug0
+      - FunctionalStatelessTestS3Debug1
+      - FunctionalStatelessTestS3Debug2
+      - FunctionalStatelessTestS3Tsan0
+      - FunctionalStatelessTestS3Tsan1
+      - FunctionalStatelessTestS3Tsan2
      - StressTestDebug
      - StressTestAsan
      - StressTestTsan
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -143,6 +143,8 @@ include (cmake/add_warning.cmake)
 if (COMPILER_CLANG)
    # generate ranges for fast "addr2line" search
    if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE")
+        # NOTE: clang has a bug: it does not emit .debug_aranges with ThinLTO,
+        # so a custom ld.lld wrapper is shipped in docker images.
        set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges")
    endif ()

--- a/README.md
+++ b/README.md
@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a
 * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any.

 ## Upcoming events
-* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap.
+* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in-person meetup hosted by our friends at Barracuda in the Bay Area.
--- a/cmake/cpu_features.cmake
+++ b/cmake/cpu_features.cmake
@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0)
 option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0)
 option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0)

+# x86_64: allow building for an SSE2-only target machine (a dedicated CI build exists for embedded or very old hardware).
+option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0)
+if (NO_SSE3_OR_HIGHER)
+    set (ENABLE_SSSE3 0)
+    set (ENABLE_SSE41 0)
+    set (ENABLE_SSE42 0)
+    set (ENABLE_PCLMULQDQ 0)
+    set (ENABLE_POPCNT 0)
+    set (ENABLE_AVX 0)
+    set (ENABLE_AVX2 0)
+    set (ENABLE_AVX512 0)
+    set (ENABLE_AVX512_VBMI 0)
+    set (ENABLE_BMI 0)
+    set (ENABLE_AVX2_FOR_SPEC_OP 0)
+    set (ENABLE_AVX512_FOR_SPEC_OP 0)
+endif ()
+
 option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 0)

 if (ARCH_NATIVE)
--- a/cmake/ld.lld.in
+++ b/cmake/ld.lld.in
@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+
+# This is a workaround for a bug in llvm/clang:
+# it does not produce .debug_aranges with LTO.
+#
+# NOTE: this is a temporary solution that should be removed once [1] is
+# resolved.
+#
+#   [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8
+
+# NOTE: only -flto=thin is supported.
+# NOTE: it is not possible to check whether -gdwarf-aranges was passed initially.
+if [[ "$*" =~ -plugin-opt=thinlto ]]; then
+    exec "@LLD_PATH@" -mllvm -generate-arange-section "$@"
+else
+    exec "@LLD_PATH@" "$@"
+fi
--- a/cmake/split_debug_symbols.cmake
+++ b/cmake/split_debug_symbols.cmake
@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols)
       COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin"
       COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
       # Splits debug symbols into separate file, leaves the binary untouched:
-       COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
+       COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug"
       # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check:
       COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}"
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@ -94,8 +94,13 @@ if (LINKER_NAME)
        if (NOT LLD_PATH)
            message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.")
        endif ()
-        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}")
-        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}")
+
+        # This is a temporary quirk to emit .debug_aranges with ThinLTO
+        set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld")
+        configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY)
+
+        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
+        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}")
    else ()
        set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
        set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}")
--- a/contrib/vectorscan-cmake/CMakeLists.txt
+++ b/contrib/vectorscan-cmake/CMakeLists.txt
@ -1,6 +1,6 @@
 # We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan.

-if (ARCH_AMD64)
+if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER)
    option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES})
 endif()

--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -130,6 +130,7 @@ def parse_env_variables(
    ARM_SUFFIX = "-aarch64"
    FREEBSD_SUFFIX = "-freebsd"
    PPC_SUFFIX = "-ppc64le"
+    AMD64_SSE2_SUFFIX = "-amd64sse2"

    result = []
    result.append("OUTPUT_DIR=/output")
@ -141,6 +142,7 @@ def parse_env_variables(
    is_cross_arm = compiler.endswith(ARM_SUFFIX)
    is_cross_ppc = compiler.endswith(PPC_SUFFIX)
    is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX)
+    is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX)

    if is_cross_darwin:
        cc = compiler[: -len(DARWIN_SUFFIX)]
@ -186,6 +188,10 @@ def parse_env_variables(
        cmake_flags.append(
            "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake"
        )
+    elif is_amd64_sse2:
+        cc = compiler[: -len(AMD64_SSE2_SUFFIX)]
+        result.append("DEB_ARCH=amd64")
+        cmake_flags.append("-DNO_SSE3_OR_HIGHER=1")
    else:
        cc = compiler
        result.append("DEB_ARCH=amd64")
@ -339,6 +345,7 @@ if __name__ == "__main__":
            "clang-14-darwin-aarch64",
            "clang-14-aarch64",
            "clang-14-ppc64le",
+            "clang-14-amd64sse2",
            "clang-14-freebsd",
            "gcc-11",
        ),
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -3,6 +3,9 @@
 # shellcheck disable=SC2086
 # shellcheck disable=SC2024

+# Avoid overlaps with previous runs
+dmesg --clear
+
 set -x

 # Thread Fuzzer allows to check more permutations of possible thread scheduling
@ -38,6 +41,7 @@ function install_packages()

 function configure()
 {
+    export ZOOKEEPER_FAULT_INJECTION=1
    # install test configs
    export USE_DATABASE_ORDINARY=1
    export EXPORT_S3_STORAGE_POLICIES=1
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@ -168,7 +168,7 @@ def prepare_for_hung_check(drop_databases):
                for db in databases:
                    if db == "system":
                        continue
-                    command = make_query_command(f"DROP DATABASE {db}")
+                    command = make_query_command(f'DETACH DATABASE {db}')
                    # we don't wait for drop
                    Popen(command, shell=True)
                break
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@ -140,6 +140,6 @@ hash cmake

 ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.

-They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request.
+Binaries are built for stable and LTS releases, and also for every commit to `master` and for each pull request.

 To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
--- a/docs/en/getting-started/example-datasets/nypd_complaint_data.md
+++ b/docs/en/getting-started/example-datasets/nypd_complaint_data.md
@ -0,0 +1,654 @@
+---
+slug: /en/getting-started/example-datasets/nypd_complaint_data
+sidebar_label: NYPD Complaint Data
+description: "Ingest and query Tab Separated Value data in 5 steps"
+title: NYPD Complaint Data
+---
+
+Tab separated value, or TSV, files are common and may include field headings as the first line of the file. ClickHouse can ingest TSVs, and also can query TSVs without ingesting the files.  This guide covers both of these cases. If you need to query or ingest CSV files, the same techniques work, simply substitute `TSV` with `CSV` in your format arguments.
+
+While working through this guide you will:
+- **Investigate**: Query the structure and content of the TSV file.
+- **Determine the target ClickHouse schema**: Choose proper data types and map the existing data to those types.
+- **Create a ClickHouse table**.
+- **Preprocess and stream** the data to ClickHouse.
+- **Run some queries** against ClickHouse.
+
+The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly.
+
+**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243)  
+**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page
+
+## Prerequisites
+- Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**.
+- Install [ClickHouse server and client](../../getting-started/install.md).
+- [Launch](../../getting-started/install.md#launch) ClickHouse server, and connect with `clickhouse-client`
+
+### A note about the commands described in this guide
+There are two types of commands in this guide:
+- Some of the commands are querying the TSV files, these are run at the command prompt.
+- The rest of the commands are querying ClickHouse, and these are run in the `clickhouse-client` or Play UI.
+
+:::note
+The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`, please adjust the commands if needed.
+:::
+
+## Familiarize yourself with the TSV file
+
+Before starting to work with the ClickHouse database familiarize yourself with the data. 
+
+### Look at the fields in the source TSV file
+
+This is an example of a command to query a TSV file, but don't run it yet.
+```sh
+clickhouse-local --query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')"
+```
+
+Sample response
+```response
+CMPLNT_NUM                  Nullable(Float64)					
+ADDR_PCT_CD                 Nullable(Float64)					
+BORO_NM                     Nullable(String)					
+CMPLNT_FR_DT                Nullable(String)					
+CMPLNT_FR_TM                Nullable(String)					
+```
+
+:::tip
+Most of the time the above command will let you know which fields in the input data are numeric, and which are strings, and which are tuples.  This is not always the case.  Because ClickHouse is routinely used with datasets containing billions of records, there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric.  By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000`
+you can get a better idea of the content.
+
+Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled.
+:::
+
+Run this command at your command prompt.  You will be using `clickhouse-local` to query the data in the TSV file you downloaded.
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" 
+```
+
+Result:
+```response
+CMPLNT_NUM        Nullable(String)					
+ADDR_PCT_CD       Nullable(Float64)					
+BORO_NM           Nullable(String)					
+CMPLNT_FR_DT      Nullable(String)					
+CMPLNT_FR_TM      Nullable(String)					
+CMPLNT_TO_DT      Nullable(String)					
+CMPLNT_TO_TM      Nullable(String)					
+CRM_ATPT_CPTD_CD  Nullable(String)					
+HADEVELOPT        Nullable(String)					
+HOUSING_PSA       Nullable(Float64)					
+JURISDICTION_CODE Nullable(Float64)					
+JURIS_DESC        Nullable(String)					
+KY_CD             Nullable(Float64)					
+LAW_CAT_CD        Nullable(String)					
+LOC_OF_OCCUR_DESC Nullable(String)					
+OFNS_DESC         Nullable(String)					
+PARKS_NM          Nullable(String)					
+PATROL_BORO       Nullable(String)					
+PD_CD             Nullable(Float64)					
+PD_DESC           Nullable(String)					
+PREM_TYP_DESC     Nullable(String)					
+RPT_DT            Nullable(String)					
+STATION_NAME      Nullable(String)					
+SUSP_AGE_GROUP    Nullable(String)					
+SUSP_RACE         Nullable(String)					
+SUSP_SEX          Nullable(String)					
+TRANSIT_DISTRICT  Nullable(Float64)					
+VIC_AGE_GROUP     Nullable(String)					
+VIC_RACE          Nullable(String)					
+VIC_SEX           Nullable(String)					
+X_COORD_CD        Nullable(Float64)					
+Y_COORD_CD        Nullable(Float64)					
+Latitude          Nullable(Float64)					
+Longitude         Nullable(Float64)					
+Lat_Lon           Tuple(Nullable(Float64), Nullable(Float64))					
+New Georeferenced Column Nullable(String)
+```
+
+At this point you should check that the columns in the TSV file match the names and types specified in the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243).  The data types are not very specific, all numeric fields are set to `Nullable(Float64)`, and all other fields are `Nullable(String)`.  When you create a ClickHouse table to store the data you can specify more appropriate and performant types.
+
+### Determine the proper schema
+
+In order to figure out what types should be used for the fields it is necessary to know what the data looks like. For example, the field `JURISDICTION_CODE` is a numeric: should it be a `UInt8`, or an `Enum`, or is `Float64` appropriate?
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select JURISDICTION_CODE, count() FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ GROUP BY JURISDICTION_CODE
+ ORDER BY JURISDICTION_CODE
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─JURISDICTION_CODE─┬─count()─┐
+│                 0 │  188875 │
+│                 1 │    4799 │
+│                 2 │   13833 │
+│                 3 │     656 │
+│                 4 │      51 │
+│                 6 │       5 │
+│                 7 │       2 │
+│                 9 │      13 │
+│                11 │      14 │
+│                12 │       5 │
+│                13 │       2 │
+│                14 │      70 │
+│                15 │      20 │
+│                72 │     159 │
+│                87 │       9 │
+│                88 │      75 │
+│                97 │     405 │
+└───────────────────┴─────────┘
+```
+
+The query response shows that the `JURISDICTION_CODE` fits well in a `UInt8`.
+
+Similarly, look at some of the `String` fields and see if they are well suited to being `DateTime` or [`LowCardinality(String)`](../../sql-reference/data-types/lowcardinality.md) fields.
+
+For example, the field `PARKS_NM` is described as "Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)".  The names of parks in New York City may be a good candidate for a `LowCardinality(String)`:
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select count(distinct PARKS_NM) FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─uniqExact(PARKS_NM)─┐
+│                 319 │
+└─────────────────────┘
+```
+
+Have a look at some of the park names:
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select distinct PARKS_NM FROM
+ file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ LIMIT 10
+ FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─PARKS_NM───────────────────┐
+│ (null)                     │
+│ ASSER LEVY PARK            │
+│ JAMES J WALKER PARK        │
+│ BELT PARKWAY/SHORE PARKWAY │
+│ PROSPECT PARK              │
+│ MONTEFIORE SQUARE          │
+│ SUTTON PLACE PARK          │
+│ JOYCE KILMER PARK          │
+│ ALLEY ATHLETIC PLAYGROUND  │
+│ ASTORIA PARK               │
+└────────────────────────────┘
+```
+
+The dataset in use at the time of writing has only a few hundred distinct parks and playgrounds in the `PARKS_NM` column.  This is a small number based on the [LowCardinality](../../sql-reference/data-types/lowcardinality.md#lowcardinality-dscr) recommendation to stay below 10,000 distinct strings in a `LowCardinality(String)` field.
+
+### DateTime fields
+Based on the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) there are date and time fields for the start and end of the reported event.  Looking at the min and max of the `CMPLNT_FR_DT` and `CMPLNT_TO_DT` gives an idea of whether or not the fields are always populated:
+
+```sh title="CMPLNT_FR_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_DT), max(CMPLNT_FR_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_DT)─┬─max(CMPLNT_FR_DT)─┐
+│ 01/01/1973        │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_DT"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_DT), max(CMPLNT_TO_DT) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_DT)─┬─max(CMPLNT_TO_DT)─┐
+│                   │ 12/31/2021        │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_FR_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_FR_TM), max(CMPLNT_FR_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_FR_TM)─┬─max(CMPLNT_FR_TM)─┐
+│ 00:00:00          │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+```sh title="CMPLNT_TO_TM"
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select min(CMPLNT_TO_TM), max(CMPLNT_TO_TM) FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─min(CMPLNT_TO_TM)─┬─max(CMPLNT_TO_TM)─┐
+│ (null)            │ 23:59:00          │
+└───────────────────┴───────────────────┘
+```
+
+## Make a plan
+
+Based on the above investigation:
+- `JURISDICTION_CODE` should be cast as `UInt8`.
+- `PARKS_NM` should be cast to `LowCardinality(String)`
+- `CMPLNT_FR_DT` and `CMPLNT_FR_TM` are always populated (possibly with a default time of `00:00:00`)
+- `CMPLNT_TO_DT` and `CMPLNT_TO_TM` may be empty
+- Dates and times are stored in separate fields in the source
+- Dates are `mm/dd/yyyy` format
+- Times are `hh:mm:ss` format
+- Dates and times can be concatenated into DateTime types
+- There are some dates before January 1st 1970, which means we need a 64 bit DateTime
+
+:::note
+There are many more changes to be made to the types, they all can be determined by following the same investigation steps.  Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions.  The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics.
+:::
+
+## Concatenate the date and time fields
+
+To concatenate the date and time fields `CMPLNT_FR_DT` and `CMPLNT_FR_TM` into a single `String` that can be cast to a `DateTime`, select the two fields joined by the concatenation operator: `CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM`.  The `CMPLNT_TO_DT` and `CMPLNT_TO_TM` fields are handled similarly.
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM AS complaint_begin FROM
+file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+LIMIT 10
+FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─complaint_begin─────┐
+│ 07/29/2010 00:01:00 │
+│ 12/01/2011 12:00:00 │
+│ 04/01/2017 15:00:00 │
+│ 03/26/2018 17:20:00 │
+│ 01/01/2019 00:00:00 │
+│ 06/14/2019 00:00:00 │
+│ 11/29/2021 20:00:00 │
+│ 12/04/2021 00:35:00 │
+│ 12/05/2021 12:50:00 │
+│ 12/07/2021 20:30:00 │
+└─────────────────────┘
+```
+
+## Convert the date and time String to a DateTime64 type
+
+Earlier in the guide we discovered that there are dates in the TSV file before January 1st 1970, which means that we need a 64 bit DateTime type for the dates.  The dates also need to be converted from `MM/DD/YYYY` to `YYYY/MM/DD` format.  Both of these can be done with [`parseDateTime64BestEffort()`](../../sql-reference/functions/type-conversion-functions.md#parsedatetime64besteffort).
+
+```sh
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+      (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+select parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin,
+       parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end
+FROM file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+ORDER BY complaint_begin ASC
+LIMIT 25
+FORMAT PrettyCompact"
+```
+
+Lines 2 and 3 above contain the concatenation from the previous step, and lines 4 and 5 above parse the strings into `DateTime64`.  As the complaint end time is not guaranteed to exist `parseDateTime64BestEffortOrNull` is used.
+
+Result:
+```response
+┌─────────complaint_begin─┬───────────complaint_end─┐
+│ 1925-01-01 10:00:00.000 │ 2021-02-12 09:30:00.000 │
+│ 1925-01-01 11:37:00.000 │ 2022-01-16 11:49:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2021-12-31 00:00:00.000 │
+│ 1925-01-01 15:00:00.000 │ 2022-02-02 22:00:00.000 │
+│ 1925-01-01 19:00:00.000 │ 2022-04-14 05:00:00.000 │
+│ 1955-09-01 19:55:00.000 │ 2022-08-01 00:45:00.000 │
+│ 1972-03-17 11:40:00.000 │ 2022-03-17 11:43:00.000 │
+│ 1972-05-23 22:00:00.000 │ 2022-05-24 09:00:00.000 │
+│ 1972-05-30 23:37:00.000 │ 2022-05-30 23:50:00.000 │
+│ 1972-07-04 02:17:00.000 │                    ᴺᵁᴸᴸ │
+│ 1973-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1975-01-01 00:00:00.000 │                    ᴺᵁᴸᴸ │
+│ 1976-11-05 00:01:00.000 │ 1988-10-05 23:59:00.000 │
+│ 1977-01-01 00:00:00.000 │ 1977-01-01 23:59:00.000 │
+│ 1977-12-20 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-01-01 00:01:00.000 │                    ᴺᵁᴸᴸ │
+│ 1981-08-14 00:00:00.000 │ 1987-08-13 23:59:00.000 │
+│ 1983-01-07 00:00:00.000 │ 1990-01-06 00:00:00.000 │
+│ 1984-01-01 00:01:00.000 │ 1984-12-31 23:59:00.000 │
+│ 1985-01-01 12:00:00.000 │ 1987-12-31 15:00:00.000 │
+│ 1985-01-11 09:00:00.000 │ 1985-12-31 12:00:00.000 │
+│ 1986-03-16 00:05:00.000 │ 2022-03-16 00:45:00.000 │
+│ 1987-01-07 00:00:00.000 │ 1987-01-09 00:00:00.000 │
+│ 1988-04-03 18:30:00.000 │ 2022-08-03 09:45:00.000 │
+│ 1988-07-29 12:00:00.000 │ 1990-07-27 22:00:00.000 │
+└─────────────────────────┴─────────────────────────┘
+```
+:::note
+The dates shown as `1925` above are from errors in the data.  There are several records in the original data with dates in the years `1019` - `1022` that should be `2019` - `2022`.  They are being stored as Jan 1st 1925 as that is the earliest date with a 64 bit DateTime.
+:::
+
+## Create a table
+
+The decisions made above on the data types used for the columns are reflected in the table schema
+below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table.  At least one
+of `ORDER BY` or `PRIMARY KEY` must be specified.  Here are some guidelines on deciding on the 
+columns to include in `ORDER BY`, and more information is in the *Next Steps* section at the end
+of this document.
+
+### Order By and Primary Key clauses
+
+- The `ORDER BY` tuple should include fields that are used in query filters
+- To maximize compression on disk the `ORDER BY` tuple should be ordered by ascending cardinality
+- If it exists, the `PRIMARY KEY` tuple must be a subset of the `ORDER BY` tuple
+- If only `ORDER BY` is specified, then the same tuple will be used as `PRIMARY KEY`
+- The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple
+- The `PRIMARY KEY` index is kept in main memory
+
+Looking at the dataset and the questions that might be answered by querying it we might
+decide that we would look at the types of crimes reported over time in the five boroughs of
+New York City.  These fields might be then included in the `ORDER BY`:
+
+| Column      | Description (from the data dictionary)                 |
+| ----------- | ---------------------------------------------------    |
+| OFNS_DESC   | Description of offense corresponding with key code     |
+| RPT_DT      | Date event was reported to police                      |
+| BORO_NM     | The name of the borough in which the incident occurred |
+
+
+Querying the TSV file for the cardinality of the three candidate columns:
+
+```bash
+clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \
+--query \
+"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC,
+        formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT,
+        formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM
+  FROM
+  file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+  FORMAT PrettyCompact"
+```
+
+Result:
+```response
+┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐
+│ 60.00                 │ 306.00             │ 6.00                │
+└───────────────────────┴────────────────────┴─────────────────────┘
+```
+Ordering by cardinality, the `ORDER BY` becomes:
+
+```
+ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT )
+```
+:::note
+The table below will use more easily read column names, the above names will be mapped to
+```
+ORDER BY ( borough, offense_description, date_reported )
+```
+:::
+
+Putting together the changes to data types and the `ORDER BY` tuple gives this table structure:
+
+```sql
+CREATE TABLE NYPD_Complaint ( 
+    complaint_number     String,
+    precinct             UInt8,
+    borough              LowCardinality(String),
+    complaint_begin      DateTime64(0,'America/New_York'),
+    complaint_end        DateTime64(0,'America/New_York'),
+    was_crime_completed  String,
+    housing_authority    String,
+    housing_level_code   UInt32,
+    jurisdiction_code    UInt8, 
+    jurisdiction         LowCardinality(String),
+    offense_code         UInt8,
+    offense_level        LowCardinality(String),
+    location_descriptor  LowCardinality(String),
+    offense_description  LowCardinality(String),
+    park_name            LowCardinality(String),
+    patrol_borough       LowCardinality(String),
+    PD_CD                UInt16,
+    PD_DESC              String,
+    location_type        LowCardinality(String),
+    date_reported        Date,
+    transit_station      LowCardinality(String),
+    suspect_age_group    LowCardinality(String),
+    suspect_race         LowCardinality(String),
+    suspect_sex          LowCardinality(String),
+    transit_district     UInt8,
+    victim_age_group     LowCardinality(String),
+    victim_race          LowCardinality(String),
+    victim_sex           LowCardinality(String),
+    NY_x_coordinate      UInt32,
+    NY_y_coordinate      UInt32,
+    Latitude             Float64,
+    Longitude            Float64
+) ENGINE = MergeTree
+  ORDER BY ( borough, offense_description, date_reported )
+```
+
+### Finding the primary key of a table
+
+The ClickHouse `system` database, specifically `system.tables`, has all of the information about the table you
+just created.  This query shows the `ORDER BY` (sorting key), and the `PRIMARY KEY`:
+```sql
+SELECT
+    partition_key,
+    sorting_key,
+    primary_key,
+    table
+FROM system.tables
+WHERE table = 'NYPD_Complaint'
+FORMAT Vertical
+```
+
+Response
+```response
+Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01
+
+Row 1:
+──────
+partition_key: 
+sorting_key:   borough, offense_description, date_reported
+primary_key:   borough, offense_description, date_reported
+table:         NYPD_Complaint
+
+1 row in set. Elapsed: 0.001 sec.
+```
+
+## Preprocess and Import Data {#preprocess-import-data}
+
+We will use `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it.
+
+### `clickhouse-local` arguments used
+
+:::tip
+`table='input'` appears in the arguments to clickhouse-local below.  clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table.  By default the table is named `table`.  In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`.
+:::
+  
+```sh
+cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \
+  | clickhouse-local --table='input' --input-format='TSVWithNames' \
+  --input_format_max_rows_to_read_for_schema_inference=2000 \
+  --query "
+    WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START,
+     (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END
+    SELECT
+      CMPLNT_NUM                                  AS complaint_number,
+      ADDR_PCT_CD                                 AS precinct,
+      BORO_NM                                     AS borough,
+      parseDateTime64BestEffort(CMPLNT_START)     AS complaint_begin,
+      parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end,
+      CRM_ATPT_CPTD_CD                            AS was_crime_completed,
+      HADEVELOPT                                  AS housing_authority_development,
+      HOUSING_PSA                                 AS housing_level_code,
+      JURISDICTION_CODE                           AS jurisdiction_code, 
+      JURIS_DESC                                  AS jurisdiction,
+      KY_CD                                       AS offense_code,
+      LAW_CAT_CD                                  AS offense_level,
+      LOC_OF_OCCUR_DESC                           AS location_descriptor,
+      OFNS_DESC                                   AS offense_description, 
+      PARKS_NM                                    AS park_name,
+      PATROL_BORO                                 AS patrol_borough,
+      PD_CD,
+      PD_DESC,
+      PREM_TYP_DESC                               AS location_type,
+      toDate(parseDateTimeBestEffort(RPT_DT))     AS date_reported,
+      STATION_NAME                                AS transit_station,
+      SUSP_AGE_GROUP                              AS suspect_age_group,
+      SUSP_RACE                                   AS suspect_race,
+      SUSP_SEX                                    AS suspect_sex,
+      TRANSIT_DISTRICT                            AS transit_district,
+      VIC_AGE_GROUP                               AS victim_age_group,   
+      VIC_RACE                                    AS victim_race,
+      VIC_SEX                                     AS victim_sex,
+      X_COORD_CD                                  AS NY_x_coordinate,
+      Y_COORD_CD                                  AS NY_y_coordinate,
+      Latitude,
+      Longitude
+    FROM input" \
+  | clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV'
+```  
+
+## Validate the Data {#validate-data}
+
+:::note
+The dataset changes once or more per year, so your counts may not match what is in this document.
+:::
+
+Query:
+
+```sql
+SELECT count()
+FROM NYPD_Complaint
+```
+
+Result:
+
+```text
+┌─count()─┐
+│  208993 │
+└─────────┘
+
+1 row in set. Elapsed: 0.001 sec. 
+```
+
+The size of the dataset in ClickHouse is just 12% of the original TSV file.  Compare the size of the original TSV file with the size of the table:
+
+Query:
+
+```sql
+SELECT formatReadableSize(total_bytes)
+FROM system.tables
+WHERE name = 'NYPD_Complaint'
+```
+
+Result:
+```text
+┌─formatReadableSize(total_bytes)─┐
+│ 8.63 MiB                        │
+└─────────────────────────────────┘
+```
+
+
+## Run Some Queries {#run-queries}
+
+### Query 1. Compare the number of complaints by month
+
+Query:
+
+```sql
+SELECT
+    dateName('month', date_reported) AS month,
+    count() AS complaints,
+    bar(complaints, 0, 50000, 80)
+FROM NYPD_Complaint
+GROUP BY month
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 7fbd4244-b32a-4acf-b1f3-c3aa198e74d9
+
+┌─month─────┬─complaints─┬─bar(count(), 0, 50000, 80)───────────────────────────────┐
+│ March     │      34536 │ ███████████████████████████████████████████████████████▎ │
+│ May       │      34250 │ ██████████████████████████████████████████████████████▋  │
+│ April     │      32541 │ ████████████████████████████████████████████████████     │
+│ January   │      30806 │ █████████████████████████████████████████████████▎       │
+│ February  │      28118 │ ████████████████████████████████████████████▊            │
+│ November  │       7474 │ ███████████▊                                             │
+│ December  │       7223 │ ███████████▌                                             │
+│ October   │       7070 │ ███████████▎                                             │
+│ September │       6910 │ ███████████                                              │
+│ August    │       6801 │ ██████████▊                                              │
+│ June      │       6779 │ ██████████▋                                              │
+│ July      │       6485 │ ██████████▍                                              │
+└───────────┴────────────┴──────────────────────────────────────────────────────────┘
+
+12 rows in set. Elapsed: 0.006 sec. Processed 208.99 thousand rows, 417.99 KB (37.48 million rows/s., 74.96 MB/s.)
+```
+
+### Query 2. Compare total number of complaints by Borough
+
+Query:
+
+```sql
+SELECT
+    borough,
+    count() AS complaints,
+    bar(complaints, 0, 125000, 60)
+FROM NYPD_Complaint
+GROUP BY borough
+ORDER BY complaints DESC
+```
+
+Result:
+```response
+Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d
+
+┌─borough───────┬─complaints─┬─bar(count(), 0, 125000, 60)──┐
+│ BROOKLYN      │      57947 │ ███████████████████████████▋ │
+│ MANHATTAN     │      53025 │ █████████████████████████▍   │
+│ QUEENS        │      44875 │ █████████████████████▌       │
+│ BRONX         │      44260 │ █████████████████████▏       │
+│ STATEN ISLAND │       8503 │ ████                         │
+│ (null)        │        383 │ ▏                            │
+└───────────────┴────────────┴──────────────────────────────┘
+
+6 rows in set. Elapsed: 0.008 sec. Processed 208.99 thousand rows, 209.43 KB (27.14 million rows/s., 27.20 MB/s.)
+```
+
+## Next Steps
+
+[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices.
--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@ -59,7 +59,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable).

@ -106,7 +106,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password.

 </details>

-If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available.
+You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs.

 Then run these commands to install packages:

@ -221,7 +221,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui
    curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse
    ```

-Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it.
+Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run the `sudo clickhouse start` command to start the clickhouse-server and `clickhouse-client` to connect to it.

 Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data.

--- a/docs/en/operations/backup.md
+++ b/docs/en/operations/backup.md
@ -2,10 +2,9 @@
 slug: /en/operations/backup
 sidebar_position: 49
 sidebar_label: Data backup and restore
+title: Data backup and restore
 ---

-# Data backup and restore
-
 While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented.

 In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**.
--- a/docs/en/operations/caches.md
+++ b/docs/en/operations/caches.md
@ -20,6 +20,7 @@ Additional cache types:
 - [Avro format](../interfaces/formats.md#data-format-avro) schemas cache.
 - [Dictionaries](../sql-reference/dictionaries/index.md) data cache.
 - Schema inference cache.
+- [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks.

 Indirectly used:

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol.

 **Possible values**

-Positive integer.
+Positive integer to specify the port number to listen to or empty value to disable.

 Example

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -1176,8 +1176,9 @@ Enables the quorum writes.

 -   If `insert_quorum < 2`, the quorum writes are disabled.
 -   If `insert_quorum >= 2`, the quorum writes are enabled.
+-   If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number.

-Default value: 0.
+Default value: 0 - disabled.

 Quorum writes

@ -1259,7 +1260,7 @@ Possible values:

 Default value: 1.

-By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). 
+By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)).
 For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)).
 For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window).

--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@ -112,6 +112,119 @@ Example of disk configuration:
 </clickhouse>
 ```

+## Using local cache {#using-local-cache}
+
+It is possible to configure local cache over disks in storage configuration starting from version 22.3. For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc. Cache uses `LRU` cache policy.
+
+Example of configuration for versions 22.8 and later:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+            </s3>
+            <cache>
+                <type>cache</type>
+                <disk>s3</disk>
+                <path>/s3_cache/</path>
+                <max_size>10000000</max_size>
+            </cache>
+        </disks>
+    </storage_configuration>
+```
+
+Example of configuration for versions earlier than 22.8:
+
+``` xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>...</endpoint>
+                ... s3 configuration ...
+                <data_cache_enabled>1</data_cache_enabled>
+                <data_cache_size>10000000</data_cache_size>
+            </s3>
+        </disks>
+    </storage_configuration>
+```
+
+Cache **configuration settings**:
+
+- `path` - path to the directory with cache. Default: None, this setting is obligatory.
+
+- `max_size` - maximum size of the cache in bytes. When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory.
+
+- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled).
+
+- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`.
+
+- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, i.e. the data is cached at the first attempt to read it.
+
+- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`.
+
+- `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb).
+
+- `max_elements` - a limit for a number of cache files. Default: `1048576`.
+
+Cache **query settings**:
+
+- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`.
+
+- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`.
+
+- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on.
+
+- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`.
+
+- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`.
+
+- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written at the cost of evicting previously downloaded (within current query) data, i.e. the second behaviour allows to preserve `least recently used` behaviour while keeping query cache limit.
+
+**Warning**
+Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported.
+
+Cache **system tables**:
+
+- `system.filesystem_cache` - system table which shows the current state of the cache.
+
+- `system.filesystem_cache_log` - system table which shows detailed cache usage per query. Requires `enable_filesystem_cache_log` setting to be `true`.
+
+Cache **commands**:
+
+- `SYSTEM DROP FILESYSTEM CACHE (<path>) (ON CLUSTER)`
+
+- `SHOW CACHES` -- show list of caches which were configured on the server.
+
+- `DESCRIBE CACHE '<cache_name>'` - show cache configuration and some general statistics for a specific cache. Cache name can be taken from `SHOW CACHES` command.
+
+Cache current metrics:
+
+- `FilesystemCacheSize`
+
+- `FilesystemCacheElements`
+
+Cache asynchronous metrics:
+
+- `FilesystemCacheBytes`
+
+- `FilesystemCacheFiles`
+
+Cache profile events:
+
+- `CachedReadBufferReadFromSourceBytes`, `CachedReadBufferReadFromCacheBytes`
+
+- `CachedReadBufferReadFromSourceMicroseconds`, `CachedReadBufferReadFromCacheMicroseconds`
+
+- `CachedReadBufferCacheWriteBytes`, `CachedReadBufferCacheWriteMicroseconds`
+
+- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`
+
 ## Storing Data on Web Server {#storing-data-on-webserver}

 There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
--- a/docs/en/operations/tips.md
+++ b/docs/en/operations/tips.md
@ -74,13 +74,16 @@ Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is ena

 ## File System {#file-system}

-Ext4 is the most reliable option. Set the mount options `noatime`.
-XFS should be avoided. It works mostly fine but there are some reports about lower performance.
+Ext4 is the most reliable option. Set the mount options `noatime`. XFS works well too.
 Most other file systems should also work fine.

+FAT-32 and exFAT are not supported due to lack of hard links.
+
 Do not use compressed filesystems, because ClickHouse does compression on its own and better.
 It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better.

+While ClickHouse can work over NFS, it is not the best idea.
+
 ## Linux Kernel {#linux-kernel}

 Don’t use an outdated Linux kernel.
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@ -640,7 +640,8 @@ Result:

 ## date\_diff

-Returns the difference between two dates or dates with time values.
+Returns the difference between two dates or dates with time values.
+The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for the `day` unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for the `month` unit (see [toRelativeMonthNum](#torelativemonthnum)), and 1 year for the `year` unit (see [toRelativeYearNum](#torelativeyearnum)).

 **Syntax**

@ -692,6 +693,25 @@ Result:
 └────────────────────────────────────────────────────────────────────────────────────────┘
 ```

+Query:
+
+``` sql
+SELECT
+    toDate('2022-01-01') AS e,
+    toDate('2021-12-29') AS s,
+    dateDiff('day', s, e) AS day_diff,
+    dateDiff('month', s, e) AS month__diff,
+    dateDiff('year', s, e) AS year_diff;
+```
+
+Result:
+
+``` text
+┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐
+│ 2022-01-01 │ 2021-12-29 │        3 │           1 │         1 │
+└────────────┴────────────┴──────────┴─────────────┴───────────┘
+```
+
 ## date\_sub

 Subtracts the time interval or date interval from the provided date or date with time.
--- a/programs/obfuscator/Obfuscator.cpp
+++ b/programs/obfuscator/Obfuscator.cpp
@ -24,6 +24,7 @@
 #include <Common/typeid_cast.h>
 #include <Common/assert_cast.h>
 #include <Formats/registerFormats.h>
+#include <Formats/ReadSchemaUtils.h>
 #include <Processors/Formats/IInputFormat.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Processors/Executors/PullingPipelineExecutor.h>
@ -38,6 +39,7 @@
 #include <IO/WriteBufferFromFile.h>
 #include <Compression/CompressedReadBuffer.h>
 #include <Compression/CompressedWriteBuffer.h>
+#include <Interpreters/parseColumnsListForTableFunction.h>
 #include <memory>
 #include <cmath>
 #include <unistd.h>
@ -1239,7 +1241,6 @@ try

    if (options.count("help")
        || !options.count("seed")
-        || !options.count("structure")
        || !options.count("input-format")
        || !options.count("output-format"))
    {
@ -1259,7 +1260,11 @@ try

    UInt64 seed = sipHash64(options["seed"].as<std::string>());

-    std::string structure = options["structure"].as<std::string>();
+    std::string structure;
+
+    if (options.count("structure"))
+        structure = options["structure"].as<std::string>();
+
    std::string input_format = options["input-format"].as<std::string>();
    std::string output_format = options["output-format"].as<std::string>();

@ -1287,32 +1292,51 @@ try
    markov_model_params.determinator_sliding_window_size = options["determinator-sliding-window-size"].as<UInt64>();

    /// Create the header block
-    std::vector<std::string> structure_vals;
-    boost::split(structure_vals, structure, boost::algorithm::is_any_of(" ,"), boost::algorithm::token_compress_on);
-
-    if (structure_vals.size() % 2 != 0)
-        throw Exception("Odd number of elements in section structure: must be a list of name type pairs", ErrorCodes::LOGICAL_ERROR);
+    SharedContextHolder shared_context = Context::createShared();
+    auto context = Context::createGlobal(shared_context.get());
+    auto context_const = WithContext(context).getContext();
+    context->makeGlobalContext();

    Block header;
-    const DataTypeFactory & data_type_factory = DataTypeFactory::instance();

-    for (size_t i = 0, size = structure_vals.size(); i < size; i += 2)
+    ColumnsDescription schema_columns;
+
+    if (structure.empty())
+    {
+        ReadBufferIterator read_buffer_iterator = [&](ColumnsDescription &)
+        {
+            auto file = std::make_unique<ReadBufferFromFileDescriptor>(STDIN_FILENO);
+
+            /// stdin must be seekable
+            auto res = lseek(file->getFD(), 0, SEEK_SET);
+            if (-1 == res)
+                throwFromErrno("Input must be seekable file (it will be read twice).", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+
+            return file;
+        };
+
+        schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const);
+    }
+    else
+    {
+        schema_columns = parseColumnsListFromString(structure, context_const);
+    }
+
+    auto schema_columns_info = schema_columns.getOrdinary();
+
+    for (auto & info : schema_columns_info)
    {
        ColumnWithTypeAndName column;
-        column.name = structure_vals[i];
-        column.type = data_type_factory.get(structure_vals[i + 1]);
+        column.name = info.name;
+        column.type = info.type;
        column.column = column.type->createColumn();
        header.insert(std::move(column));
    }

-    SharedContextHolder shared_context = Context::createShared();
-    auto context = Context::createGlobal(shared_context.get());
-    context->makeGlobalContext();
-
    ReadBufferFromFileDescriptor file_in(STDIN_FILENO);
    WriteBufferFromFileDescriptor file_out(STDOUT_FILENO);

-    if (load_from_file.empty())
+    if (load_from_file.empty() || structure.empty())
    {
        /// stdin must be seekable
        auto res = lseek(file_in.getFD(), 0, SEEK_SET);
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -1036,7 +1036,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
        try
        {
            LOG_DEBUG(
-                log, "Initiailizing merge tree metadata cache lru_cache_size:{} continue_if_corrupted:{}", size, continue_if_corrupted);
+                log, "Initializing merge tree metadata cache lru_cache_size:{} continue_if_corrupted:{}", size, continue_if_corrupted);
            global_context->initializeMergeTreeMetadataCache(path_str + "/" + "rocksdb", size);
        }
        catch (...)
@ -1089,7 +1089,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
        }
    }

-    LOG_DEBUG(log, "Initiailizing interserver credentials.");
+    LOG_DEBUG(log, "Initializing interserver credentials.");
    global_context->updateInterserverCredentials(config());

    if (config().has("macros"))
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -42,6 +42,14 @@ endif ()
 # See `src/Common/TargetSpecific.h`
 option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON)

+if (NO_SSE3_OR_HIGHER)
+    # Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a
+    # result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds
+    # because generic code is compiled with SSE 4.2 anyway. But it breaks SSE2-only builds. Therefore, disable the multitarget code
+    # machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE, but that impacts performance too much.)
+    set(ENABLE_MULTITARGET_CODE OFF)
+endif()
+
 if (ENABLE_MULTITARGET_CODE)
    add_definitions(-DENABLE_MULTITARGET_CODE=1)
 else()
--- a/src/Client/ConnectionEstablisher.cpp
+++ b/src/Client/ConnectionEstablisher.cpp
@ -16,6 +16,7 @@ namespace ErrorCodes
    extern const int ATTEMPT_TO_READ_AFTER_EOF;
    extern const int NETWORK_ERROR;
    extern const int SOCKET_TIMEOUT;
+    extern const int DNS_ERROR;
 }

 ConnectionEstablisher::ConnectionEstablisher(
@ -90,6 +91,7 @@ void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std::
    catch (const Exception & e)
    {
        if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT
+            && e.code() != ErrorCodes::DNS_ERROR
            && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF)
            throw;

--- a/src/Common/ConcurrencyControl.h
+++ b/src/Common/ConcurrencyControl.h
@ -250,7 +250,7 @@ private:
        }
    }

-    SlotCount available(std::unique_lock<std::mutex> &)
+    SlotCount available(std::unique_lock<std::mutex> &) const
    {
        if (cur_concurrency < max_concurrency)
            return max_concurrency - cur_concurrency;
--- a/src/Common/Config/ConfigProcessor.cpp
+++ b/src/Common/Config/ConfigProcessor.cpp
@ -13,6 +13,7 @@
 #include <Poco/DOM/Text.h>
 #include <Poco/DOM/Attr.h>
 #include <Poco/DOM/Comment.h>
+#include <Poco/XML/XMLWriter.h>
 #include <Poco/Util/XMLConfiguration.h>
 #include <Common/ZooKeeper/ZooKeeperNodeCache.h>
 #include <Common/ZooKeeper/KeeperException.h>
@ -729,7 +730,11 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config,
            if (!preprocessed_path_parent.empty())
                fs::create_directories(preprocessed_path_parent);
        }
-        DOMWriter().writeNode(preprocessed_path, loaded_config.preprocessed_xml);
+        DOMWriter writer;
+        writer.setNewLine("\n");
+        writer.setIndent("    ");
+        writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT);
+        writer.writeNode(preprocessed_path, loaded_config.preprocessed_xml);
        LOG_DEBUG(log, "Saved preprocessed configuration to '{}'.", preprocessed_path);
    }
    catch (Poco::Exception & e)
--- a/src/Common/Config/YAMLParser.cpp
+++ b/src/Common/Config/YAMLParser.cpp
@ -26,114 +26,107 @@ namespace ErrorCodes
    extern const int CANNOT_PARSE_YAML;
 }

-/// A prefix symbol in yaml key
-/// We add attributes to nodes by using a prefix symbol in the key part.
-/// Currently we use @ as a prefix symbol. Note, that @ is reserved
-/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
-const char YAML_ATTRIBUTE_PREFIX = '@';
-
 namespace
 {
+    /// A prefix symbol in yaml key
+    /// We add attributes to nodes by using a prefix symbol in the key part.
+    /// Currently we use @ as a prefix symbol. Note, that @ is reserved
+    /// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value
+    const char YAML_ATTRIBUTE_PREFIX = '@';

-Poco::AutoPtr<Poco::XML::Element> createCloneNode(Poco::XML::Element & original_node)
-{
-    Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
-    original_node.parentNode()->appendChild(clone_node);
-    return clone_node;
-}
-
-void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_element)
-{
-    auto * xml_document = parent_xml_element.ownerDocument();
-    switch (node.Type())
+    Poco::AutoPtr<Poco::XML::Element> cloneXMLNode(const Poco::XML::Element & original_node)
    {
-        case YAML::NodeType::Scalar:
-        {
-            std::string value = node.as<std::string>();
-            Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
-            parent_xml_element.appendChild(xml_value);
-            break;
-        }
+        Poco::AutoPtr<Poco::XML::Element> clone_node = original_node.ownerDocument()->createElement(original_node.nodeName());
+        original_node.parentNode()->appendChild(clone_node);
+        return clone_node;
+    }

-        /// We process YAML Sequences as a
-        /// list of <key>value</key> tags with same key and different values.
-        /// For example, we translate this sequence
-        /// seq:
-        ///     - val1
-        ///     - val2
-        ///
-        /// into this:
-        /// <seq>val1</seq>
-        /// <seq>val2</seq>
-        case YAML::NodeType::Sequence:
+    void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_node)
+    {
+        auto * xml_document = parent_xml_node.ownerDocument();
+        switch (node.Type())
        {
-            for (const auto & child_node : node)
-                /// For sequences it depends how we want to process them.
-                /// Sequences of key-value pairs such as:
-                /// seq:
-                ///     - k1: val1
-                ///     - k2: val2
-                /// into xml like this:
-                /// <seq>
-                ///     <k1>val1</k1>
-                ///     <k2>val2</k2>
-                /// </seq>
-                ///
-                /// But, if the sequence is just a list, the root-node needs to be repeated, such as:
-                /// seq:
-                ///     - val1
-                ///     - val2
-                /// into xml like this:
-                /// <seq>val1</seq>
-                /// <seq>val2</seq>
-                ///
-                /// Therefore check what type the child is, for further processing.
-                /// Mixing types (values list or map) will lead to strange results but should not happen.
-                if (parent_xml_element.hasChildNodes() && !child_node.IsMap())
-                {
-                    /// Create a new parent node with same tag for each child node
-                    processNode(child_node, *createCloneNode(parent_xml_element));
-                }
-                else
-                {
-                    /// Map, so don't recreate the parent node but add directly
-                    processNode(child_node, parent_xml_element);
-                }
-            break;
-        }
-        case YAML::NodeType::Map:
-        {
-            for (const auto & key_value_pair : node)
+            case YAML::NodeType::Scalar:
            {
-                const auto & key_node = key_value_pair.first;
-                const auto & value_node = key_value_pair.second;
-                std::string key = key_node.as<std::string>();
-                bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
-                if (is_attribute)
-                {
-                    /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
-                    auto attribute_name = key.substr(1);
-                    std::string value = value_node.as<std::string>();
-                    parent_xml_element.setAttribute(attribute_name, value);
-                }
-                else
-                {
-                    Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
-                    parent_xml_element.appendChild(xml_key);
-                    processNode(value_node, *xml_key);
-                }
+                std::string value = node.as<std::string>();
+                Poco::AutoPtr<Poco::XML::Text> xml_value = xml_document->createTextNode(value);
+                parent_xml_node.appendChild(xml_value);
+                break;
+            }
+
+            /// For sequences we repeat the parent xml node. For example,
+            /// seq:
+            ///     - val1
+            ///     - val2
+            /// is converted into the following xml:
+            /// <seq>val1</seq>
+            /// <seq>val2</seq>
+            ///
+            /// A sequence of mappings is converted in the same way:
+            /// seq:
+            ///     - k1: val1
+            ///       k2: val2
+            ///     - k3: val3
+            /// is converted into the following xml:
+            /// <seq><k1>val1</k1><k2>val2</k2></seq>
+            /// <seq><k3>val3</k3></seq>
+            case YAML::NodeType::Sequence:
+            {
+                size_t i = 0;
+                for (auto it = node.begin(); it != node.end(); ++it, ++i)
+                {
+                    const auto & child_node = *it;
+
+                    bool need_clone_parent_xml_node = (i > 0);
+
+                    if (need_clone_parent_xml_node)
+                    {
+                        /// Create a new parent node with same tag for each child node
+                        processNode(child_node, *cloneXMLNode(parent_xml_node));
+                    }
+                    else
+                    {
+                        /// First element of the sequence: reuse the existing parent node instead of cloning it
+                        processNode(child_node, parent_xml_node);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Map:
+            {
+                for (const auto & key_value_pair : node)
+                {
+                    const auto & key_node = key_value_pair.first;
+                    const auto & value_node = key_value_pair.second;
+                    std::string key = key_node.as<std::string>();
+                    bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar());
+                    if (is_attribute)
+                    {
+                        /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key
+                        auto attribute_name = key.substr(1);
+                        std::string value = value_node.as<std::string>();
+                        parent_xml_node.setAttribute(attribute_name, value);
+                    }
+                    else
+                    {
+                        Poco::AutoPtr<Poco::XML::Element> xml_key = xml_document->createElement(key);
+                        parent_xml_node.appendChild(xml_key);
+                        processNode(value_node, *xml_key);
+                    }
+                }
+                break;
+            }
+
+            case YAML::NodeType::Null: break;
+            case YAML::NodeType::Undefined:
+            {
+                throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
            }
-            break;
-        }
-        case YAML::NodeType::Null: break;
-        case YAML::NodeType::Undefined:
-        {
-            throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file");
        }
    }
 }

-}

 Poco::AutoPtr<Poco::XML::Document> YAMLParser::parse(const String& path)
 {
--- a/src/Common/Elf.cpp
+++ b/src/Common/Elf.cpp
@ -22,13 +22,13 @@ Elf::Elf(const std::string & path)
    /// Check if it's an elf.
    elf_size = in.buffer().size();
    if (elf_size < sizeof(ElfEhdr))
-        throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path);

    mapped = in.buffer().begin();
    header = reinterpret_cast<const ElfEhdr *>(mapped);

    if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0)
-        throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path);

    /// Get section header.
    ElfOff section_header_offset = header->e_shoff;
@ -37,7 +37,7 @@ Elf::Elf(const std::string & path)
    if (!section_header_offset
        || !section_header_num_entries
        || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size)
-        throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path);

    section_headers = reinterpret_cast<const ElfShdr *>(mapped + section_header_offset);

@ -48,11 +48,11 @@ Elf::Elf(const std::string & path)
    });

    if (!section_names_strtab)
-        throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path);

    ElfOff section_names_offset = section_names_strtab->header.sh_offset;
    if (section_names_offset >= elf_size)
-        throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path);

    section_names = reinterpret_cast<const char *>(mapped + section_names_offset);

@ -64,7 +64,7 @@ Elf::Elf(const std::string & path)
    if (!program_header_offset
        || !program_header_num_entries
        || program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size)
-        throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF);
+        throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path);

    program_headers = reinterpret_cast<const ElfPhdr *>(mapped + program_header_offset);
 }
--- a/src/Common/FieldVisitorToString.cpp
+++ b/src/Common/FieldVisitorToString.cpp
@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const

 }

+String convertFieldToString(const Field & field)
+{
+    if (field.getType() == Field::Types::Which::String)
+        return field.get<String>();
+    return applyVisitor(FieldVisitorToString(), field);
 }

+}
--- a/src/Common/FieldVisitorToString.h
+++ b/src/Common/FieldVisitorToString.h
@ -31,5 +31,8 @@ public:
    String operator() (const bool & x) const;
 };

-}
+/// Get value from field and convert it to string.
+/// Also remove quotes from strings.
+String convertFieldToString(const Field & field);

+}
--- a/src/Common/OpenTelemetryTraceContext.cpp
+++ b/src/Common/OpenTelemetryTraceContext.cpp
@ -88,7 +88,13 @@ void Span::addAttribute(std::exception_ptr e) noexcept

 SpanHolder::SpanHolder(std::string_view _operation_name)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    if (!current_thread_trace_context.isTraceEnabled())
+    {
+        return;
+    }
+
+    /// Use try-catch to make sure the ctor is exception safe.
+    try
    {
        this->trace_id = current_thread_trace_context.trace_id;
        this->parent_span_id = current_thread_trace_context.span_id;
@ -97,9 +103,19 @@ SpanHolder::SpanHolder(std::string_view _operation_name)
        this->start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        // set current span id to this
-        current_thread_trace_context.span_id = this->span_id;
+        /// Add new initialization here
    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the span won't be recorded.
+        this->trace_id = UUID();
+        return;
+    }
+
+    /// Set current span as parent of other spans created later on this thread.
+    current_thread_trace_context.span_id = this->span_id;
 }

 void SpanHolder::finish() noexcept
@ -216,7 +232,7 @@ const TracingContextOnThread & CurrentContext()
    return current_thread_trace_context;
 }

-void TracingContextOnThread::reset()
+void TracingContextOnThread::reset() noexcept
 {
    this->trace_id = UUID();
    this->span_id = 0;
@ -231,63 +247,75 @@ TracingContextHolder::TracingContextHolder(
    const Settings * settings_ptr,
    const std::weak_ptr<OpenTelemetrySpanLog> & _span_log)
 {
-    if (current_thread_trace_context.isTraceEnabled())
+    /// Use try-catch to make sure the ctor is exception safe.
+    /// If any exception is raised during the construction, the tracing is not enabled on current thread.
+    try
    {
-        ///
-        /// This is not the normal case,
-        /// it means that construction of current object is not at the start of current thread.
-        /// Usually this is due to:
-        ///    1. bad design
-        ///    2. right design but code changes so that original point where this object is constructing is not the new start execution of current thread
-        ///
-        /// In such case, we should use current context as parent of this new constructing object,
-        /// So this branch ensures this class can be instantiated multiple times on one same thread safely.
-        ///
-        this->is_context_owner = false;
-        this->root_span.trace_id = current_thread_trace_context.trace_id;
-        this->root_span.parent_span_id = current_thread_trace_context.span_id;
+        if (current_thread_trace_context.isTraceEnabled())
+        {
+            ///
+            /// This is not the normal case:
+            /// it means that the current object is not being constructed at the start of the current thread.
+            /// Usually this is due to:
+            ///    1. bad design
+            ///    2. a sound design, but later code changes moved the thread's actual starting point away from the place where this object is constructed
+            ///
+            /// In such a case, we use the current context as the parent of the object being constructed,
+            /// so this branch ensures that this class can be safely instantiated multiple times on the same thread.
+            ///
+            this->is_context_owner = false;
+            this->root_span.trace_id = current_thread_trace_context.trace_id;
+            this->root_span.parent_span_id = current_thread_trace_context.span_id;
+            this->root_span.span_id = thread_local_rng();
+            this->root_span.operation_name = _operation_name;
+            this->root_span.start_time_us
+                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+
+            /// Set the root span as parent of other spans created on current thread
+            current_thread_trace_context.span_id = this->root_span.span_id;
+            return;
+        }
+
+        if (!_parent_trace_context.isTraceEnabled())
+        {
+            if (settings_ptr == nullptr)
+                /// Skip tracing context initialization on current thread
+                return;
+
+            // Start the trace with some configurable probability.
+            std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
+            if (!should_start_trace(thread_local_rng))
+                /// skip tracing context initialization on current thread
+                return;
+
+            while (_parent_trace_context.trace_id == UUID())
+            {
+                // Make sure the random generated trace_id is not 0 which is an invalid id.
+                _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
+                _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
+            }
+            _parent_trace_context.span_id = 0;
+        }
+
+        this->root_span.trace_id = _parent_trace_context.trace_id;
+        this->root_span.parent_span_id = _parent_trace_context.span_id;
        this->root_span.span_id = thread_local_rng();
        this->root_span.operation_name = _operation_name;
        this->root_span.start_time_us
            = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

-        current_thread_trace_context.span_id = this->root_span.span_id;
+        /// Add new initialization here
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__FUNCTION__);
+
+        /// Clear related fields to make sure the tracing is not enabled.
+        this->root_span.trace_id = UUID();
        return;
    }

-    if (!_parent_trace_context.isTraceEnabled())
-    {
-        if (settings_ptr == nullptr)
-            /// skip tracing context initialization on current thread
-            return;
-
-        // start the trace ourselves, with some configurable probability.
-        std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability};
-        if (!should_start_trace(thread_local_rng))
-            /// skip tracing context initialization on current thread
-            return;
-
-        while (_parent_trace_context.trace_id == UUID())
-        {
-            // make sure the random generated trace_id is not 0 which is an invalid id
-            _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656
-            _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656
-        }
-        _parent_trace_context.span_id = 0;
-    }
-
-    this->root_span.trace_id = _parent_trace_context.trace_id;
-    this->root_span.parent_span_id = _parent_trace_context.span_id;
-    this->root_span.span_id = thread_local_rng();
-    this->root_span.operation_name = _operation_name;
-    this->root_span.start_time_us
-        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
-
-    /// This object is created to initialize tracing context on a new thread,
-    /// it's helpful to record the thread_id so that we know the thread switching from the span log
-    this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
-
-    /// set up trace context on current thread
+    /// Set up trace context on current thread only when the root span is successfully initialized.
    current_thread_trace_context = _parent_trace_context;
    current_thread_trace_context.span_id = this->root_span.span_id;
    current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED;
@ -306,6 +334,18 @@ TracingContextHolder::~TracingContextHolder()
        auto shared_span_log = current_thread_trace_context.span_log.lock();
        if (shared_span_log)
        {
+            try
+            {
+                /// This object is created to initialize tracing context on a new thread,
+                /// it's helpful to record the thread_id so that we know the thread switching from the span log
+                this->root_span.addAttribute("clickhouse.thread_id", getThreadId());
+            }
+            catch (...)
+            {
+                /// It's acceptable that the attribute is not recorded in case of any exception,
+                /// so the exception is ignored to try to log the span.
+            }
+
            this->root_span.finish_time_us
                = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();

--- a/src/Common/OpenTelemetryTraceContext.h
+++ b/src/Common/OpenTelemetryTraceContext.h
@ -74,7 +74,7 @@ struct TracingContextOnThread : TracingContext
        return *this;
    }

-    void reset();
+    void reset() noexcept;

    /// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table
    /// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak
--- a/src/Common/OvercommitTracker.h
+++ b/src/Common/OvercommitTracker.h
@ -61,7 +61,7 @@ enum class QueryCancellationState

 // Usually it's hard to set some reasonable hard memory limit
 // (especially, the default value). This class introduces new
-// mechanisim for the limiting of memory usage.
+// mechanism for the limiting of memory usage.
 // Soft limit represents guaranteed amount of memory query/user
 // may use. It's allowed to exceed this limit. But if hard limit
 // is reached, query with the biggest overcommit ratio
@ -82,7 +82,7 @@ protected:
    virtual void pickQueryToExcludeImpl() = 0;

    // This mutex is used to disallow concurrent access
-    // to picked_tracker and cancelation_state variables.
+    // to picked_tracker and cancellation_state variables.
    std::mutex overcommit_m;
    std::condition_variable cv;

--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -286,6 +286,18 @@ The server successfully detected this situation and will download merged part fr
    M(S3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to S3 storage.") \
    M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \
    \
+    M(DiskS3ReadMicroseconds, "Time of GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsCount, "Number of GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to DiskS3 storage.") \
+    M(DiskS3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to DiskS3 storage.") \
+    \
+    M(DiskS3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    M(DiskS3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \
+    \
    M(ReadBufferFromS3Microseconds, "Time spend in reading from S3.") \
    M(ReadBufferFromS3Bytes, "Bytes read from S3.") \
    M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.") \
--- a/src/Common/SymbolIndex.cpp
+++ b/src/Common/SymbolIndex.cpp
@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl
 It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler.
 It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes)
 it is split to separate binary and provided in clickhouse-common-static-dbg package.
-This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line.
+This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line.
 When you build ClickHouse by yourself, debug info is not split and present in a single huge binary.

 What ClickHouse is using to provide good stack traces?
@ -391,10 +391,22 @@ void collectSymbolsFromELF(
    std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem();
    local_debug_info_path += ".debug";
    std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path();
+    debug_info_path += ".debug";

-    if (std::filesystem::exists(local_debug_info_path))
+    /// NOTE: This is a workaround for the current packaging system.
+    ///
+    /// nfpm cannot copy a file only if it exists, so cmake creates
+    /// an empty .debug file instead. Loading such an empty ELF file
+    /// would make Elf::Elf throw the CANNOT_PARSE_ELF exception,
+    /// so empty files are skipped.
+    auto exists_not_empty = [](const std::filesystem::path & path)
+    {
+        return std::filesystem::exists(path) && !std::filesystem::is_empty(path);
+    };
+
+    if (exists_not_empty(local_debug_info_path))
        object_name = local_debug_info_path;
-    else if (std::filesystem::exists(debug_info_path))
+    else if (exists_not_empty(debug_info_path))
        object_name = debug_info_path;
    else if (build_id.size() >= 2)
    {
@ -412,7 +424,7 @@ void collectSymbolsFromELF(

        std::filesystem::path build_id_debug_info_path(
            fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2)));
-        if (std::filesystem::exists(build_id_debug_info_path))
+        if (exists_not_empty(build_id_debug_info_path))
            object_name = build_id_debug_info_path;
        else
            object_name = canonical_path;
--- a/src/Common/ZooKeeper/IKeeper.h
+++ b/src/Common/ZooKeeper/IKeeper.h
@ -80,7 +80,7 @@ enum class Error : int32_t
    ZUNIMPLEMENTED = -6,        /// Operation is unimplemented
    ZOPERATIONTIMEOUT = -7,     /// Operation timeout
    ZBADARGUMENTS = -8,         /// Invalid arguments
-    ZINVALIDSTATE = -9,         /// Invliad zhandle state
+    ZINVALIDSTATE = -9,         /// Invalid zhandle state

    /** API errors.
        * This is never thrown by the server, it shouldn't be used other than
@ -428,6 +428,12 @@ public:
    Exception(const Error code_, const std::string & path); /// NOLINT
    Exception(const Exception & exc);

+    template <typename... Args> /// Builds the message with fmt::format from a format string and its arguments.
+    Exception(const Error code_, fmt::format_string<Args...> fmt, Args &&... args)
+        : Exception(fmt::format(fmt, std::forward<Args>(args)...), code_)
+    {
+    }
+
    const char * name() const noexcept override { return "Coordination::Exception"; }
    const char * className() const noexcept override { return "Coordination::Exception"; }
    Exception * clone() const override { return new Exception(*this); }
@ -439,7 +445,7 @@ public:
 /** Usage scenario:
  * - create an object and issue commands;
  * - you provide callbacks for your commands; callbacks are invoked in internal thread and must be cheap:
-  *   for example, just signal a condvar / fulfull a promise.
+  *   for example, just signal a condvar / fulfill a promise.
  * - you also may provide callbacks for watches; they are also invoked in internal thread and must be cheap.
  * - whenever you receive exception with ZSESSIONEXPIRED code or method isExpired returns true,
  *   the ZooKeeper instance is no longer usable - you may only destroy it and probably create another.
--- a/src/Common/ZooKeeper/TestKeeper.cpp
+++ b/src/Common/ZooKeeper/TestKeeper.cpp
@ -507,15 +507,15 @@ ResponsePtr TestKeeperSyncRequest::createResponse() const { return std::make_sha
 ResponsePtr TestKeeperMultiRequest::createResponse() const { return std::make_shared<MultiResponse>(); }


-TestKeeper::TestKeeper(const String & root_path_, Poco::Timespan operation_timeout_)
-    : root_path(root_path_), operation_timeout(operation_timeout_)
+TestKeeper::TestKeeper(const zkutil::ZooKeeperArgs & args_)
+    : args(args_)
 {
    container.emplace("/", Node());

-    if (!root_path.empty())
+    if (!args.chroot.empty())
    {
-        if (root_path.back() == '/')
-            root_path.pop_back();
+        if (args.chroot.back() == '/')
+            args.chroot.pop_back();
    }

    processing_thread = ThreadFromGlobalPool([this] { processingThread(); });
@ -547,7 +547,7 @@ void TestKeeper::processingThread()
        {
            RequestInfo info;

-            UInt64 max_wait = static_cast<UInt64>(operation_timeout.totalMilliseconds());
+            UInt64 max_wait = static_cast<UInt64>(args.operation_timeout_ms);
            if (requests_queue.tryPop(info, max_wait))
            {
                if (expired)
@ -556,7 +556,7 @@ void TestKeeper::processingThread()

                ++zxid;

-                info.request->addRootPath(root_path);
+                info.request->addRootPath(args.chroot);
                auto [response, _] = info.request->process(container, zxid);

                if (info.watch)
@ -580,7 +580,7 @@ void TestKeeper::processingThread()
                if (response->error == Error::ZOK)
                    info.request->processWatches(watches, list_watches);

-                response->removeRootPath(root_path);
+                response->removeRootPath(args.chroot);
                if (info.callback)
                    info.callback(*response);
            }
@ -689,7 +689,7 @@ void TestKeeper::pushRequest(RequestInfo && request)
        if (expired)
            throw Exception("Session expired", Error::ZSESSIONEXPIRED);

-        if (!requests_queue.tryPush(std::move(request), operation_timeout.totalMilliseconds()))
+        if (!requests_queue.tryPush(std::move(request), args.operation_timeout_ms))
            throw Exception("Cannot push request to queue within operation timeout", Error::ZOPERATIONTIMEOUT);
    }
    catch (...)
--- a/src/Common/ZooKeeper/TestKeeper.h
+++ b/src/Common/ZooKeeper/TestKeeper.h
@ -8,6 +8,7 @@

 #include <Poco/Timespan.h>
 #include <Common/ZooKeeper/IKeeper.h>
+#include <Common/ZooKeeper/ZooKeeperArgs.h>
 #include <Common/ThreadPool.h>
 #include <Common/ConcurrentBoundedQueue.h>

@ -33,7 +34,7 @@ using TestKeeperRequestPtr = std::shared_ptr<TestKeeperRequest>;
 class TestKeeper final : public IKeeper
 {
 public:
-    TestKeeper(const String & root_path_, Poco::Timespan operation_timeout_);
+    TestKeeper(const zkutil::ZooKeeperArgs & args_);
    ~TestKeeper() override;

    bool isExpired() const override { return expired; }
@ -123,10 +124,7 @@ private:

    Container container;

-    String root_path;
-    ACLs default_acls;
-
-    Poco::Timespan operation_timeout;
+    zkutil::ZooKeeperArgs args;

    std::mutex push_request_mutex;
    std::atomic<bool> expired{false};
--- a/src/Common/ZooKeeper/ZooKeeper.cpp
+++ b/src/Common/ZooKeeper/ZooKeeper.cpp
@ -6,20 +6,18 @@
 #include <functional>
 #include <filesystem>

+#include <Common/randomSeed.h>
 #include <base/find_symbols.h>
 #include <base/sort.h>
 #include <base/getFQDNOrHostName.h>
 #include "Common/ZooKeeper/IKeeper.h"
 #include <Common/StringUtils/StringUtils.h>
 #include <Common/Exception.h>
-#include <Common/isLocalAddress.h>

 #include <Poco/Net/NetException.h>
 #include <Poco/Net/DNS.h>


-#define ZOOKEEPER_CONNECTION_TIMEOUT_MS 1000
-
 namespace fs = std::filesystem;

 namespace DB
@ -49,25 +47,19 @@ static void check(Coordination::Error code, const std::string & path)
 }


-void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_,
-                     int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_, const GetPriorityForLoadBalancing & get_priority_load_balancing_)
-{
-    log = &Poco::Logger::get("ZooKeeper");
-    hosts = hosts_;
-    identity = identity_;
-    session_timeout_ms = session_timeout_ms_;
-    operation_timeout_ms = operation_timeout_ms_;
-    chroot = chroot_;
-    implementation = implementation_;
-    get_priority_load_balancing = get_priority_load_balancing_;
+void ZooKeeper::init(ZooKeeperArgs args_)

-    if (implementation == "zookeeper")
+{
+    args = std::move(args_);
+    log = &Poco::Logger::get("ZooKeeper");
+
+    if (args.implementation == "zookeeper")
    {
-        if (hosts.empty())
+        if (args.hosts.empty())
            throw KeeperException("No hosts passed to ZooKeeper constructor.", Coordination::Error::ZBADARGUMENTS);

        Coordination::ZooKeeper::Nodes nodes;
-        nodes.reserve(hosts.size());
+        nodes.reserve(args.hosts.size());

        /// Shuffle the hosts to distribute the load among ZooKeeper nodes.
        std::vector<ShuffleHost> shuffled_hosts = shuffleHosts();
@ -108,33 +100,23 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_
                throw KeeperException("Cannot use any of provided ZooKeeper nodes", Coordination::Error::ZBADARGUMENTS);
        }

-        impl = std::make_unique<Coordination::ZooKeeper>(
-                nodes,
-                chroot,
-                identity_.empty() ? "" : "digest",
-                identity_,
-                Poco::Timespan(0, session_timeout_ms_ * 1000),
-                Poco::Timespan(0, ZOOKEEPER_CONNECTION_TIMEOUT_MS * 1000),
-                Poco::Timespan(0, operation_timeout_ms_ * 1000),
-                zk_log);
+        impl = std::make_unique<Coordination::ZooKeeper>(nodes, args, zk_log);

-        if (chroot.empty())
-            LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(hosts, ","));
+        if (args.chroot.empty())
+            LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(args.hosts, ","));
        else
-            LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(hosts, ","), chroot);
+            LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(args.hosts, ","), args.chroot);
    }
-    else if (implementation == "testkeeper")
+    else if (args.implementation == "testkeeper")
    {
-        impl = std::make_unique<Coordination::TestKeeper>(
-                chroot,
-                Poco::Timespan(0, operation_timeout_ms_ * 1000));
+        impl = std::make_unique<Coordination::TestKeeper>(args);
    }
    else
    {
-        throw DB::Exception("Unknown implementation of coordination service: " + implementation, DB::ErrorCodes::NOT_IMPLEMENTED);
+        throw DB::Exception("Unknown implementation of coordination service: " + args.implementation, DB::ErrorCodes::NOT_IMPLEMENTED);
    }

-    if (!chroot.empty())
+    if (!args.chroot.empty())
    {
        /// Here we check that zk root exists.
        /// This check is clumsy. The reason is we do this request under common mutex, and never want to hung here.
@ -144,7 +126,7 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_
        /// This should not happen now, when memory tracker is disabled.
        /// But let's keep it just in case (it is also easy to backport).
        auto future = asyncExists("/");
-        auto res = future.wait_for(std::chrono::milliseconds(operation_timeout_ms));
+        auto res = future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms));
        if (res != std::future_status::ready)
            throw KeeperException("Cannot check if zookeeper root exists.", Coordination::Error::ZOPERATIONTIMEOUT);

@ -153,18 +135,30 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_
            throw KeeperException(code, "/");

        if (code == Coordination::Error::ZNONODE)
-            throw KeeperException("ZooKeeper root doesn't exist. You should create root node " + chroot + " before start.", Coordination::Error::ZNONODE);
+            throw KeeperException("ZooKeeper root doesn't exist. You should create root node " + args.chroot + " before start.", Coordination::Error::ZNONODE);
    }
 }

+ZooKeeper::ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr<DB::ZooKeeperLog> zk_log_)
+    : zk_log(std::move(zk_log_)) /// Must be set before init(): init() passes zk_log to the Coordination::ZooKeeper it creates.
+{
+    init(args_);
+}
+
+ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr<DB::ZooKeeperLog> zk_log_)
+    : zk_log(std::move(zk_log_))
+{
+    init(ZooKeeperArgs(config, config_name)); /// Parsing the section throws on an unknown key or an invalid value.
+}
+
 std::vector<ShuffleHost> ZooKeeper::shuffleHosts() const
 {
-    std::function<size_t(size_t index)> get_priority = get_priority_load_balancing.getPriorityFunc(get_priority_load_balancing.load_balancing, 0, hosts.size());
+    std::function<size_t(size_t index)> get_priority = args.get_priority_load_balancing.getPriorityFunc(args.get_priority_load_balancing.load_balancing, 0, args.hosts.size());
    std::vector<ShuffleHost> shuffle_hosts;
-    for (size_t i = 0; i < hosts.size(); ++i)
+    for (size_t i = 0; i < args.hosts.size(); ++i)
    {
        ShuffleHost shuffle_host;
-        shuffle_host.host = hosts[i];
+        shuffle_host.host = args.hosts[i];
        if (get_priority)
            shuffle_host.priority = get_priority(i);
        shuffle_host.randomize();
@ -181,125 +175,16 @@ std::vector<ShuffleHost> ZooKeeper::shuffleHosts() const
    return shuffle_hosts;
 }

-ZooKeeper::ZooKeeper(const std::string & hosts_string, const std::string & identity_, int32_t session_timeout_ms_,
-                     int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_,
-                     std::shared_ptr<DB::ZooKeeperLog> zk_log_, const GetPriorityForLoadBalancing & get_priority_load_balancing_)
-{
-    zk_log = std::move(zk_log_);
-    Strings hosts_strings;
-    splitInto<','>(hosts_strings, hosts_string);
-
-    init(implementation_, hosts_strings, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_, get_priority_load_balancing_);
-}
-
-ZooKeeper::ZooKeeper(const Strings & hosts_, const std::string & identity_, int32_t session_timeout_ms_,
-                     int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_,
-                     std::shared_ptr<DB::ZooKeeperLog> zk_log_, const GetPriorityForLoadBalancing & get_priority_load_balancing_)
-{
-    zk_log = std::move(zk_log_);
-    init(implementation_, hosts_, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_, get_priority_load_balancing_);
-}
-
-struct ZooKeeperArgs
-{
-    ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const std::string & config_name)
-    {
-        Poco::Util::AbstractConfiguration::Keys keys;
-        config.keys(config_name, keys);
-
-        session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
-        operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
-        implementation = "zookeeper";
-        for (const auto & key : keys)
-        {
-            if (startsWith(key, "node"))
-            {
-                hosts.push_back(
-                        (config.getBool(config_name + "." + key + ".secure", false) ? "secure://" : "") +
-                        config.getString(config_name + "." + key + ".host") + ":"
-                        + config.getString(config_name + "." + key + ".port", "2181")
-                );
-            }
-            else if (key == "session_timeout_ms")
-            {
-                session_timeout_ms = config.getInt(config_name + "." + key);
-            }
-            else if (key == "operation_timeout_ms")
-            {
-                operation_timeout_ms = config.getInt(config_name + "." + key);
-            }
-            else if (key == "identity")
-            {
-                identity = config.getString(config_name + "." + key);
-            }
-            else if (key == "root")
-            {
-                chroot = config.getString(config_name + "." + key);
-            }
-            else if (key == "implementation")
-            {
-                implementation = config.getString(config_name + "." + key);
-            }
-            else if (key == "zookeeper_load_balancing")
-            {
-                String load_balancing_str = config.getString(config_name + "." + key);
-                /// Use magic_enum to avoid dependency from dbms (`SettingFieldLoadBalancingTraits::fromString(...)`)
-                auto load_balancing = magic_enum::enum_cast<DB::LoadBalancing>(Poco::toUpper(load_balancing_str));
-                if (!load_balancing)
-                    throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unknown load balancing: {}", load_balancing_str);
-                get_priority_load_balancing.load_balancing = *load_balancing;
-            }
-            else
-                throw KeeperException(std::string("Unknown key ") + key + " in config file", Coordination::Error::ZBADARGUMENTS);
-        }
-
-        if (!chroot.empty())
-        {
-            if (chroot.front() != '/')
-                throw KeeperException(std::string("Root path in config file should start with '/', but got ") + chroot, Coordination::Error::ZBADARGUMENTS);
-            if (chroot.back() == '/')
-                chroot.pop_back();
-        }
-
-        /// init get_priority_load_balancing
-        get_priority_load_balancing.hostname_differences.resize(hosts.size());
-        const String & local_hostname = getFQDNOrHostName();
-        for (size_t i = 0; i < hosts.size(); ++i)
-        {
-            const String & node_host = hosts[i].substr(0, hosts[i].find_last_of(':'));
-            get_priority_load_balancing.hostname_differences[i] = DB::getHostNameDifference(local_hostname, node_host);
-        }
-    }
-
-    Strings hosts;
-    std::string identity;
-    int session_timeout_ms;
-    int operation_timeout_ms;
-    std::string chroot;
-    std::string implementation;
-    GetPriorityForLoadBalancing get_priority_load_balancing;
-};
-
-ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr<DB::ZooKeeperLog> zk_log_)
-    : zk_log(std::move(zk_log_))
-{
-    ZooKeeperArgs args(config, config_name);
-    init(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot, args.get_priority_load_balancing);
-}

 bool ZooKeeper::configChanged(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) const
 {
-    ZooKeeperArgs args(config, config_name);
+    ZooKeeperArgs new_args(config, config_name); /// Re-parse the section; throws on an unknown key or an invalid value.

    // skip reload testkeeper cause it's for test and data in memory
-    if (args.implementation == implementation && implementation == "testkeeper")
+    if (new_args.implementation == args.implementation && args.implementation == "testkeeper")
        return false;

-    if (args.get_priority_load_balancing != get_priority_load_balancing)
-        return true;
-
-    return std::tie(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot, args.get_priority_load_balancing)
-        != std::tie(implementation, hosts, identity, session_timeout_ms, operation_timeout_ms, chroot, args.get_priority_load_balancing);
+    return args != new_args; /// Compares all fields through the defaulted ZooKeeperArgs::operator==.
 }


@ -318,7 +203,7 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings
 {
    auto future_result = asyncTryGetChildrenNoThrow(path, watch_callback, list_request_type);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::List), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -385,7 +270,7 @@ Coordination::Error ZooKeeper::createImpl(const std::string & path, const std::s
 {
    auto future_result = asyncTryCreateNoThrow(path, data, mode);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Create), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -455,7 +340,7 @@ Coordination::Error ZooKeeper::removeImpl(const std::string & path, int32_t vers
    auto future_result = asyncTryRemoveNoThrow(path, version);


-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Remove), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -487,7 +372,7 @@ Coordination::Error ZooKeeper::existsImpl(const std::string & path, Coordination
 {
    auto future_result = asyncTryExistsNoThrow(path, watch_callback);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Exists), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -521,7 +406,7 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r
 {
    auto future_result = asyncTryGetNoThrow(path, watch_callback);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Get), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -593,7 +478,7 @@ Coordination::Error ZooKeeper::setImpl(const std::string & path, const std::stri
 {
    auto future_result = asyncTrySetNoThrow(path, data, version);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Set), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -645,7 +530,7 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests

    auto future_result = asyncTryMultiNoThrow(requests);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Multi), requests[0]->getPath()));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -679,7 +564,7 @@ Coordination::Error ZooKeeper::syncImpl(const std::string & path, std::string &
 {
    auto future_result = asyncTrySyncNoThrow(path);

-    if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
+    if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Sync), path));
        return Coordination::Error::ZOPERATIONTIMEOUT;
@ -884,7 +769,7 @@ void ZooKeeper::waitForEphemeralToDisappearIfAny(const std::string & path)
    if (!tryGet(path, content, nullptr, eph_node_disappeared))
        return;

-    int32_t timeout_ms = 3 * session_timeout_ms;
+    int32_t timeout_ms = 3 * args.session_timeout_ms;
    if (!eph_node_disappeared->tryWait(timeout_ms))
        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR,
                            "Ephemeral node {} still exists after {}s, probably it's owned by someone else. "
@ -894,7 +779,7 @@ void ZooKeeper::waitForEphemeralToDisappearIfAny(const std::string & path)

 ZooKeeperPtr ZooKeeper::startNewSession() const
 {
-    return std::make_shared<ZooKeeper>(hosts, identity, session_timeout_ms, operation_timeout_ms, chroot, implementation, zk_log, get_priority_load_balancing);
+    return std::make_shared<ZooKeeper>(args, zk_log); /// The new session reuses this instance's arguments and log.
 }


--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@ -13,7 +13,7 @@
 #include <Common/Stopwatch.h>
 #include <Common/ZooKeeper/IKeeper.h>
 #include <Common/ZooKeeper/ZooKeeperConstants.h>
-#include <Common/GetPriorityForLoadBalancing.h>
+#include <Common/ZooKeeper/ZooKeeperArgs.h>
 #include <Common/thread_local_rng.h>
 #include <unistd.h>
 #include <random>
@ -72,24 +72,11 @@ using GetPriorityForLoadBalancing = DB::GetPriorityForLoadBalancing;
 class ZooKeeper
 {
 public:
+
    using Ptr = std::shared_ptr<ZooKeeper>;

-    /// hosts_string -- comma separated [secure://]host:port list
-    explicit ZooKeeper(const std::string & hosts_string, const std::string & identity_ = "",
-              int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS,
-              int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS,
-              const std::string & chroot_ = "",
-              const std::string & implementation_ = "zookeeper",
-              std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr,
-              const GetPriorityForLoadBalancing & get_priority_load_balancing_ = {});
+    ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr);

-    explicit ZooKeeper(const Strings & hosts_, const std::string & identity_ = "",
-              int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS,
-              int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS,
-              const std::string & chroot_ = "",
-              const std::string & implementation_ = "zookeeper",
-              std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr,
-              const GetPriorityForLoadBalancing & get_priority_load_balancing_ = {});

    /** Config of the form:
        <zookeeper>
@ -337,8 +324,7 @@ public:
 private:
    friend class EphemeralNodeHolder;

-    void init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_,
-              int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_, const GetPriorityForLoadBalancing & get_priority_load_balancing_);
+    void init(ZooKeeperArgs args_);

    /// The following methods don't any throw exceptions but return error codes.
    Coordination::Error createImpl(const std::string & path, const std::string & data, int32_t mode, std::string & path_created);
@ -358,20 +344,13 @@ private:

    std::unique_ptr<Coordination::IKeeper> impl;

-    Strings hosts;
-    std::string identity;
-    int32_t session_timeout_ms;
-    int32_t operation_timeout_ms;
-    std::string chroot;
-    std::string implementation;
+    ZooKeeperArgs args;

    std::mutex mutex;

    Poco::Logger * log = nullptr;
    std::shared_ptr<DB::ZooKeeperLog> zk_log;

-    GetPriorityForLoadBalancing get_priority_load_balancing;
-
    AtomicStopwatch session_uptime;
 };

--- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp
@ -0,0 +1,124 @@
+#include <Common/ZooKeeper/ZooKeeperArgs.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <base/find_symbols.h>
+#include <base/getFQDNOrHostName.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include <Common/isLocalAddress.h>
+#include <Poco/String.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+}
+
+namespace zkutil
+{
+
+/// Builds the arguments from the `config_name` section of `config`.
+/// Keys that are absent keep the defaults declared in ZooKeeperArgs.h.
+/// Throws on an unknown key, an unknown load balancing value, a root path
+/// that does not start with '/', or a negative timeout.
+ZooKeeperArgs::ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const String & config_name)
+{
+    Poco::Util::AbstractConfiguration::Keys keys;
+    config.keys(config_name, keys);
+
+    for (const auto & key : keys)
+    {
+        /// Full path of the current key inside the configuration.
+        const String key_path = config_name + "." + key;
+
+        if (key.starts_with("node"))
+        {
+            hosts.push_back(
+                (config.getBool(key_path + ".secure", false) ? "secure://" : "")
+                + config.getString(key_path + ".host") + ":" + config.getString(key_path + ".port", "2181"));
+        }
+        else if (key == "session_timeout_ms")
+        {
+            session_timeout_ms = config.getInt(key_path);
+        }
+        else if (key == "operation_timeout_ms")
+        {
+            operation_timeout_ms = config.getInt(key_path);
+        }
+        else if (key == "connection_timeout_ms")
+        {
+            connection_timeout_ms = config.getInt(key_path);
+        }
+        else if (key == "send_fault_probability")
+        {
+            send_fault_probability = config.getDouble(key_path);
+        }
+        else if (key == "recv_fault_probability")
+        {
+            recv_fault_probability = config.getDouble(key_path);
+        }
+        else if (key == "identity")
+        {
+            identity = config.getString(key_path);
+            /// A non-empty identity selects the "digest" authentication scheme.
+            if (!identity.empty())
+                auth_scheme = "digest";
+        }
+        else if (key == "root")
+        {
+            chroot = config.getString(key_path);
+        }
+        else if (key == "implementation")
+        {
+            implementation = config.getString(key_path);
+        }
+        else if (key == "zookeeper_load_balancing")
+        {
+            String load_balancing_str = config.getString(key_path);
+            /// Use magic_enum to avoid a dependency on dbms (`SettingFieldLoadBalancingTraits::fromString(...)`)
+            auto load_balancing = magic_enum::enum_cast<DB::LoadBalancing>(Poco::toUpper(load_balancing_str));
+            if (!load_balancing)
+                throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unknown load balancing: {}", load_balancing_str);
+            get_priority_load_balancing.load_balancing = *load_balancing;
+        }
+        else
+            throw KeeperException(Coordination::Error::ZBADARGUMENTS, "Unknown key {} in config file", key);
+    }
+
+    if (!chroot.empty())
+    {
+        if (chroot.front() != '/')
+            throw KeeperException(
+                Coordination::Error::ZBADARGUMENTS,
+                "Root path in config file should start with '/', but got {}", chroot);
+        /// The root is stored without a trailing slash.
+        if (chroot.back() == '/')
+            chroot.pop_back();
+    }
+
+    if (session_timeout_ms < 0 || operation_timeout_ms < 0 || connection_timeout_ms < 0)
+        throw KeeperException("Timeout cannot be negative", Coordination::Error::ZBADARGUMENTS);
+
+    /// init get_priority_load_balancing
+    get_priority_load_balancing.hostname_differences.resize(hosts.size());
+    const String & local_hostname = getFQDNOrHostName();
+    for (size_t i = 0; i < hosts.size(); ++i)
+    {
+        /// Everything before the last ':', i.e. the entry without its port.
+        /// NOTE(review): for secure nodes this still contains the "secure://" prefix added above -
+        /// confirm that the prefix is meant to take part in the hostname comparison.
+        const String node_host = hosts[i].substr(0, hosts[i].find_last_of(':'));
+        get_priority_load_balancing.hostname_differences[i] = DB::getHostNameDifference(local_hostname, node_host);
+    }
+}
+
+/// Builds the arguments from a comma separated [secure://]host:port list;
+/// every other field keeps its default value.
+/// NOTE(review): hostname_differences is left empty here, unlike in the config constructor - confirm this is intended.
+ZooKeeperArgs::ZooKeeperArgs(const String & hosts_string)
+{
+    splitInto<','>(hosts, hosts_string);
+}
+
+}
--- a/src/Common/ZooKeeper/ZooKeeperArgs.h
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.h
@ -0,0 +1,37 @@
+#pragma once
+#include <Common/ZooKeeper/Types.h>
+#include <Common/ZooKeeper/ZooKeeperConstants.h>
+#include <Common/GetPriorityForLoadBalancing.h>
+
+namespace Poco::Util
+{
+    class AbstractConfiguration;
+}
+
+namespace zkutil
+{
+
+struct ZooKeeperArgs
+{
+    ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const String & config_name);
+
+    /// hosts_string -- comma separated [secure://]host:port list
+    ZooKeeperArgs(const String & hosts_string);
+    ZooKeeperArgs() = default;
+    bool operator == (const ZooKeeperArgs &) const = default;
+
+    String implementation = "zookeeper";
+    Strings hosts;
+    String auth_scheme;
+    String identity;
+    String chroot;
+    int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
+    int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
+    int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
+    float send_fault_probability = 0;
+    float recv_fault_probability = 0;
+
+    DB::GetPriorityForLoadBalancing get_priority_load_balancing;
+};
+
+}
--- a/src/Common/ZooKeeper/ZooKeeperConstants.h
+++ b/src/Common/ZooKeeper/ZooKeeperConstants.h
@ -56,5 +56,6 @@ static constexpr int32_t DEFAULT_SESSION_TIMEOUT_MS = 30000;
 static constexpr int32_t DEFAULT_MIN_SESSION_TIMEOUT_MS = 10000;
 static constexpr int32_t DEFAULT_MAX_SESSION_TIMEOUT_MS = 100000;
 static constexpr int32_t DEFAULT_OPERATION_TIMEOUT_MS = 10000;
+static constexpr int32_t DEFAULT_CONNECTION_TIMEOUT_MS = 1000;

 }
--- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
@ -276,15 +276,15 @@ void ZooKeeper::read(T & x)
    Coordination::read(x, *in);
 }

-static void removeRootPath(String & path, const String & root_path)
+static void removeRootPath(String & path, const String & chroot)
 {
-    if (root_path.empty())
+    if (chroot.empty())
        return;

-    if (path.size() <= root_path.size())
-        throw Exception("Received path is not longer than root_path", Error::ZDATAINCONSISTENCY);
+    if (path.size() <= chroot.size())
+        throw Exception(Error::ZDATAINCONSISTENCY, "Received path is not longer than chroot");

-    path = path.substr(root_path.size());
+    path = path.substr(chroot.size());
 }

 ZooKeeper::~ZooKeeper()
@ -308,27 +308,20 @@ ZooKeeper::~ZooKeeper()

 ZooKeeper::ZooKeeper(
    const Nodes & nodes,
-    const String & root_path_,
-    const String & auth_scheme,
-    const String & auth_data,
-    Poco::Timespan session_timeout_,
-    Poco::Timespan connection_timeout,
-    Poco::Timespan operation_timeout_,
+    const zkutil::ZooKeeperArgs & args_,
    std::shared_ptr<ZooKeeperLog> zk_log_)
-    : root_path(root_path_),
-    session_timeout(session_timeout_),
-    operation_timeout(std::min(operation_timeout_, session_timeout_))
+    : args(args_)
 {
    log = &Poco::Logger::get("ZooKeeperClient");
    std::atomic_store(&zk_log, std::move(zk_log_));

-    if (!root_path.empty())
+    if (!args.chroot.empty())
    {
-        if (root_path.back() == '/')
-            root_path.pop_back();
+        if (args.chroot.back() == '/')
+            args.chroot.pop_back();
    }

-    if (auth_scheme.empty())
+    if (args.auth_scheme.empty())
    {
        ACL acl;
        acl.permissions = ACL::All;
@ -345,10 +338,22 @@ ZooKeeper::ZooKeeper(
        default_acls.emplace_back(std::move(acl));
    }

-    connect(nodes, connection_timeout);

-    if (!auth_scheme.empty())
-        sendAuth(auth_scheme, auth_data);
+    /// It makes sense (especially, for async requests) to inject a fault in two places:
+    /// pushRequest (before request is sent) and receiveEvent (after request was executed).
+    if (0 < args.send_fault_probability && args.send_fault_probability <= 1)
+    {
+        send_inject_fault.emplace(args.send_fault_probability);
+    }
+    if (0 < args.recv_fault_probability && args.recv_fault_probability <= 1)
+    {
+        recv_inject_fault.emplace(args.recv_fault_probability);
+    }
+
+    connect(nodes, args.connection_timeout_ms * 1000);
+
+    if (!args.auth_scheme.empty())
+        sendAuth(args.auth_scheme, args.identity);

    send_thread = ThreadFromGlobalPool([this] { sendThread(); });
    receive_thread = ThreadFromGlobalPool([this] { receiveThread(); });
@ -364,7 +369,7 @@ void ZooKeeper::connect(
    Poco::Timespan connection_timeout)
 {
    if (nodes.empty())
-        throw Exception("No nodes passed to ZooKeeper constructor", Error::ZBADARGUMENTS);
+        throw Exception(Error::ZBADARGUMENTS, "No nodes passed to ZooKeeper constructor");

    static constexpr size_t num_tries = 3;
    bool connected = false;
@ -394,8 +399,8 @@ void ZooKeeper::connect(
                socket.connect(node.address, connection_timeout);
                socket_address = socket.peerAddress();

-                socket.setReceiveTimeout(operation_timeout);
-                socket.setSendTimeout(operation_timeout);
+                socket.setReceiveTimeout(args.operation_timeout_ms * 1000);
+                socket.setSendTimeout(args.operation_timeout_ms * 1000);
                socket.setNoDelay(true);

                in.emplace(socket);
@ -453,7 +458,7 @@ void ZooKeeper::connect(
        }

        message << fail_reasons.str() << "\n";
-        throw Exception(message.str(), Error::ZCONNECTIONLOSS);
+        throw Exception(Error::ZCONNECTIONLOSS, message.str());
    }
    else
    {
@ -466,7 +471,7 @@ void ZooKeeper::sendHandshake()
 {
    int32_t handshake_length = 44;
    int64_t last_zxid_seen = 0;
-    int32_t timeout = session_timeout.totalMilliseconds();
+    int32_t timeout = args.session_timeout_ms;
    int64_t previous_session_id = 0;    /// We don't support session restore. So previous session_id is always zero.
    constexpr int32_t passwd_len = 16;
    std::array<char, passwd_len> passwd {};
@ -491,7 +496,7 @@ void ZooKeeper::receiveHandshake()

    read(handshake_length);
    if (handshake_length != SERVER_HANDSHAKE_LENGTH)
-        throw Exception("Unexpected handshake length received: " + DB::toString(handshake_length), Error::ZMARSHALLINGERROR);
+        throw Exception(Error::ZMARSHALLINGERROR, "Unexpected handshake length received: {}", handshake_length);

    read(protocol_version_read);
    if (protocol_version_read != ZOOKEEPER_PROTOCOL_VERSION)
@ -500,15 +505,15 @@ void ZooKeeper::receiveHandshake()
        /// It's better for faster failover than just connection drop.
        /// Implemented in clickhouse-keeper.
        if (protocol_version_read == KEEPER_PROTOCOL_VERSION_CONNECTION_REJECT)
-            throw Exception("Keeper server rejected the connection during the handshake. Possibly it's overloaded, doesn't see leader or stale", Error::ZCONNECTIONLOSS);
+            throw Exception(Error::ZCONNECTIONLOSS, "Keeper server rejected the connection during the handshake. Possibly it's overloaded, doesn't see leader or stale");
        else
-            throw Exception("Unexpected protocol version: " + DB::toString(protocol_version_read), Error::ZMARSHALLINGERROR);
+            throw Exception(Error::ZMARSHALLINGERROR, "Unexpected protocol version: {}", protocol_version_read);
    }

    read(timeout);
-    if (timeout != session_timeout.totalMilliseconds())
+    if (timeout != args.session_timeout_ms)
        /// Use timeout from server.
-        session_timeout = timeout * Poco::Timespan::MILLISECONDS;
+        args.session_timeout_ms = timeout;

    read(session_id);
    read(passwd);
@ -535,17 +540,15 @@ void ZooKeeper::sendAuth(const String & scheme, const String & data)
    read(err);

    if (read_xid != AUTH_XID)
-        throw Exception("Unexpected event received in reply to auth request: " + DB::toString(read_xid),
-            Error::ZMARSHALLINGERROR);
+        throw Exception(Error::ZMARSHALLINGERROR, "Unexpected event received in reply to auth request: {}", read_xid);

    int32_t actual_length = in->count() - count_before_event;
    if (length != actual_length)
-        throw Exception("Response length doesn't match. Expected: " + DB::toString(length) + ", actual: " + DB::toString(actual_length),
-            Error::ZMARSHALLINGERROR);
+        throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", length, actual_length);

    if (err != Error::ZOK)
-        throw Exception("Error received in reply to auth request. Code: " + DB::toString(static_cast<int32_t>(err)) + ". Message: " + String(errorMessage(err)),
-            Error::ZMARSHALLINGERROR);
+        throw Exception(Error::ZMARSHALLINGERROR, "Error received in reply to auth request. Code: {}. Message: {}",
+                        static_cast<int32_t>(err), errorMessage(err));
 }


@ -562,14 +565,14 @@ void ZooKeeper::sendThread()
            auto prev_bytes_sent = out->count();

            auto now = clock::now();
-            auto next_heartbeat_time = prev_heartbeat_time + std::chrono::milliseconds(session_timeout.totalMilliseconds() / 3);
+            auto next_heartbeat_time = prev_heartbeat_time + std::chrono::milliseconds(args.session_timeout_ms / 3);

            if (next_heartbeat_time > now)
            {
                /// Wait for the next request in queue. No more than operation timeout. No more than until next heartbeat time.
                UInt64 max_wait = std::min(
                    static_cast<UInt64>(std::chrono::duration_cast<std::chrono::milliseconds>(next_heartbeat_time - now).count()),
-                    static_cast<UInt64>(operation_timeout.totalMilliseconds()));
+                    static_cast<UInt64>(args.operation_timeout_ms));

                RequestInfo info;
                if (requests_queue.tryPop(info, max_wait))
@ -594,7 +597,7 @@ void ZooKeeper::sendThread()
                        break;
                    }

-                    info.request->addRootPath(root_path);
+                    info.request->addRootPath(args.chroot);

                    info.request->probably_sent = true;
                    info.request->write(*out);
@ -633,13 +636,13 @@ void ZooKeeper::receiveThread()

    try
    {
-        Int64 waited = 0;
+        Int64 waited_us = 0;
        while (!requests_queue.isFinished())
        {
            auto prev_bytes_received = in->count();

            clock::time_point now = clock::now();
-            UInt64 max_wait = operation_timeout.totalMicroseconds();
+            UInt64 max_wait_us = args.operation_timeout_ms * 1000;
            std::optional<RequestInfo> earliest_operation;

            {
@ -648,30 +651,32 @@ void ZooKeeper::receiveThread()
                {
                    /// Operations are ordered by xid (and consequently, by time).
                    earliest_operation = operations.begin()->second;
-                    auto earliest_operation_deadline = earliest_operation->time + std::chrono::microseconds(operation_timeout.totalMicroseconds());
+                    auto earliest_operation_deadline = earliest_operation->time + std::chrono::microseconds(args.operation_timeout_ms * 1000);
                    if (now > earliest_operation_deadline)
-                        throw Exception("Operation timeout (deadline already expired) for path: " + earliest_operation->request->getPath(), Error::ZOPERATIONTIMEOUT);
-                    max_wait = std::chrono::duration_cast<std::chrono::microseconds>(earliest_operation_deadline - now).count();
+                        throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (deadline already expired) for path: {}",
+                                        earliest_operation->request->getPath());
+                    max_wait_us = std::chrono::duration_cast<std::chrono::microseconds>(earliest_operation_deadline - now).count();
                }
            }

-            if (in->poll(max_wait))
+            if (in->poll(max_wait_us))
            {
                if (requests_queue.isFinished())
                    break;

                receiveEvent();
-                waited = 0;
+                waited_us = 0;
            }
            else
            {
                if (earliest_operation)
                {
-                    throw Exception("Operation timeout (no response) for request " + toString(earliest_operation->request->getOpNum()) + " for path: " + earliest_operation->request->getPath(), Error::ZOPERATIONTIMEOUT);
+                    throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (no response) for request {} for path: {}",
+                                    earliest_operation->request->getOpNum(), earliest_operation->request->getPath());
                }
-                waited += max_wait;
-                if (waited >= session_timeout.totalMicroseconds())
-                    throw Exception("Nothing is received in session timeout", Error::ZOPERATIONTIMEOUT);
+                waited_us += max_wait_us;
+                if (waited_us >= args.session_timeout_ms * 1000)
+                    throw Exception(Error::ZOPERATIONTIMEOUT, "Nothing is received in session timeout");

            }

@ -703,10 +708,13 @@ void ZooKeeper::receiveEvent()
    ZooKeeperResponsePtr response;
    UInt64 elapsed_ms = 0;

+    if (unlikely(recv_inject_fault) && recv_inject_fault.value()(thread_local_rng))
+        throw Exception(Error::ZSESSIONEXPIRED, "Session expired (fault injected on recv)");
+
    if (xid == PING_XID)
    {
        if (err != Error::ZOK)
-            throw Exception("Received error in heartbeat response: " + String(errorMessage(err)), Error::ZRUNTIMEINCONSISTENCY);
+            throw Exception(Error::ZRUNTIMEINCONSISTENCY, "Received error in heartbeat response: {}", errorMessage(err));

        response = std::make_shared<ZooKeeperHeartbeatResponse>();
    }
@ -781,7 +789,7 @@ void ZooKeeper::receiveEvent()
        else
        {
            response->readImpl(*in);
-            response->removeRootPath(root_path);
+            response->removeRootPath(args.chroot);
        }
        /// Instead of setting the watch in sendEvent, set it in receiveEvent because need to check the response.
        /// The watch shouldn't be set if the node does not exist and it will never exist like sequential ephemeral nodes.
@ -801,9 +809,9 @@ void ZooKeeper::receiveEvent()
            {
                CurrentMetrics::add(CurrentMetrics::ZooKeeperWatch);

-                /// The key of wathces should exclude the root_path
+                /// The key of watches should exclude the args.chroot
                String req_path = request_info.request->getPath();
-                removeRootPath(req_path, root_path);
+                removeRootPath(req_path, args.chroot);
                std::lock_guard lock(watches_mutex);
                watches[req_path].emplace_back(std::move(request_info.watch));
            }
@ -811,7 +819,7 @@ void ZooKeeper::receiveEvent()

        int32_t actual_length = in->count() - count_before_event;
        if (length != actual_length)
-            throw Exception("Response length doesn't match. Expected: " + DB::toString(length) + ", actual: " + DB::toString(actual_length), Error::ZMARSHALLINGERROR);
+            throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", length, actual_length);

        logOperationIfNeeded(request_info.request, response, /* finalize= */ false, elapsed_ms);   //-V614
    }
@ -1035,9 +1043,9 @@ void ZooKeeper::pushRequest(RequestInfo && info)
        {
            info.request->xid = next_xid.fetch_add(1);
            if (info.request->xid == CLOSE_XID)
-                throw Exception("xid equal to close_xid", Error::ZSESSIONEXPIRED);
+                throw Exception(Error::ZSESSIONEXPIRED, "xid equal to close_xid");
            if (info.request->xid < 0)
-                throw Exception("XID overflow", Error::ZSESSIONEXPIRED);
+                throw Exception(Error::ZSESSIONEXPIRED, "XID overflow");

            if (auto * multi_request = dynamic_cast<ZooKeeperMultiRequest *>(info.request.get()))
            {
@ -1046,12 +1054,15 @@ void ZooKeeper::pushRequest(RequestInfo && info)
            }
        }

-        if (!requests_queue.tryPush(std::move(info), operation_timeout.totalMilliseconds()))
+        if (unlikely(send_inject_fault) && send_inject_fault.value()(thread_local_rng))
+            throw Exception(Error::ZSESSIONEXPIRED, "Session expired (fault injected on send)");
+
+        if (!requests_queue.tryPush(std::move(info), args.operation_timeout_ms))
        {
            if (requests_queue.isFinished())
-                throw Exception("Session expired", Error::ZSESSIONEXPIRED);
+                throw Exception(Error::ZSESSIONEXPIRED, "Session expired");

-            throw Exception("Cannot push request to queue within operation timeout", Error::ZOPERATIONTIMEOUT);
+            throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push request to queue within operation timeout");
        }
    }
    catch (...)
@ -1079,7 +1090,7 @@ void ZooKeeper::initApiVersion()
    };

    get(keeper_api_version_path, std::move(callback), {});
-    if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready)
+    if (future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready)
    {
        LOG_TRACE(log, "Failed to get API version: timeout");
        return;
@ -1220,7 +1231,7 @@ void ZooKeeper::list(
    if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST)
    {
        if (list_request_type != ListRequestType::ALL)
-            throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS);
+            throw Exception(Error::ZBADARGUMENTS, "Filtered list request type cannot be used because it's not supported by the server");

        request = std::make_shared<ZooKeeperListRequest>();
    }
@ -1299,8 +1310,8 @@ void ZooKeeper::close()
    RequestInfo request_info;
    request_info.request = std::make_shared<ZooKeeperCloseRequest>(std::move(request));

-    if (!requests_queue.tryPush(std::move(request_info), operation_timeout.totalMilliseconds()))
-        throw Exception("Cannot push close request to queue within operation timeout", Error::ZOPERATIONTIMEOUT);
+    if (!requests_queue.tryPush(std::move(request_info), args.operation_timeout_ms))
+        throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push close request to queue within operation timeout");

    ProfileEvents::increment(ProfileEvents::ZooKeeperClose);
 }
--- a/src/Common/ZooKeeper/ZooKeeperImpl.h
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.h
@ -7,6 +7,7 @@
 #include <Common/ThreadPool.h>
 #include <Common/ZooKeeper/IKeeper.h>
 #include <Common/ZooKeeper/ZooKeeperCommon.h>
+#include <Common/ZooKeeper/ZooKeeperArgs.h>
 #include <Coordination/KeeperConstants.h>

 #include <IO/ReadBuffer.h>
@ -27,6 +28,7 @@
 #include <cstdint>
 #include <optional>
 #include <functional>
+#include <random>


 /** ZooKeeper C++ library, a replacement for libzookeeper.
@ -111,12 +113,7 @@ public:
      */
    ZooKeeper(
        const Nodes & nodes,
-        const String & root_path,
-        const String & auth_scheme,
-        const String & auth_data,
-        Poco::Timespan session_timeout_,
-        Poco::Timespan connection_timeout,
-        Poco::Timespan operation_timeout_,
+        const zkutil::ZooKeeperArgs & args_,
        std::shared_ptr<ZooKeeperLog> zk_log_);

    ~ZooKeeper() override;
@ -201,11 +198,12 @@ public:
    void setZooKeeperLog(std::shared_ptr<DB::ZooKeeperLog> zk_log_);

 private:
-    String root_path;
    ACLs default_acls;

-    Poco::Timespan session_timeout;
-    Poco::Timespan operation_timeout;
+    zkutil::ZooKeeperArgs args;
+
+    std::optional<std::bernoulli_distribution> send_inject_fault;
+    std::optional<std::bernoulli_distribution> recv_inject_fault;

    Poco::Net::StreamSocket socket;
    /// To avoid excessive getpeername(2) calls.
--- a/src/Common/ZooKeeper/examples/zkutil_test_async.cpp
+++ b/src/Common/ZooKeeper/examples/zkutil_test_async.cpp
@ -5,7 +5,7 @@
 int main(int argc, char ** argv)
 try
 {
-    zkutil::ZooKeeper zookeeper{"localhost:2181"};
+    zkutil::ZooKeeper zookeeper{zkutil::ZooKeeperArgs("localhost:2181")};

    auto nodes = zookeeper.getChildren("/tmp");

--- a/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp
+++ b/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp
@ -16,7 +16,7 @@ try
        return 1;
    }

-    ZooKeeper zk(argv[1], "", 5000);
+    ZooKeeper zk{zkutil::ZooKeeperArgs(argv[1])};

    std::cout << "create path" << std::endl;
    zk.create("/test", "old", zkutil::CreateMode::Persistent);
--- a/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp
+++ b/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp
@ -40,7 +40,8 @@ try
    }


-    ZooKeeper zk(nodes, {}, {}, {}, {5, 0}, {0, 50000}, {0, 50000}, nullptr);
+    zkutil::ZooKeeperArgs args;
+    ZooKeeper zk(nodes, args, nullptr);

    Poco::Event event(true);

--- a/src/Common/tests/gtest_merge_configs.cpp
+++ b/src/Common/tests/gtest_merge_configs.cpp
@ -43,11 +43,8 @@ clickhouse:
    text_log:
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
@ -112,11 +109,8 @@ clickhouse:
    text_log :
        database: system
        table: text_log
-        partition_by:
-            "@remove": "1"
-        engine:
-            - "@replace" : "1"
-            - "ENGINE MergeTree"
+        partition_by: {"@remove": "1"}
+        engine: "ENGINE MergeTree"
        flush_interval_milliseconds: 7500
        level: debug
 )YAML";
--- a/src/Common/tests/gtest_yaml_parser.cpp
+++ b/src/Common/tests/gtest_yaml_parser.cpp
@ -13,40 +13,12 @@

 using namespace DB;

-TEST(Common, YamlParserInvalidFile)
+TEST(YamlParser, InvalidFile)
 {
    ASSERT_THROW(YAMLParser::parse("some-non-existing-file.yaml"), Exception);
 }

-TEST(Common, YamlParserProcessKeysList)
-{
-    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
-operator:
-    access_management: "1"
-    networks:
-      - ip: "10.1.6.168"
-      - ip: "::1"
-      - ip: "127.0.0.1"
-)YAML");
-    SCOPE_EXIT({ yaml_file->remove(); });
-
-    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
-    auto *p_node = xml->getNodeByPath("/clickhouse");
-    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
-<operator>
-<access_management>1</access_management>
-<networks>
-<ip>10.1.6.168</ip>
-<ip>::1</ip>
-<ip>127.0.0.1</ip>
-</networks>
-</operator>
-</clickhouse>
-)CONFIG");
-
-}
-
-TEST(Common, YamlParserProcessValuesList)
+TEST(YamlParser, ProcessValuesList)
 {
    auto yaml_file = getFileWithContents("values-list.yaml", R"YAML(
 rules:
@ -75,4 +47,141 @@ rules:
 )CONFIG");

 }
+
+TEST(YamlParser, ProcessKeysList)
+{
+    auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML(
+operator:
+    access_management: 1
+    networks:
+        ip:
+          - 10.1.6.168
+          - ::1
+          - 127.0.0.1
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<operator>
+<access_management>1</access_management>
+<networks>
+<ip>10.1.6.168</ip>
+<ip>::1</ip>
+<ip>127.0.0.1</ip>
+</networks>
+</operator>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ProcessListAttributes)
+{
+    auto yaml_file = getFileWithContents("list_attributes.yaml", R"YAML(
+seq:
+  - "@attr1": x
+  - k1: val1
+    k2: val2
+    "@attr2": y
+  - k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<seq attr1="x"></seq>
+<seq attr2="y">
+<k1>val1</k1>
+<k2>val2</k2>
+</seq>
+<seq attr3="z">
+<k3>val3</k3>
+</seq>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ProcessMapAttributes)
+{
+    auto yaml_file = getFileWithContents("map_attributes.yaml", R"YAML(
+map:
+    "@attr1": x
+    k1: val1
+    k2: val2
+    "@attr2": y
+    k3: val3
+    "@attr3": z
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<map attr1="x" attr2="y" attr3="z">
+<k1>val1</k1>
+<k2>val2</k2>
+<k3>val3</k3>
+</map>
+</clickhouse>
+)CONFIG");
+
+}
+
+TEST(YamlParser, ClusterDef)
+{
+    auto yaml_file = getFileWithContents("cluster_def.yaml", R"YAML(
+test_cluster:
+    shard:
+        - internal_replication: false
+          replica:
+              - host: 127.0.0.1
+                port: 9000
+              - host: 127.0.0.2
+                port: 9000
+        - internal_replication: true
+          replica:
+              - host: 127.0.0.3
+                port: 9000
+              - host: 127.0.0.4
+                port: 9000
+)YAML");
+    SCOPE_EXIT({ yaml_file->remove(); });
+
+    Poco::AutoPtr<Poco::XML::Document> xml = YAMLParser::parse(yaml_file->path());
+    auto *p_node = xml->getNodeByPath("/clickhouse");
+    EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG(<clickhouse>
+<test_cluster>
+<shard>
+<internal_replication>false</internal_replication>
+<replica>
+<host>127.0.0.1</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.2</host>
+<port>9000</port>
+</replica>
+</shard>
+<shard>
+<internal_replication>true</internal_replication>
+<replica>
+<host>127.0.0.3</host>
+<port>9000</port>
+</replica>
+<replica>
+<host>127.0.0.4</host>
+<port>9000</port>
+</replica>
+</shard>
+</test_cluster>
+</clickhouse>
+)CONFIG");
+
+}
+
 #endif
--- a/src/Compression/LZ4_decompress_faster.cpp
+++ b/src/Compression/LZ4_decompress_faster.cpp
@ -478,11 +478,7 @@ template <> void inline copyOverlap<32, true>(UInt8 * op, const UInt8 *& match,
 /// See also https://stackoverflow.com/a/30669632

 template <size_t copy_amount, bool use_shuffle>
-bool NO_INLINE decompressImpl(
-     const char * const source,
-     char * const dest,
-     size_t source_size,
-     size_t dest_size)
+bool NO_INLINE decompressImpl(const char * const source, char * const dest, size_t source_size, size_t dest_size)
 {
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
@ -515,6 +511,18 @@ bool NO_INLINE decompressImpl(

        const unsigned token = *ip++;
        length = token >> 4;
+
+        UInt8 * copy_end;
+        size_t real_length;
+
+        /// It might be true fairly often for well-compressed columns.
+        /// At the same time, it may hurt performance in other cases because this condition is hard to predict (especially if the number of zeros is ~50%).
+        /// In such cases this `if` will significantly increase the number of mispredicted instructions. But it seems to result in a
+        /// noticeable slowdown only for implementations with `copy_amount` > 8, probably because they use heavier instructions.
+        if constexpr (copy_amount == 8)
+            if (length == 0)
+                goto decompress_match;
+
        if (length == 0x0F)
        {
            if (unlikely(ip + 1 >= input_end))
@ -524,7 +532,7 @@ bool NO_INLINE decompressImpl(

        /// Copy literals.

-        UInt8 * copy_end = op + length;
+        copy_end = op + length;

        /// input: Hello, world
        ///        ^-ip
@ -541,7 +549,7 @@ bool NO_INLINE decompressImpl(
            return false;

        // Due to implementation specifics the copy length is always a multiple of copy_amount
-        size_t real_length = 0;
+        real_length = 0;

        static_assert(copy_amount == 8 || copy_amount == 16 || copy_amount == 32);
        if constexpr (copy_amount == 8)
@ -552,9 +560,9 @@ bool NO_INLINE decompressImpl(
            real_length = (((length >> 5) + 1) * 32);

        if (unlikely(ip + real_length >= input_end + ADDITIONAL_BYTES_AT_END_OF_BUFFER))
-             return false;
+            return false;

-        wildCopy<copy_amount>(op, ip, copy_end);    /// Here we can write up to copy_amount - 1 bytes after buffer.
+        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after buffer.

        if (copy_end == output_end)
            return true;
@ -562,6 +570,8 @@ bool NO_INLINE decompressImpl(
        ip += length;
        op = copy_end;

+    decompress_match:
+
        if (unlikely(ip + 1 >= input_end))
            return false;

--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -213,7 +213,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    \
    M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \
    \
-    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \
+    M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \
    M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \
    M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \
    M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \
--- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
+++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp
@ -44,15 +44,6 @@ struct AttributeConfiguration

 using AttributeNameToConfiguration = std::unordered_map<std::string, AttributeConfiguration>;

-/// Get value from field and convert it to string.
-/// Also remove quotes from strings.
-String getFieldAsString(const Field & field)
-{
-    if (field.getType() == Field::Types::Which::String)
-        return field.get<String>();
-    return applyVisitor(FieldVisitorToString(), field);
-}
-
 String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr)
 {
    if (!dict_attr->expression)
@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att
    /// EXPRESSION PROPERTY should be expression or string
    String expression_str;
    if (const auto * literal = dict_attr->expression->as<ASTLiteral>(); literal && literal->value.getType() == Field::Types::String)
-        expression_str = getFieldAsString(literal->value);
+        expression_str = convertFieldToString(literal->value);
    else
        expression_str = queryToString(dict_attr->expression);

@ -275,7 +266,7 @@ void buildSingleAttribute(
    AutoPtr<Element> null_value_element(doc->createElement("null_value"));
    String null_value_str;
    if (dict_attr->default_value)
-        null_value_str = getFieldAsString(dict_attr->default_value->as<ASTLiteral>()->value);
+        null_value_str = convertFieldToString(dict_attr->default_value->as<ASTLiteral>()->value);
    AutoPtr<Text> null_value(doc->createTextNode(null_value_str));
    null_value_element->appendChild(null_value);
    attribute_element->appendChild(null_value_element);
@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
        }
        else if (const auto * literal = pair->second->as<const ASTLiteral>())
        {
-            AutoPtr<Text> value(doc->createTextNode(getFieldAsString(literal->value)));
+            AutoPtr<Text> value(doc->createTextNode(convertFieldToString(literal->value)));
            current_xml_element->appendChild(value);
        }
        else if (const auto * list = pair->second->as<const ASTExpressionList>())
@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments(
            Field value;
            result->get(0, value);

-            AutoPtr<Text> text_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> text_value(doc->createTextNode(convertFieldToString(value)));
            current_xml_element->appendChild(text_value);
        }
        else
@ -519,7 +510,7 @@ void buildSourceConfiguration(
        {
            AutoPtr<Element> setting_change_element(doc->createElement(name));
            settings_element->appendChild(setting_change_element);
-            AutoPtr<Text> setting_value(doc->createTextNode(getFieldAsString(value)));
+            AutoPtr<Text> setting_value(doc->createTextNode(convertFieldToString(value)));
            setting_change_element->appendChild(setting_value);
        }
    }
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@ -239,7 +239,16 @@ public:
    }

    /// For one local path there might be multiple remote paths in case of Log family engines.
-    using LocalPathWithObjectStoragePaths = std::pair<String, StoredObjects>;
+    struct LocalPathWithObjectStoragePaths  /// One local path together with the objects stored for it in the object storage.
+     {
+         std::string local_path;  /// Local path the objects were looked up by.
+         std::string common_prefix_for_objects;  /// DiskObjectStorage passes its object storage root path here.
+         StoredObjects objects;  /// Objects backing `local_path`; there may be more than one.
+
+         LocalPathWithObjectStoragePaths(
+             const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_)  /// Both strings are copied, `objects_` is moved in.
+             : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_)) {}
+     };

    virtual void getRemotePathsRecursive(const String &, std::vector<LocalPathWithObjectStoragePaths> &)
    {
--- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp
+++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp
@ -29,7 +29,7 @@ ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage(
    size_t max_single_download_retries_,
    bool use_external_buffer_,
    size_t read_until_position_)
-    : ReadBufferFromFileBase(read_settings_.remote_fs_buffer_size, nullptr, 0)
+    : ReadBufferFromFileBase(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size, nullptr, 0)
    , blob_container_client(blob_container_client_)
    , path(path_)
    , max_single_read_retries(max_single_read_retries_)
--- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp
+++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp
@ -1,6 +1,7 @@
 #include "ReadIndirectBufferFromRemoteFS.h"

 #include <Disks/IO/ReadBufferFromRemoteFSGather.h>
+#include <IO/ReadSettings.h>


 namespace DB
@ -13,8 +14,8 @@ namespace ErrorCodes


 ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS(
-    std::shared_ptr<ReadBufferFromRemoteFSGather> impl_)
-    : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0)
+    std::shared_ptr<ReadBufferFromRemoteFSGather> impl_, const ReadSettings & settings)
+    : ReadBufferFromFileBase(settings.remote_fs_buffer_size, nullptr, 0)  /// The base now receives settings.remote_fs_buffer_size instead of the former DBMS_DEFAULT_BUFFER_SIZE.
     , impl(impl_)
 {
 }
--- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h
+++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h
@ -9,6 +9,7 @@ namespace DB
 {

 class ReadBufferFromRemoteFSGather;
+struct ReadSettings;

 /**
 * Reads data from S3/HDFS/Web using stored paths in metadata.
@ -18,7 +19,7 @@ class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
 {

 public:
-    explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr<ReadBufferFromRemoteFSGather> impl_);
+    explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr<ReadBufferFromRemoteFSGather> impl_, const ReadSettings & settings);

    off_t seek(off_t offset_, int whence) override;

--- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp
@ -112,7 +112,7 @@ std::unique_ptr<ReadBufferFromFileBase> AzureObjectStorage::readObjects( /// NOL
    }
    else
    {
-        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(reader_impl));
+        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(reader_impl), disk_read_settings);
        return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings_ptr->min_bytes_for_seek);
    }
 }
--- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp
@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::
    {
        try
        {
-            paths_map.emplace_back(local_path, getStorageObjects(local_path));
+            paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path));
        }
        catch (const Exception & e)
        {
@ -282,7 +282,10 @@ String DiskObjectStorage::getUniqueId(const String & path) const
 bool DiskObjectStorage::checkUniqueId(const String & id) const
 {
    if (!id.starts_with(object_storage_root_path))
+    {
+        LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}", id, object_storage_root_path);
        return false;
+    }

    auto object = StoredObject::create(*object_storage, id, {}, {}, true);
    return object_storage->exists(object);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
@ -68,6 +68,14 @@ void DiskObjectStorageMetadata::deserialize(ReadBuffer & buf)
    }
 }

+void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_)
+{  /// Fills the metadata so that it describes one object of `bytes_size` bytes stored at `relative_path`.
+    storage_objects.emplace_back(relative_path, bytes_size);  /// NOTE(review): existing entries are not cleared first - presumably called on empty metadata only, confirm.
+    total_size = bytes_size;  /// Assigned, not accumulated: the total equals the size of the single object.
+    ref_count = ref_count_;
+    read_only = is_read_only_;  /// Parameter name matches the declaration in DiskObjectStorageMetadata.h.
+}
+
 void DiskObjectStorageMetadata::deserializeFromString(const std::string & data)
 {
    ReadBufferFromString buf(data);
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
@ -50,6 +50,7 @@ public:

    void deserialize(ReadBuffer & buf);
    void deserializeFromString(const std::string & data);
+    void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_);

    void serialize(WriteBuffer & buf, bool sync) const;
    std::string serializeToString() const;
--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
@ -70,11 +70,12 @@ std::unique_ptr<ReadBufferFromFileBase> HDFSObjectStorage::readObjects( /// NOLI
        auto hdfs_path = path.substr(begin_of_path);
        auto hdfs_uri = path.substr(0, begin_of_path);

-        return std::make_unique<ReadBufferFromHDFS>(hdfs_uri, hdfs_path, config, disk_read_settings);
+        return std::make_unique<ReadBufferFromHDFS>(
+            hdfs_uri, hdfs_path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true);
    };

    auto hdfs_impl = std::make_unique<ReadBufferFromRemoteFSGather>(std::move(read_buffer_creator), objects, disk_read_settings);
-    auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(hdfs_impl));
+    auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(hdfs_impl), read_settings);
    return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings->min_bytes_for_seek);
 }

--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@ -31,6 +31,7 @@
 #include <Common/logger_useful.h>
 #include <Common/MultiVersion.h>

+
 namespace DB
 {

@ -55,7 +56,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
    if (!response.IsSuccess())
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType())), err.GetErrorType());
    }
 }

@ -69,7 +70,7 @@ void throwIfUnexpectedError(const Aws::Utils::Outcome<Result, Error> & response,
    if (!response.IsSuccess() && (!if_exists || !isNotFoundError(response.GetError().GetErrorType())))
    {
        const auto & err = response.GetError();
-        throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
+        throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
    }
 }

@ -90,7 +91,19 @@ void logIfError(const Aws::Utils::Outcome<Result, Error> & response, std::functi

 std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path */)
 {
-    return getRandomASCIIString(32);
+    /// Builds a random key for a new S3 object; the local path argument is not used.
+
+    /// The key contains 32 random characters in total for enough randomness.
+    /// First 3 characters are used as a prefix for
+    /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/
+
+    constexpr size_t key_name_total_size = 32;
+    constexpr size_t key_name_prefix_size = 3;
+
+    /// The result is "<3 random characters>/<29 random characters>": the '/' separator comes on top of the 32.
+    return fmt::format("{}/{}",
+        getRandomASCIIString(key_name_prefix_size),
+        getRandomASCIIString(key_name_total_size - key_name_prefix_size));
 }

 Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const
@ -157,7 +170,7 @@ std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
    }
    else
    {
-        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(s3_impl));
+        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(s3_impl), disk_read_settings);
        return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings_ptr->min_bytes_for_seek);
    }
 }
--- a/src/Disks/ObjectStorages/S3/diskSettings.cpp
+++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp
@ -116,7 +116,8 @@ std::unique_ptr<Aws::S3::S3Client> getClient(const Poco::Util::AbstractConfigura
    S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
        config.getString(config_prefix + ".region", ""),
        context->getRemoteHostFilter(), context->getGlobalContext()->getSettingsRef().s3_max_redirects,
-        context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging);
+        context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging,
+        /* for_disk_s3 = */ true);

    S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint")));
    if (uri.key.back() != '/')
--- a/src/Disks/ObjectStorages/StoredObject.h
+++ b/src/Disks/ObjectStorages/StoredObject.h
@ -3,6 +3,7 @@
 #include <string>
 #include <Disks/ObjectStorages/IObjectStorage_fwd.h>

+
 namespace DB
 {

--- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp
@ -188,7 +188,7 @@ std::unique_ptr<ReadBufferFromFileBase> WebObjectStorage::readObject( /// NOLINT
    }
    else
    {
-        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(web_impl));
+        auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(web_impl), read_settings);
        return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), min_bytes_for_seek);
    }
 }
--- a/src/Functions/IFunction.h
+++ b/src/Functions/IFunction.h
@ -171,7 +171,7 @@ public:
      */
    virtual bool isSuitableForConstantFolding() const { return true; }

-    /** If function isSuitableForConstantFolding then, this method will be called during query analyzis
+    /** If the function isSuitableForConstantFolding, then this method will be called during query analysis
      * if some arguments are constants. For example logical functions (AndFunction, OrFunction) can
      * return they result based on some constant arguments.
      * Arguments are passed without modifications, useDefaultImplementationForNulls, useDefaultImplementationForNothing,
@ -394,7 +394,7 @@ private:
 using FunctionOverloadResolverPtr = std::shared_ptr<IFunctionOverloadResolver>;

 /// Old function interface. Check documentation in IFunction.h.
-/// If client do not need statefull properties it can implement this interface.
+/// If the client does not need stateful properties, it can implement this interface.
 class IFunction
 {
 public:
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@ -34,6 +34,7 @@ namespace ErrorCodes
    extern const int CANNOT_SEEK_THROUGH_FILE;
    extern const int SEEK_POSITION_OUT_OF_BOUND;
    extern const int LOGICAL_ERROR;
+    extern const int CANNOT_ALLOCATE_MEMORY;
 }


@ -48,7 +49,7 @@ ReadBufferFromS3::ReadBufferFromS3(
    size_t offset_,
    size_t read_until_position_,
    bool restricted_seek_)
-    : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0)
+    : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0)
    , client_ptr(std::move(client_ptr_))
    , bucket(bucket_)
    , key(key_)
@ -136,6 +137,23 @@ bool ReadBufferFromS3::nextImpl()
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);

+            if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
+            {
+                /// It doesn't make sense to retry Access Denied or No Such Key
+                if (!s3_exception->isRetryableError())
+                {
+                    tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
+                    throw;
+                }
+            }
+
+            /// It doesn't make sense to retry allocator errors
+            if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY)
+            {
+                tryLogCurrentException(log);
+                throw;
+            }
+
            LOG_DEBUG(
                log,
                "Caught exception while reading S3 object. Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}",
@ -306,7 +324,10 @@ std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
        return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), buffer_size);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+    {
+        const auto & error = outcome.GetError();
+        throw S3Exception(error.GetMessage(), error.GetErrorType());
+    }
 }

 SeekableReadBufferPtr ReadBufferS3Factory::getReader()
--- a/src/IO/S3/PocoHTTPClient.cpp
+++ b/src/IO/S3/PocoHTTPClient.cpp
@ -42,6 +42,18 @@ namespace ProfileEvents
    extern const Event S3WriteRequestsErrors;
    extern const Event S3WriteRequestsThrottling;
    extern const Event S3WriteRequestsRedirects;
+
+    extern const Event DiskS3ReadMicroseconds;
+    extern const Event DiskS3ReadRequestsCount;
+    extern const Event DiskS3ReadRequestsErrors;
+    extern const Event DiskS3ReadRequestsThrottling;
+    extern const Event DiskS3ReadRequestsRedirects;
+
+    extern const Event DiskS3WriteMicroseconds;
+    extern const Event DiskS3WriteRequestsCount;
+    extern const Event DiskS3WriteRequestsErrors;
+    extern const Event DiskS3WriteRequestsThrottling;
+    extern const Event DiskS3WriteRequestsRedirects;
 }

 namespace CurrentMetrics
@ -62,11 +74,13 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration(
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
-        bool enable_s3_requests_logging_)
+        bool enable_s3_requests_logging_,
+        bool for_disk_s3_)
    : force_region(force_region_)
    , remote_host_filter(remote_host_filter_)
    , s3_max_redirects(s3_max_redirects_)
    , enable_s3_requests_logging(enable_s3_requests_logging_)
+    , for_disk_s3(for_disk_s3_)
 {
 }

@ -112,6 +126,7 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_config
    , remote_host_filter(client_configuration.remote_host_filter)
    , s3_max_redirects(client_configuration.s3_max_redirects)
    , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging)
+    , for_disk_s3(client_configuration.for_disk_s3)
    , extra_headers(client_configuration.extra_headers)
 {
 }
@ -176,6 +191,46 @@ namespace
    }
 }

+PocoHTTPClient::S3MetricKind PocoHTTPClient::getMetricKind(const Aws::Http::HttpRequest & request)
+{  /// Classifies a request as Read or Write by its HTTP method.
+    switch (request.GetMethod())
+    {
+        case Aws::Http::HttpMethod::HTTP_GET:
+        case Aws::Http::HttpMethod::HTTP_HEAD:
+            return S3MetricKind::Read;  /// GET and HEAD are counted as reads.
+        case Aws::Http::HttpMethod::HTTP_POST:
+        case Aws::Http::HttpMethod::HTTP_DELETE:
+        case Aws::Http::HttpMethod::HTTP_PUT:
+        case Aws::Http::HttpMethod::HTTP_PATCH:
+            return S3MetricKind::Write;  /// POST, DELETE, PUT and PATCH are counted as writes.
+    }
+    throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED);  /// Reached only when the method matched none of the cases above.
+}
+
+void PocoHTTPClient::addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount) const
+{  /// Adds `amount` to the profile event selected by the metric type and by the read/write kind of the request.
+    const ProfileEvents::Event events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {  /// Rows follow the S3MetricType order, columns are {Read, Write}.
+        {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds},
+        {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount},
+        {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors},
+        {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling},
+        {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects},
+    };  /// NOTE(review): dimensions are explicit, so a missing row would be value-initialized silently; the static_assert of the previous lambda version is gone.
+
+    const ProfileEvents::Event disk_s3_events_map[static_cast<size_t>(S3MetricType::EnumSize)][static_cast<size_t>(S3MetricKind::EnumSize)] = {  /// Same layout with the DiskS3* counterparts.
+        {ProfileEvents::DiskS3ReadMicroseconds, ProfileEvents::DiskS3WriteMicroseconds},
+        {ProfileEvents::DiskS3ReadRequestsCount, ProfileEvents::DiskS3WriteRequestsCount},
+        {ProfileEvents::DiskS3ReadRequestsErrors, ProfileEvents::DiskS3WriteRequestsErrors},
+        {ProfileEvents::DiskS3ReadRequestsThrottling, ProfileEvents::DiskS3WriteRequestsThrottling},
+        {ProfileEvents::DiskS3ReadRequestsRedirects, ProfileEvents::DiskS3WriteRequestsRedirects},
+    };
+
+    S3MetricKind kind = getMetricKind(request);  /// Throws for an unsupported HTTP method.
+
+    ProfileEvents::increment(events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);  /// The generic S3 event is always incremented.
+    if (for_disk_s3)
+        ProfileEvents::increment(disk_s3_events_map[static_cast<unsigned int>(type)][static_cast<unsigned int>(kind)], amount);  /// The DiskS3 event is incremented in addition, not instead.
+}

 void PocoHTTPClient::makeRequestInternal(
    Aws::Http::HttpRequest & request,
@ -189,45 +244,7 @@ void PocoHTTPClient::makeRequestInternal(
    if (enable_s3_requests_logging)
        LOG_TEST(log, "Make request to: {}", uri);

-    enum class S3MetricType
-    {
-        Microseconds,
-        Count,
-        Errors,
-        Throttling,
-        Redirects,
-
-        EnumSize,
-    };
-
-    auto select_metric = [&request](S3MetricType type)
-    {
-        const ProfileEvents::Event events_map[][2] = {
-            {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds},
-            {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount},
-            {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors},
-            {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling},
-            {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects},
-        };
-
-        static_assert((sizeof(events_map) / sizeof(events_map[0])) == static_cast<unsigned int>(S3MetricType::EnumSize));
-
-        switch (request.GetMethod())
-        {
-            case Aws::Http::HttpMethod::HTTP_GET:
-            case Aws::Http::HttpMethod::HTTP_HEAD:
-                return events_map[static_cast<unsigned int>(type)][0]; // Read
-            case Aws::Http::HttpMethod::HTTP_POST:
-            case Aws::Http::HttpMethod::HTTP_DELETE:
-            case Aws::Http::HttpMethod::HTTP_PUT:
-            case Aws::Http::HttpMethod::HTTP_PATCH:
-                return events_map[static_cast<unsigned int>(type)][1]; // Write
-        }
-
-        throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED);
-    };
-
-    ProfileEvents::increment(select_metric(S3MetricType::Count));
+    addMetric(request, S3MetricType::Count);
    CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests};

    try
@ -334,7 +351,7 @@ void PocoHTTPClient::makeRequestInternal(
            auto & response_body_stream = session->receiveResponse(poco_response);

            watch.stop();
-            ProfileEvents::increment(select_metric(S3MetricType::Microseconds), watch.elapsedMicroseconds());
+            addMetric(request, S3MetricType::Microseconds, watch.elapsedMicroseconds());

            int status_code = static_cast<int>(poco_response.getStatus());

@ -349,7 +366,7 @@ void PocoHTTPClient::makeRequestInternal(
                if (enable_s3_requests_logging)
                    LOG_TEST(log, "Redirecting request to new location: {}", location);

-                ProfileEvents::increment(select_metric(S3MetricType::Redirects));
+                addMetric(request, S3MetricType::Redirects);

                continue;
            }
@ -387,7 +404,7 @@ void PocoHTTPClient::makeRequestInternal(
                    LOG_WARNING(log, "Response for request contain <Error> tag in body, settings internal server error (500 code)");
                    response->SetResponseCode(Aws::Http::HttpResponseCode::INTERNAL_SERVER_ERROR);

-                    ProfileEvents::increment(select_metric(S3MetricType::Errors));
+                    addMetric(request, S3MetricType::Errors);
                    if (error_report)
                        error_report(request_configuration);

@ -401,11 +418,11 @@ void PocoHTTPClient::makeRequestInternal(

                if (status_code == 429 || status_code == 503)
                { // API throttling
-                    ProfileEvents::increment(select_metric(S3MetricType::Throttling));
+                    addMetric(request, S3MetricType::Throttling);
                }
                else if (status_code >= 300)
                {
-                    ProfileEvents::increment(select_metric(S3MetricType::Errors));
+                    addMetric(request, S3MetricType::Errors);
                    if (status_code >= 500 && error_report)
                        error_report(request_configuration);
                }
@ -423,7 +440,7 @@ void PocoHTTPClient::makeRequestInternal(
        response->SetClientErrorType(Aws::Client::CoreErrors::NETWORK_CONNECTION);
        response->SetClientErrorMessage(getCurrentExceptionMessage(false));

-        ProfileEvents::increment(select_metric(S3MetricType::Errors));
+        addMetric(request, S3MetricType::Errors);
    }
 }

--- a/src/IO/S3/PocoHTTPClient.h
+++ b/src/IO/S3/PocoHTTPClient.h
@ -44,6 +44,7 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration
    const RemoteHostFilter & remote_host_filter;
    unsigned int s3_max_redirects;
    bool enable_s3_requests_logging;
+    bool for_disk_s3;
    HeaderCollection extra_headers;

    void updateSchemeAndRegion();
@ -55,7 +56,8 @@ private:
        const String & force_region_,
        const RemoteHostFilter & remote_host_filter_,
        unsigned int s3_max_redirects_,
-        bool enable_s3_requests_logging_
+        bool enable_s3_requests_logging_,
+        bool for_disk_s3_
    );

    /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization.
@ -113,18 +115,42 @@ public:
        Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const override;

 private:
+
    void makeRequestInternal(
        Aws::Http::HttpRequest & request,
        std::shared_ptr<PocoHTTPResponse> & response,
        Aws::Utils::RateLimits::RateLimiterInterface * readLimiter,
        Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const;

+    enum class S3MetricType  /// Selects the row of the event tables in addMetric; the enumerator order must match the table rows.
+    {
+        Microseconds,  /// Elapsed time of a request, added after the response is received.
+        Count,  /// Incremented once at the start of makeRequestInternal.
+        Errors,  /// Failed requests: error responses and caught exceptions.
+        Throttling,  /// Responses with status code 429 or 503.
+        Redirects,  /// Requests that were redirected to a new location.
+
+        EnumSize,  /// Number of metric types; used as the first dimension of the event tables.
+    };
+
+    enum class S3MetricKind  /// Selects the column of the event tables in addMetric.
+    {
+        Read,  /// Returned by getMetricKind for GET and HEAD.
+        Write,  /// Returned by getMetricKind for POST, DELETE, PUT and PATCH.
+
+        EnumSize,  /// Number of kinds; used as the second dimension of the event tables.
+    };
+
+    static S3MetricKind getMetricKind(const Aws::Http::HttpRequest & request);
+    void addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount = 1) const;
+
    std::function<ClientConfigurationPerRequest(const Aws::Http::HttpRequest &)> per_request_configuration;
    std::function<void(const ClientConfigurationPerRequest &)> error_report;
    ConnectionTimeouts timeouts;
    const RemoteHostFilter & remote_host_filter;
    unsigned int s3_max_redirects;
    bool enable_s3_requests_logging;
+    bool for_disk_s3;
    const HeaderCollection extra_headers;
 };

--- a/src/IO/S3/tests/gtest_aws_s3_client.cpp
+++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp
@ -87,7 +87,8 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders)
        region,
        remote_host_filter,
        s3_max_redirects,
-        enable_s3_requests_logging
+        enable_s3_requests_logging,
+        /* for_disk_s3 = */ false
    );

    client_configuration.endpointOverride = uri.endpoint;
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@ -35,6 +35,26 @@

 #    include <fstream>

+namespace DB
+{
+
+bool S3Exception::isRetryableError() const
+{
+    /// Errors for which a retry cannot help. This list is quite conservative; add more codes if needed.
+    static const std::unordered_set<Aws::S3::S3Errors> unretryable_errors = {
+        Aws::S3::S3Errors::NO_SUCH_KEY,
+        Aws::S3::S3Errors::ACCESS_DENIED,
+        Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID,
+        Aws::S3::S3Errors::INVALID_SIGNATURE,
+        Aws::S3::S3Errors::NO_SUCH_UPLOAD,
+        Aws::S3::S3Errors::NO_SUCH_BUCKET,
+    };
+
+    return !unretryable_errors.contains(code);  /// Every error type not listed above is treated as retryable.
+}
+
+}
+
 namespace
 {

@ -543,7 +563,7 @@ public:
            /// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be
            /// quite verbose even if nobody configured them. So we use our provider first and only after it use default providers.
            {
-                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging);
+                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3);
                AddProvider(std::make_shared<AwsAuthSTSAssumeRoleWebIdentityCredentialsProvider>(aws_client_configuration));
            }

@ -580,7 +600,7 @@ public:
            }
            else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true")
            {
-                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging);
+                DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3);

                /// See MakeDefaultHttpResourceClientConfiguration().
                /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside
@ -700,9 +720,10 @@ namespace S3
        const String & force_region,
        const RemoteHostFilter & remote_host_filter,
        unsigned int s3_max_redirects,
-        bool enable_s3_requests_logging)
+        bool enable_s3_requests_logging,
+        bool for_disk_s3)
    {
-        return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging);
+        return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging, for_disk_s3);
    }

    URI::URI(const Poco::URI & uri_)
--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@ -7,23 +7,62 @@
 #include <base/types.h>
 #include <aws/core/Aws.h>
 #include <aws/core/client/ClientConfiguration.h>
+#include <aws/s3/S3Errors.h>
 #include <IO/S3/PocoHTTPClient.h>
 #include <Poco/URI.h>

+#include <Common/Exception.h>
+
 namespace Aws::S3
 {
    class S3Client;
 }

+
 namespace DB
 {
-    class RemoteHostFilter;
-    struct HttpHeader;
-    using HeaderCollection = std::vector<HttpHeader>;
+namespace ErrorCodes
+{
+    extern const int S3_ERROR;
 }

+class RemoteHostFilter;
+struct HttpHeader;
+using HeaderCollection = std::vector<HttpHeader>;
+
+class S3Exception : public Exception  /// Exception with code ErrorCodes::S3_ERROR that additionally keeps the AWS S3 error type.
+{
+public:
+
+    // Format message with fmt::format, like the logging functions.
+    template <typename... Args>
+    S3Exception(Aws::S3::S3Errors code_, fmt::format_string<Args...> fmt, Args &&... args)  /// Note the argument order: the S3 error type comes first here.
+        : Exception(fmt::format(fmt, std::forward<Args>(args)...), ErrorCodes::S3_ERROR)
+        , code(code_)
+    {
+    }
+
+    S3Exception(const std::string & msg, Aws::S3::S3Errors code_)  /// Ready-made message; here the S3 error type comes last.
+        : Exception(msg, ErrorCodes::S3_ERROR)
+        , code(code_)
+    {}
+
+    Aws::S3::S3Errors getS3ErrorCode() const  /// Returns the S3 error type passed to the constructor.
+    {
+        return code;
+    }
+
+    bool isRetryableError() const;  /// Defined in S3Common.cpp: false for a fixed set of error types, true otherwise.
+
+private:
+    const Aws::S3::S3Errors code;  /// AWS S3 error type; set once in the constructor.
+};
+}
+
+
 namespace DB::S3
 {
+
 class ClientFactory
 {
 public:
@ -45,7 +84,8 @@ public:
        const String & force_region,
        const RemoteHostFilter & remote_host_filter,
        unsigned int s3_max_redirects,
-        bool enable_s3_requests_logging);
+        bool enable_s3_requests_logging,
+        bool for_disk_s3);

 private:
    ClientFactory();
--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@ -8,6 +8,7 @@

 #include <IO/WriteBufferFromS3.h>
 #include <IO/WriteHelpers.h>
+#include <IO/S3Common.h>
 #include <Interpreters/Context.h>

 #include <aws/s3/S3Client.h>
@ -173,7 +174,9 @@ void WriteBufferFromS3::finalizeImpl()
        auto response = client_ptr->HeadObject(request);

        if (!response.IsSuccess())
-            throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket);
+            throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
+        else
+            LOG_TRACE(log, "Object {} exists after upload", key);
    }
 }

@ -197,7 +200,7 @@ void WriteBufferFromS3::createMultipartUpload()
        LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id);
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());
 }

 void WriteBufferFromS3::writePart()
@ -309,7 +312,7 @@ void WriteBufferFromS3::processUploadRequest(UploadPartTask & task)
        LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size());
    }
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType());

    total_parts_uploaded++;
 }
@ -343,9 +346,10 @@ void WriteBufferFromS3::completeMultipartUpload()
        LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size());
    else
    {
-        throw Exception(ErrorCodes::S3_ERROR, "{} Tags:{}",
-            outcome.GetError().GetMessage(),
-            fmt::join(tags.begin(), tags.end(), " "));
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Tags: {}",
+            outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " "));
    }
 }

@ -430,7 +434,10 @@ void WriteBufferFromS3::processPutRequest(const PutObjectTask & task)
    if (outcome.IsSuccess())
        LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool);
    else
-        throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
+        throw S3Exception(
+            outcome.GetError().GetErrorType(),
+            "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}",
+            outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool);
 }

 void WriteBufferFromS3::waitForReadyBackGroundTasks()
--- a/src/Interpreters/InterpreterDeleteQuery.cpp
+++ b/src/Interpreters/InterpreterDeleteQuery.cpp
@ -21,7 +21,6 @@ namespace DB

 namespace ErrorCodes
 {
-    extern const int BAD_ARGUMENTS;
    extern const int TABLE_IS_READ_ONLY;
    extern const int SUPPORT_IS_DISABLED;
 }
@ -34,11 +33,6 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex

 BlockIO InterpreterDeleteQuery::execute()
 {
-    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
-    {
-        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
-    }
-
    FunctionNameNormalizer().visit(query_ptr.get());
    const ASTDeleteQuery & delete_query = query_ptr->as<ASTDeleteQuery &>();
    auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary);
@ -49,10 +43,6 @@ BlockIO InterpreterDeleteQuery::execute()

    /// First check table storage for validations.
    StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
-    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
-    if (!merge_tree)
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree family tables are supported");
-
    checkStorageSupportsTransactionsIfNeeded(table, getContext());
    if (table->isStaticStorage())
        throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only");
@ -69,6 +59,27 @@ BlockIO InterpreterDeleteQuery::execute()
    auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout);
    auto metadata_snapshot = table->getInMemoryMetadataPtr();

+    auto merge_tree = std::dynamic_pointer_cast<MergeTreeData>(table);
+    if (!merge_tree)
+    {
+        /// Convert to MutationCommand
+        MutationCommands mutation_commands;
+        MutationCommand mut_command;
+
+        mut_command.type = MutationCommand::Type::DELETE;
+        mut_command.predicate = delete_query.predicate;
+
+        mutation_commands.emplace_back(mut_command);
+
+        table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef());
+        MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate();
+        table->mutate(mutation_commands, getContext());
+        return {};
+    }
+
+    if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete)
+        throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. Set `allow_experimental_lightweight_delete` setting to enable it");
+
    /// Convert to MutationCommand
    MutationCommands mutation_commands;
    MutationCommand mut_command;
--- a/src/Interpreters/InterpreterSystemQuery.cpp
+++ b/src/Interpreters/InterpreterSystemQuery.cpp
@ -753,7 +753,7 @@ bool InterpreterSystemQuery::dropReplicaImpl(ASTSystemQuery & query, const Stora
                        "if you want to clean the data and drop this replica", ErrorCodes::TABLE_WAS_NOT_DROPPED);

    /// NOTE it's not atomic: replica may become active after this check, but before dropReplica(...)
-    /// However, the main usecase is to drop dead replica, which cannot become active.
+    /// However, the main use case is to drop dead replica, which cannot become active.
    /// This check prevents only from accidental drop of some other replica.
    if (zookeeper->exists(status.zookeeper_path + "/replicas/" + query.replica + "/is_active"))
        throw Exception("Can't drop replica: " + query.replica + ", because it's active",
--- a/src/Interpreters/MutationsInterpreter.cpp
+++ b/src/Interpreters/MutationsInterpreter.cpp
@ -226,7 +226,7 @@ bool isStorageTouchedByMutations(
    ASTPtr select_query = prepareQueryAffectedAST(commands, storage, context_copy);

    /// Interpreter must be alive, when we use result of execute() method.
-    /// For some reason it may copy context and and give it into ExpressionTransform
+    /// For some reason it may copy context and give it into ExpressionTransform
    /// after that we will use context from destroyed stack frame in our stream.
    InterpreterSelectQuery interpreter(
        select_query, context_copy, storage, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections());
@ -288,13 +288,17 @@ MutationsInterpreter::MutationsInterpreter(
    const StorageMetadataPtr & metadata_snapshot_,
    MutationCommands commands_,
    ContextPtr context_,
-    bool can_execute_)
+    bool can_execute_,
+    bool return_all_columns_,
+    bool return_deleted_rows_)
    : storage(std::move(storage_))
    , metadata_snapshot(metadata_snapshot_)
    , commands(std::move(commands_))
    , context(Context::createCopy(context_))
    , can_execute(can_execute_)
    , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections())
+    , return_all_columns(return_all_columns_)
+    , return_deleted_rows(return_deleted_rows_)
 {
    mutation_ast = prepare(!can_execute);
 }
@ -472,14 +476,21 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run)
    /// First, break a sequence of commands into stages.
    for (auto & command : commands)
    {
+        // we can return deleted rows only if it's the only present command
+        assert(command.type == MutationCommand::DELETE || !return_deleted_rows);
+
        if (command.type == MutationCommand::DELETE)
        {
            mutation_kind.set(MutationKind::MUTATE_OTHER);
            if (stages.empty() || !stages.back().column_to_updated.empty())
                stages.emplace_back(context);

-            auto negated_predicate = makeASTFunction("isZeroOrNull", getPartitionAndPredicateExpressionForMutationCommand(command));
-            stages.back().filters.push_back(negated_predicate);
+            auto predicate  = getPartitionAndPredicateExpressionForMutationCommand(command);
+
+            if (!return_deleted_rows)
+                predicate = makeASTFunction("isZeroOrNull", predicate);
+
+            stages.back().filters.push_back(predicate);
        }
        else if (command.type == MutationCommand::UPDATE)
        {
@ -789,7 +800,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector<Stage> &
    /// Next, for each stage calculate columns changed by this and previous stages.
    for (size_t i = 0; i < prepared_stages.size(); ++i)
    {
-        if (!prepared_stages[i].filters.empty())
+        if (return_all_columns || !prepared_stages[i].filters.empty())
        {
            for (const auto & column : all_columns)
                prepared_stages[i].output_columns.insert(column.name);
--- a/src/Interpreters/MutationsInterpreter.h
+++ b/src/Interpreters/MutationsInterpreter.h
@ -43,7 +43,9 @@ public:
        const StorageMetadataPtr & metadata_snapshot_,
        MutationCommands commands_,
        ContextPtr context_,
-        bool can_execute_);
+        bool can_execute_,
+        bool return_all_columns_ = false,
+        bool return_deleted_rows_ = false);

    void validate();

@ -156,6 +158,12 @@ private:

    /// Columns, that we need to read for calculation of skip indices, projections or TTL expressions.
    ColumnDependencies dependencies;
+
+    // whether all columns should be returned, not just updated
+    bool return_all_columns;
+
+    // whether we should return deleted or nondeleted rows on DELETE mutation
+    bool return_deleted_rows;
 };

 }
--- a/src/Interpreters/TreeCNFConverter.cpp
+++ b/src/Interpreters/TreeCNFConverter.cpp
@ -349,7 +349,7 @@ CNFQuery & CNFQuery::pullNotOutFunctions()
    return *this;
 }

-CNFQuery & CNFQuery::pushNotInFuntions()
+CNFQuery & CNFQuery::pushNotInFunctions()
 {
    transformAtoms([](const AtomicFormula & atom) -> AtomicFormula
                   {
--- a/src/Interpreters/TreeCNFConverter.h
+++ b/src/Interpreters/TreeCNFConverter.h
@ -133,7 +133,7 @@ public:
    /// Converts != -> NOT =; <,>= -> (NOT) <; >,<= -> (NOT) <= for simpler matching
    CNFQuery & pullNotOutFunctions();
    /// Revert pullNotOutFunctions actions
-    CNFQuery & pushNotInFuntions();
+    CNFQuery & pushNotInFunctions();

    /// (a OR b OR ...) AND (NOT a OR b OR ...) -> (b OR ...)
    CNFQuery & reduce();
--- a/src/Interpreters/TreeOptimizer.cpp
+++ b/src/Interpreters/TreeOptimizer.cpp
@ -154,7 +154,7 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context)
                    continue;
                }
            }
-            /// don't optimise functions that shadow any of it's arguments, e.g.:
+            /// don't optimize functions that shadow any of its arguments, e.g.:
            /// SELECT toString(dummy) as dummy FROM system.one GROUP BY dummy;
            if (!function->alias.empty())
            {
@ -632,7 +632,7 @@ bool convertQueryToCNF(ASTSelectQuery * select_query)
        if (!cnf_form)
            return false;

-        cnf_form->pushNotInFuntions();
+        cnf_form->pushNotInFunctions();
        select_query->refWhere() = TreeCNFConverter::fromCNF(*cnf_form);
        return true;
    }
--- a/src/Interpreters/TreeRewriter.h
+++ b/src/Interpreters/TreeRewriter.h
@ -99,7 +99,7 @@ using TreeRewriterResultPtr = std::shared_ptr<const TreeRewriterResult>;

 /// Tree Rewriter in terms of CMU slides @sa https://15721.courses.cs.cmu.edu/spring2020/slides/19-optimizer1.pdf
 ///
-/// Optimises AST tree and collect information for further expression analysis in ExpressionAnalyzer.
+/// Optimizes AST tree and collects information for further expression analysis in ExpressionAnalyzer.
 /// Result AST has the following invariants:
 ///  * all aliases are substituted
 ///  * qualified names are translated
--- a/src/Interpreters/WhereConstraintsOptimizer.cpp
+++ b/src/Interpreters/WhereConstraintsOptimizer.cpp
@ -170,7 +170,7 @@ void WhereConstraintsOptimizer::perform()
                return replaceTermsToConstants(atom, compare_graph);
            })
            .reduce()
-            .pushNotInFuntions();
+            .pushNotInFunctions();

        if (optimize_append_index)
            AddIndexConstraintsOptimizer(metadata_snapshot).perform(cnf);
--- a/src/Loggers/OwnSplitChannel.cpp
+++ b/src/Loggers/OwnSplitChannel.cpp
@ -46,6 +46,8 @@ void OwnSplitChannel::log(const Poco::Message & msg)

 void OwnSplitChannel::tryLogSplit(const Poco::Message & msg)
 {
+    LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global);
+
    try
    {
        logSplit(msg);
@ -62,8 +64,6 @@ void OwnSplitChannel::tryLogSplit(const Poco::Message & msg)
    /// but let's log it into the stderr at least.
    catch (...)
    {
-        LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global);
-
        const std::string & exception_message = getCurrentExceptionMessage(true);
        const std::string & message = msg.getText();

--- a/src/Parsers/ParserSelectQuery.cpp
+++ b/src/Parsers/ParserSelectQuery.cpp
@ -224,8 +224,6 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
            select_query->group_by_with_rollup = true;
        else if (s_cube.ignore(pos, expected))
            select_query->group_by_with_cube = true;
-        else if (s_grouping_sets.ignore(pos, expected))
-            select_query->group_by_with_grouping_sets = true;
        else if (s_totals.ignore(pos, expected))
            select_query->group_by_with_totals = true;
        else
--- a/src/Processors/QueryPlan/AggregatingStep.cpp
+++ b/src/Processors/QueryPlan/AggregatingStep.cpp
@ -251,14 +251,17 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                outputs.push_back(grouping_node);

                const auto & missing_columns = grouping_sets_params[set_counter].missing_keys;
+                const auto & used_keys = grouping_sets_params[set_counter].used_keys;

                auto to_nullable_function = FunctionFactory::instance().get("toNullable", nullptr);
                for (size_t i = 0; i < output_header.columns(); ++i)
                {
                    auto & col = output_header.getByPosition(i);
-                    const auto it = std::find_if(
+                    const auto missing_it = std::find_if(
                        missing_columns.begin(), missing_columns.end(), [&](const auto & missing_col) { return missing_col == col.name; });
-                    if (it != missing_columns.end())
+                    const auto used_it = std::find_if(
+                        used_keys.begin(), used_keys.end(), [&](const auto & used_col) { return used_col == col.name; });
+                    if (missing_it != missing_columns.end())
                    {
                        auto column_with_default = col.column->cloneEmpty();
                        col.type->insertDefaultInto(*column_with_default);
@ -270,7 +273,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
                    else
                    {
                        const auto * column_node = dag->getOutputs()[header.getPositionByName(col.name)];
-                        if (group_by_use_nulls && column_node->result_type->canBeInsideNullable())
+                        if (used_it != used_keys.end() && group_by_use_nulls && column_node->result_type->canBeInsideNullable())
                            outputs.push_back(&dag->addFunction(to_nullable_function, { column_node }, col.name));
                        else
                            outputs.push_back(column_node);
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool(
        sum_marks,
        min_marks_for_concurrent_read,
        std::move(parts_with_range),
-        data,
        storage_snapshot,
        prewhere_info,
        required_columns,
--- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp
+++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp
@ -41,8 +41,9 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<S
        const std::string & hdfs_file_path_,
        const Poco::Util::AbstractConfiguration & config_,
        const ReadSettings & read_settings_,
-        size_t read_until_position_)
-        : BufferWithOwnMemory<SeekableReadBuffer>(read_settings_.remote_fs_buffer_size)
+        size_t read_until_position_,
+        bool use_external_buffer_)
+        : BufferWithOwnMemory<SeekableReadBuffer>(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size)
        , hdfs_uri(hdfs_uri_)
        , hdfs_file_path(hdfs_file_path_)
        , builder(createHDFSBuilder(hdfs_uri_, config_))
@ -132,10 +133,12 @@ ReadBufferFromHDFS::ReadBufferFromHDFS(
        const String & hdfs_file_path_,
        const Poco::Util::AbstractConfiguration & config_,
        const ReadSettings & read_settings_,
-        size_t read_until_position_)
+        size_t read_until_position_,
+        bool use_external_buffer_)
    : ReadBufferFromFileBase(read_settings_.remote_fs_buffer_size, nullptr, 0)
    , impl(std::make_unique<ReadBufferFromHDFSImpl>(
-               hdfs_uri_, hdfs_file_path_, config_, read_settings_, read_until_position_))
+               hdfs_uri_, hdfs_file_path_, config_, read_settings_, read_until_position_, use_external_buffer_))
+    , use_external_buffer(use_external_buffer_)
 {
 }

@ -146,7 +149,18 @@ size_t ReadBufferFromHDFS::getFileSize()

 bool ReadBufferFromHDFS::nextImpl()
 {
-    impl->position() = impl->buffer().begin() + offset();
+    if (use_external_buffer)
+    {
+        impl->set(internal_buffer.begin(), internal_buffer.size());
+        assert(working_buffer.begin() != nullptr);
+        assert(!internal_buffer.empty());
+    }
+    else
+    {
+        impl->position() = impl->buffer().begin() + offset();
+        assert(!impl->hasPendingData());
+    }
+
    auto result = impl->next();

    if (result)
--- a/src/Storages/HDFS/ReadBufferFromHDFS.h
+++ b/src/Storages/HDFS/ReadBufferFromHDFS.h
@ -29,7 +29,8 @@ public:
        const String & hdfs_file_path_,
        const Poco::Util::AbstractConfiguration & config_,
        const ReadSettings & read_settings_,
-        size_t read_until_position_ = 0);
+        size_t read_until_position_ = 0,
+        bool use_external_buffer = false);

    ~ReadBufferFromHDFS() override;

@ -49,6 +50,7 @@ public:

 private:
    std::unique_ptr<ReadBufferFromHDFSImpl> impl;
+    bool use_external_buffer;
 };
 }

--- a/src/Storages/MergeTree/AlterConversions.h
+++ b/src/Storages/MergeTree/AlterConversions.h
@ -0,0 +1,24 @@
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+
+namespace DB
+{
+
+/// Alter conversions which should be applied on the fly for a part. Built from
+/// the most recent mutation commands for the part. For now it holds only rename_map
+/// (from the ALTER_RENAME command), because for all other types of alters
+/// we can deduce conversions for the part from the difference between
+/// part->getColumns() and storage->getColumns().
+struct AlterConversions
+{
+    /// Rename map: new_name -> old_name. getColumnOldName() uses at(), so it throws for a name that is not in the map.
+    std::unordered_map<std::string, std::string> rename_map;
+
+    bool isColumnRenamed(const std::string & new_name) const { return rename_map.count(new_name) > 0; }
+    std::string getColumnOldName(const std::string & new_name) const { return rename_map.at(new_name); }
+};
+
+}
--- a/src/Storages/MergeTree/DataPartsExchange.cpp
+++ b/src/Storages/MergeTree/DataPartsExchange.cpp
@ -399,7 +399,7 @@ MergeTreeData::DataPartPtr Service::findPart(const String & name)
    throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in table", name);
 }

-MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
+MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
    const StorageMetadataPtr & metadata_snapshot,
    ContextPtr context,
    const String & part_name,
@ -420,6 +420,11 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
    if (blocker.isCancelled())
        throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);

+    const auto data_settings = data.getSettings();
+
+    if (data_settings->allow_remote_fs_zero_copy_replication && !try_zero_copy)
+        LOG_WARNING(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name);
+
    /// It should be "tmp-fetch_" and not "tmp_fetch_", because we can fetch part to detached/,
    /// but detached part name prefix should not contain underscore.
    static const String TMP_PREFIX = "tmp-fetch_";
@ -429,7 +434,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(

    /// Validation of the input that may come from malicious replica.
    auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version);
-    const auto data_settings = data.getSettings();

    Poco::URI uri;
    uri.setScheme(interserver_scheme);
@ -465,6 +469,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
            capability.push_back(toString(disk->getDataSourceDescription().type));
        }
    }
+
    if (!capability.empty())
    {
        ::sort(capability.begin(), capability.end());
@ -474,6 +479,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
    }
    else
    {
+        if (data_settings->allow_remote_fs_zero_copy_replication)
+            LOG_WARNING(log, "Cannot select any zero-copy disk for {}", part_name);
+
        try_zero_copy = false;
    }

@ -585,7 +593,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
            temporary_directory_lock = {};

            /// Try again but without zero-copy
-            return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
+            return fetchSelectedPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts,
                user, password, interserver_scheme, throttler, to_detached, tmp_prefix, nullptr, false, disk);
        }
    }
--- a/src/Storages/MergeTree/DataPartsExchange.h
+++ b/src/Storages/MergeTree/DataPartsExchange.h
@ -66,7 +66,7 @@ public:
    explicit Fetcher(StorageReplicatedMergeTree & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {}

    /// Downloads a part to tmp_directory. If to_detached - downloads to the `detached` directory.
-    MergeTreeData::MutableDataPartPtr fetchPart(
+    MergeTreeData::MutableDataPartPtr fetchSelectedPart(
        const StorageMetadataPtr & metadata_snapshot,
        ContextPtr context,
        const String & part_name,
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@ -532,25 +532,6 @@ void IMergeTreeDataPart::removeIfNeeded()
            LOG_TRACE(storage.log, "Removed part from old location {}", path);
        }
    }
-    catch (const Exception & ex)
-    {
-        tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
-
-        /// In this case we want to avoid assertions, because such errors are unavoidable in setup
-        /// with zero-copy replication.
-        if (const auto * keeper_exception = dynamic_cast<const Coordination::Exception *>(&ex))
-        {
-            if (Coordination::isHardwareError(keeper_exception->code))
-                return;
-        }
-
-        /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
-        /// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
-        /// then all future attempts to execute part producing operation will fail with "directory already exists".
-        assert(!is_temp);
-        assert(state != MergeTreeDataPartState::DeleteOnDestroy);
-        assert(state != MergeTreeDataPartState::Temporary);
-    }
    catch (...)
    {
        tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path));
@ -558,11 +539,6 @@ void IMergeTreeDataPart::removeIfNeeded()
        /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
        /// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
        /// then all future attempts to execute part producing operation will fail with "directory already exists".
-        ///
-        /// For remote disks this issue is really frequent, so we don't about server here
-        assert(!is_temp);
-        assert(state != MergeTreeDataPartState::DeleteOnDestroy);
-        assert(state != MergeTreeDataPartState::Temporary);
    }
 }

@ -1433,7 +1409,10 @@ std::pair<bool, NameSet> IMergeTreeDataPart::canRemovePart() const
 {
    /// NOTE: It's needed for zero-copy replication
    if (force_keep_shared_data)
+    {
+        LOG_DEBUG(storage.log, "Blobs for part {} cannot be removed because it's forced to be keeped", name);
        return std::make_pair(false, NameSet{});
+    }

    return storage.unlockSharedData(*this);
 }
@ -1457,6 +1436,12 @@ void IMergeTreeDataPart::remove() const

    auto [can_remove, files_not_to_remove] = canRemovePart();

+    if (!can_remove)
+        LOG_TRACE(storage.log, "Blobs of part {} cannot be removed", name);
+
+    if (!files_not_to_remove.empty())
+        LOG_TRACE(storage.log, "Some blobs ({}) of part {} cannot be removed", fmt::join(files_not_to_remove, ", "), name);
+
    if (!isStoredOnDisk())
        return;

--- a/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h
@ -0,0 +1,68 @@
+#pragma once
+#include <Interpreters/Context.h>
+#include <Storages/MergeTree/AlterConversions.h>
+#include <Core/NamesAndTypes.h>
+
+namespace DB
+{
+
+class IDataPartStorage;
+using DataPartStoragePtr = std::shared_ptr<IDataPartStorage>;
+class MergeTreeIndexGranularity;
+struct MergeTreeDataPartChecksums;
+struct MergeTreeIndexGranularityInfo;
+class ISerialization;
+using SerializationPtr = std::shared_ptr<const ISerialization>;
+
+/**
+ * An interface exposing all the information about a data part that is required
+ * in order to use MergeTree part readers (see IMergeTreeReader).
+ * It is a separate interface and not a simple struct because a struct
+ * would need to copy all the information, some of which might not
+ * even be used (for example, a MergeTreeIndexGranularity object is quite heavy).
+ */
+class IMergeTreeDataPartInfoForReader : public WithContext
+{
+public:
+    explicit IMergeTreeDataPartInfoForReader(ContextPtr context_) : WithContext(context_) {}
+
+    virtual ~IMergeTreeDataPartInfoForReader() = default;
+
+    virtual bool isCompactPart() const = 0;
+
+    virtual bool isWidePart() const = 0;
+
+    virtual bool isInMemoryPart() const = 0;
+
+    virtual bool isProjectionPart() const = 0;
+
+    virtual const DataPartStoragePtr & getDataPartStorage() const = 0;
+
+    virtual const NamesAndTypesList & getColumns() const = 0;
+
+    virtual std::optional<size_t> getColumnPosition(const String & column_name) const = 0;
+
+    virtual String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const = 0;
+
+    virtual const MergeTreeDataPartChecksums & getChecksums() const = 0;
+
+    virtual AlterConversions getAlterConversions() const = 0;
+
+    virtual size_t getMarksCount() const = 0;
+
+    virtual size_t getFileSizeOrZero(const std::string & file_name) const = 0;
+
+    virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0;
+
+    virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0;
+
+    virtual SerializationPtr getSerialization(const NameAndTypePair & column) const = 0;
+
+    virtual const SerializationInfoByName & getSerializationInfos() const = 0;
+
+    virtual void reportBroken() = 0;
+};
+
+using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr<IMergeTreeDataPartInfoForReader>;
+
+}
--- a/src/Storages/MergeTree/IMergeTreeReader.cpp
+++ b/src/Storages/MergeTree/IMergeTreeReader.cpp
@ -23,7 +23,7 @@ namespace ErrorCodes


 IMergeTreeReader::IMergeTreeReader(
-    const MergeTreeData::DataPartPtr & data_part_,
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
    const NamesAndTypesList & columns_,
    const StorageMetadataPtr & metadata_snapshot_,
    UncompressedCache * uncompressed_cache_,
@ -31,19 +31,18 @@ IMergeTreeReader::IMergeTreeReader(
    const MarkRanges & all_mark_ranges_,
    const MergeTreeReaderSettings & settings_,
    const ValueSizeMap & avg_value_size_hints_)
-    : data_part(data_part_)
+    : data_part_info_for_read(data_part_info_for_read_)
    , avg_value_size_hints(avg_value_size_hints_)
    , uncompressed_cache(uncompressed_cache_)
    , mark_cache(mark_cache_)
    , settings(settings_)
-    , storage(data_part_->storage)
    , metadata_snapshot(metadata_snapshot_)
    , all_mark_ranges(all_mark_ranges_)
-    , alter_conversions(storage.getAlterConversionsForPart(data_part))
+    , alter_conversions(data_part_info_for_read->getAlterConversions())
    /// For wide parts convert plain arrays of Nested to subcolumns
    /// to allow to use shared offset column from cache.
-    , requested_columns(isWidePart(data_part) ? Nested::convertToSubcolumns(columns_) : columns_)
-    , part_columns(isWidePart(data_part) ? Nested::collect(data_part->getColumns()) : data_part->getColumns())
+    , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_)
+    , part_columns(data_part_info_for_read->isWidePart() ? Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns())
 {
    columns_to_read.reserve(requested_columns.size());
    serializations.reserve(requested_columns.size());
@ -71,7 +70,7 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -99,13 +98,13 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
        }

        auto dag = DB::evaluateMissingDefaults(
-                additional_columns, requested_columns, metadata_snapshot->getColumns(), storage.getContext());
+                additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext());
        if (dag)
        {
            dag->addMaterializingOutputActions();
            auto actions = std::make_shared<
                ExpressionActions>(std::move(dag),
-                ExpressionActionsSettings::fromSettings(storage.getContext()->getSettingsRef()));
+                ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef()));
            actions->execute(additional_columns);
        }

@ -117,7 +116,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -151,7 +150,7 @@ SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair
    if (!column_in_part)
        return IDataType::getSerialization(required_column);

-    const auto & infos = data_part->getSerializationInfos();
+    const auto & infos = data_part_info_for_read->getSerializationInfos();
    if (auto it = infos.find(column_in_part->getNameInStorage()); it != infos.end())
        return IDataType::getSerialization(*column_in_part, *it->second);

@ -187,7 +186,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
            copy_block.insert({res_columns[pos], getColumnInPart(*name_and_type).type, name_and_type->name});
        }

-        DB::performRequiredConversions(copy_block, requested_columns, storage.getContext());
+        DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext());

        /// Move columns from block.
        name_and_type = requested_columns.begin();
@ -197,7 +196,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
    catch (Exception & e)
    {
        /// Better diagnostics.
-        e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")");
+        e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")");
        throw;
    }
 }
@ -205,11 +204,11 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const
 IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const
 {
    String table_name = Nested::extractTableName(column_name);
-    for (const auto & part_column : data_part->getColumns())
+    for (const auto & part_column : data_part_info_for_read->getColumns())
    {
        if (typeid_cast<const DataTypeArray *>(part_column.type.get()))
        {
-            auto position = data_part->getColumnPosition(part_column.getNameInStorage());
+            auto position = data_part_info_for_read->getColumnPosition(part_column.getNameInStorage());
            if (position && Nested::extractTableName(part_column.name) == table_name)
                return position;
        }
--- a/src/Storages/MergeTree/IMergeTreeReader.h
+++ b/src/Storages/MergeTree/IMergeTreeReader.h
@ -4,6 +4,8 @@
 #include <Common/HashTable/HashMap.h>
 #include <Storages/MergeTree/MergeTreeReaderStream.h>
 #include <Storages/MergeTree/MergeTreeBlockReadUtils.h>
+#include <Storages/MergeTree/IMergeTreeDataPart.h>
+#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>

 namespace DB
 {
@ -20,7 +22,7 @@ public:
    using DeserializeBinaryBulkStateMap = std::map<std::string, ISerialization::DeserializeBinaryBulkStatePtr>;

    IMergeTreeReader(
-        const MergeTreeData::DataPartPtr & data_part_,
+        MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_,
        const NamesAndTypesList & columns_,
        const StorageMetadataPtr & metadata_snapshot_,
        UncompressedCache * uncompressed_cache_,
@ -57,7 +59,7 @@ public:

    size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; }

-    MergeTreeData::DataPartPtr data_part;
+    MergeTreeDataPartInfoForReaderPtr data_part_info_for_read;

 protected:
    /// Returns actual column name in part, which can differ from table metadata.
@ -86,7 +88,6 @@ protected:

    MergeTreeReaderSettings settings;

-    const MergeTreeData & storage;
    StorageMetadataPtr metadata_snapshot;
    MarkRanges all_mark_ranges;

@ -95,7 +96,7 @@ protected:

 private:
    /// Alter conversions, which must be applied on fly if required
-    MergeTreeData::AlterConversions alter_conversions;
+    AlterConversions alter_conversions;

    /// Columns that are requested to read.
    NamesAndTypesList requested_columns;
--- a/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h
+++ b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h
@ -0,0 +1,55 @@
+#pragma once
+#include <utility>
+
+#include <Storages/MergeTree/IMergeTreeDataPartInfoForReader.h>
+#include <Storages/MergeTree/MergeTreeData.h>
+
+
+namespace DB
+{
+
+/// Implementation of IMergeTreeDataPartInfoForReader backed by a data part object.
+/// Every accessor forwards either to the wrapped part itself or, for alter
+/// conversions and broken-part reporting, to the storage that the part refers to.
+class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader
+{
+public:
+    /// Wraps `data_part_`; the context passed to the base class is taken from the part's storage.
+    /// The base class is initialized before `data_part`, so `data_part_` is still intact when it
+    /// is dereferenced there, and it can then be moved into the member instead of being copied.
+    explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_)
+        : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext())
+        , data_part(std::move(data_part_))
+    {}
+
+    /// Part kind checks, delegated to the free functions of the same name in namespace DB.
+    bool isCompactPart() const override { return DB::isCompactPart(data_part); }
+
+    bool isWidePart() const override { return DB::isWidePart(data_part); }
+
+    bool isInMemoryPart() const override { return DB::isInMemoryPart(data_part); }
+
+    bool isProjectionPart() const override { return data_part->isProjectionPart(); }
+
+    /// Returns a reference to the storage pointer held by the wrapped part.
+    const DataPartStoragePtr & getDataPartStorage() const override { return data_part->data_part_storage; }
+
+    const NamesAndTypesList & getColumns() const override { return data_part->getColumns(); }
+
+    std::optional<size_t> getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); }
+
+    /// Alter conversions are requested from the storage of the part, not from the part itself.
+    AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); }
+
+    String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); }
+
+    const MergeTreeDataPartChecksums & getChecksums() const override { return data_part->checksums; }
+
+    /// Reports the wrapped part as broken to its storage.
+    void reportBroken() override { data_part->storage.reportBrokenPart(data_part); }
+
+    size_t getMarksCount() const override { return data_part->getMarksCount(); }
+
+    size_t getFileSizeOrZero(const std::string & file_name) const override { return data_part->getFileSizeOrZero(file_name); }
+
+    const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const override { return data_part->index_granularity_info; }
+
+    const MergeTreeIndexGranularity & getIndexGranularity() const override { return data_part->index_granularity; }
+
+    const SerializationInfoByName & getSerializationInfos() const override { return data_part->getSerializationInfos(); }
+
+    /// Only the name of `column` is forwarded to the part.
+    SerializationPtr getSerialization(const NameAndTypePair & column) const override { return data_part->getSerialization(column.name); }
+
+private:
+    /// The wrapped part; all accessors above read through this pointer.
+    MergeTreeData::DataPartPtr data_part;
+};
+
+}
--- a/src/Storages/MergeTree/MarkRange.cpp
+++ b/src/Storages/MergeTree/MarkRange.cpp
@ -36,4 +36,16 @@ size_t getLastMark(const MarkRanges & ranges)
    return current_task_last_mark;
 }

+/// Formats mark ranges as "(begin, end), (begin, end), ..."; an empty list gives an empty string.
+std::string toString(const MarkRanges & ranges)
+{
+    std::string text;
+    bool is_first = true;
+    for (const auto & range : ranges)
+    {
+        /// The separator goes before every element except the first one.
+        if (!is_first)
+            text.append(", ");
+        is_first = false;
+
+        text.append("(");
+        text.append(std::to_string(range.begin));
+        text.append(", ");
+        text.append(std::to_string(range.end));
+        text.append(")");
+    }
+    return text;
+}
+
 }
--- a/Show More
+++ b/Show More