diff --git a/.clang-tidy b/.clang-tidy index 2ca1402ddf1..860e7b3189f 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -1,6 +1,14 @@ +# To run clang-tidy from CMake, build ClickHouse with -DENABLE_CLANG_TIDY=1. To show all warnings, it is +# recommended to pass "-k0" to Ninja. + # Enable all checks + disale selected checks. Feel free to remove disabled checks from below list if # a) the new check is not controversial (this includes many checks in readability-* and google-*) or # b) too noisy (checks with > 100 new warnings are considered noisy, this includes e.g. cppcoreguidelines-*). + +# TODO Let clang-tidy check headers in further directories +# --> HeaderFilterRegex: '^.*/(src|base|programs|utils)/.*(h|hpp)$' +HeaderFilterRegex: '^.*/(base)/.*(h|hpp)$' + Checks: '*, -abseil-*, diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index e1b2b1fad01..1c51d06f395 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -349,6 +349,100 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwin: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwinAarch64: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin_aarch64 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr 
"$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -425,6 +519,46 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + BuilderSpecialReport: + needs: + - BuilderBinDarwin + - BuilderBinDarwinAarch64 + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check + NEEDS_DATA_PATH=${{runner.temp}}/needs.json + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Report Builder + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cat > "$NEEDS_DATA_PATH" << 'EOF' + ${{ toJSON(needs) }} + EOF + cd "$GITHUB_WORKSPACE/tests/ci" + python3 build_report_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## @@ -437,7 +571,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 EOF @@ -521,7 +655,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (thread) + CHECK_NAME=Stress test (tsan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -592,6 +726,7 @@ jobs: - DockerHubPush - DockerServerImages - BuilderReport + - BuilderSpecialReport - FunctionalStatelessTestAsan - FunctionalStatefulTestDebug - StressTestTsan diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 2acc1468328..d3a303eb7ab 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -923,6 +923,53 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinAmd64SSE2: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + 
TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_amd64sse2 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -1011,6 +1058,7 @@ jobs: - BuilderBinFreeBSD # - BuilderBinGCC - BuilderBinPPC64 + - BuilderBinAmd64SSE2 - BuilderBinClangTidy - BuilderDebShared runs-on: [self-hosted, style-checker] @@ -1287,7 +1335,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1326,7 +1374,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1365,7 +1413,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1404,7 +1452,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1443,7 +1491,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -1519,7 +1567,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) 
REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1558,7 +1606,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1597,7 +1645,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -1830,7 +1878,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (address) + CHECK_NAME=Stateful tests (asan) REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1867,7 +1915,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (thread) + CHECK_NAME=Stateful tests (tsan) REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1904,7 +1952,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_msan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (memory) + CHECK_NAME=Stateful tests (msan) REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -2018,7 +2066,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (address) + CHECK_NAME=Stress test (asan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -2058,7 +2106,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (thread) + CHECK_NAME=Stress test (tsan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -2094,7 +2142,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (memory) + CHECK_NAME=Stress test (msan) REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse EOF - name: Download json reports @@ -2130,7 +2178,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_undefined REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (undefined) + CHECK_NAME=Stress test (ubsan) REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse EOF - name: Download json reports @@ -2319,7 +2367,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=0 RUN_BY_HASH_TOTAL=4 @@ -2357,7 +2405,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=1 RUN_BY_HASH_TOTAL=4 @@ -2395,7 +2443,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan 
REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=2 RUN_BY_HASH_TOTAL=4 @@ -2433,7 +2481,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=3 RUN_BY_HASH_TOTAL=4 @@ -2550,7 +2598,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_asan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (ASan) + CHECK_NAME=AST fuzzer (asan) REPO_COPY=${{runner.temp}}/ast_fuzzer_asan/ClickHouse EOF - name: Download json reports @@ -2586,7 +2634,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (TSan) + CHECK_NAME=AST fuzzer (tsan) REPO_COPY=${{runner.temp}}/ast_fuzzer_tsan/ClickHouse EOF - name: Download json reports @@ -2622,7 +2670,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_ubsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (UBSan) + CHECK_NAME=AST fuzzer (ubsan) REPO_COPY=${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse EOF - name: Download json reports @@ -2658,7 +2706,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_msan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (MSan) + CHECK_NAME=AST fuzzer (msan) REPO_COPY=${{runner.temp}}/ast_fuzzer_msan/ClickHouse EOF - name: Download json reports diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 513df8487c4..3f4e5d7bb00 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -935,6 +935,51 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinAmd64SSE2: + needs: [DockerHubPush, FastTest, StyleCheck] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_amd64sse2 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" 
############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -1023,6 +1068,7 @@ jobs: - BuilderBinFreeBSD # - BuilderBinGCC - BuilderBinPPC64 + - BuilderBinAmd64SSE2 - BuilderBinClangTidy - BuilderDebShared runs-on: [self-hosted, style-checker] @@ -1254,6 +1300,228 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Debug0: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Debug1: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Debug2: + needs: [BuilderDebDebug] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_debug + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (debug, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_debug/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + 
mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Tsan0: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (tsan, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=0 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Tsan1: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (tsan, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=1 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" + FunctionalStatelessTestS3Tsan2: + needs: [BuilderDebTsan] + runs-on: [self-hosted, func-tester] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/stateless_s3_storage_tsan + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=Stateless tests (tsan, s3 storage) + REPO_COPY=${{runner.temp}}/stateless_s3_storage_tsan/ClickHouse + KILL_TIMEOUT=10800 + RUN_BY_HASH_NUM=2 + RUN_BY_HASH_TOTAL=3 + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Functional test + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" + python3 functional_test_check.py "$CHECK_NAME" "$KILL_TIMEOUT" + - name: Cleanup + if: always() + run: | + docker kill "$(docker ps -q)" ||: + docker rm -f "$(docker ps -a -q)" ||: + sudo rm -fr "$TEMP_PATH" 
FunctionalStatelessTestAarch64: needs: [BuilderDebAarch64] runs-on: [self-hosted, func-tester-aarch64] @@ -1300,7 +1568,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1339,7 +1607,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1378,7 +1646,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1417,7 +1685,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1456,7 +1724,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -1532,7 +1800,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -1571,7 +1839,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -1610,7 +1878,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -1766,7 +2034,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_flaky_asan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests flaky check (address) + CHECK_NAME=Stateless tests flaky check (asan) REPO_COPY=${{runner.temp}}/stateless_flaky_asan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1927,7 +2195,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (address) + CHECK_NAME=Stateful tests (asan) REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1964,7 +2232,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (thread) + CHECK_NAME=Stateful tests (tsan) REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -2001,7 +2269,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' 
TEMP_PATH=${{runner.temp}}/stateful_msan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (memory) + CHECK_NAME=Stateful tests (msan) REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -2115,7 +2383,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (address) + CHECK_NAME=Stress test (asan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -2155,7 +2423,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (thread) + CHECK_NAME=Stress test (tsan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -2191,7 +2459,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (memory) + CHECK_NAME=Stress test (msan) REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse EOF - name: Download json reports @@ -2227,7 +2495,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_undefined REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (undefined) + CHECK_NAME=Stress test (ubsan) REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse EOF - name: Download json reports @@ -2302,7 +2570,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_asan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (ASan) + CHECK_NAME=AST fuzzer (asan) REPO_COPY=${{runner.temp}}/ast_fuzzer_asan/ClickHouse EOF - name: Download json reports @@ -2338,7 +2606,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (TSan) + CHECK_NAME=AST fuzzer (tsan) REPO_COPY=${{runner.temp}}/ast_fuzzer_tsan/ClickHouse EOF - name: Download json reports @@ -2374,7 +2642,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_ubsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (UBSan) + CHECK_NAME=AST fuzzer (ubsan) REPO_COPY=${{runner.temp}}/ast_fuzzer_ubsan/ClickHouse EOF - name: Download json reports @@ -2410,7 +2678,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/ast_fuzzer_msan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=AST fuzzer (MSan) + CHECK_NAME=AST fuzzer (msan) REPO_COPY=${{runner.temp}}/ast_fuzzer_msan/ClickHouse EOF - name: Download json reports @@ -2599,7 +2867,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=0 RUN_BY_HASH_TOTAL=4 @@ -2637,7 +2905,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=1 RUN_BY_HASH_TOTAL=4 @@ -2675,7 +2943,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=2 
RUN_BY_HASH_TOTAL=4 @@ -2713,7 +2981,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=3 RUN_BY_HASH_TOTAL=4 @@ -3388,6 +3656,12 @@ jobs: - FunctionalStatefulTestMsan - FunctionalStatefulTestUBsan - FunctionalStatelessTestReleaseS3 + - FunctionalStatelessTestS3Debug0 + - FunctionalStatelessTestS3Debug1 + - FunctionalStatelessTestS3Debug2 + - FunctionalStatelessTestS3Tsan0 + - FunctionalStatelessTestS3Tsan1 + - FunctionalStatelessTestS3Tsan2 - StressTestDebug - StressTestAsan - StressTestTsan diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7e12695990c..ae905aa62ba 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -29,8 +29,12 @@ jobs: rm -rf "$TEMP_PATH" && mkdir -p "$TEMP_PATH" cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" cd "$REPO_COPY" + # Download and push packages to artifactory python3 ./tests/ci/push_to_artifactory.py --release "${{ github.ref }}" \ --commit '${{ github.sha }}' --artifactory-url "${{ secrets.JFROG_ARTIFACTORY_URL }}" --all + # Download macos binaries to ${{runner.temp}}/download_binary + python3 ./tests/ci/download_binary.py binary_darwin binary_darwin_aarch64 + mv '${{runner.temp}}/download_binary/'clickhouse-* '${{runner.temp}}/push_to_artifactory' - name: Upload packages to release assets uses: svenstaro/upload-release-action@v2 with: diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index afeebf3c2b7..f579d1fee63 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -426,6 +426,100 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwin: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" + BuilderBinDarwinAarch64: + needs: [DockerHubPush] + runs-on: [self-hosted, builder] + steps: + - name: Set envs + run: | + cat >> 
"$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/build_check + IMAGES_PATH=${{runner.temp}}/images_path + REPO_COPY=${{runner.temp}}/build_check/ClickHouse + CACHES_PATH=${{runner.temp}}/../ccaches + BUILD_NAME=binary_darwin_aarch64 + EOF + - name: Download changed images + uses: actions/download-artifact@v2 + with: + name: changed_images + path: ${{ env.IMAGES_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise we will have no info about contributors + - name: Build + run: | + git -C "$GITHUB_WORKSPACE" submodule sync --recursive + git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10 + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH" + cd "$REPO_COPY/tests/ci" && python3 build_check.py "$BUILD_NAME" + - name: Upload build URLs to artifacts + if: ${{ success() || failure() }} + uses: actions/upload-artifact@v2 + with: + name: ${{ env.BUILD_URLS }} + path: ${{ env.TEMP_PATH }}/${{ env.BUILD_URLS }}.json + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" "$CACHES_PATH" ############################################################################################ ##################################### Docker images ####################################### ############################################################################################ @@ -505,6 +599,46 @@ jobs: # shellcheck disable=SC2046 docker rm -f $(docker ps -a -q) ||: sudo rm -fr "$TEMP_PATH" + BuilderSpecialReport: + needs: + - BuilderBinDarwin + - BuilderBinDarwinAarch64 + runs-on: [self-hosted, style-checker] + steps: + - name: Set envs + run: | + cat >> "$GITHUB_ENV" << 'EOF' + TEMP_PATH=${{runner.temp}}/report_check + REPORTS_PATH=${{runner.temp}}/reports_dir + CHECK_NAME=ClickHouse special build check + NEEDS_DATA_PATH=${{runner.temp}}/needs.json + EOF + - name: Download json reports + uses: actions/download-artifact@v2 + with: + path: ${{ env.REPORTS_PATH }} + - name: Clear repository + run: | + sudo rm -fr "$GITHUB_WORKSPACE" && mkdir "$GITHUB_WORKSPACE" + - name: Check out repository code + uses: actions/checkout@v2 + - name: Report Builder + run: | + sudo rm -fr "$TEMP_PATH" + mkdir -p "$TEMP_PATH" + cat > "$NEEDS_DATA_PATH" << 'EOF' + ${{ toJSON(needs) }} + EOF + cd "$GITHUB_WORKSPACE/tests/ci" + python3 build_report_check.py "$CHECK_NAME" + - name: Cleanup + if: always() + run: | + # shellcheck disable=SC2046 + docker kill $(docker ps -q) ||: + # shellcheck disable=SC2046 + docker rm -f $(docker ps -a -q) ||: + sudo rm -fr "$TEMP_PATH" ############################################################################################## ########################### FUNCTIONAl STATELESS TESTS ####################################### ############################################################################################## @@ -591,7 +725,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -630,7 +764,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_debug 
REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (address) + CHECK_NAME=Stateless tests (asan) REPO_COPY=${{runner.temp}}/stateless_debug/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -669,7 +803,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -708,7 +842,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -747,7 +881,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (thread) + CHECK_NAME=Stateless tests (tsan) REPO_COPY=${{runner.temp}}/stateless_tsan/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -823,7 +957,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=0 @@ -862,7 +996,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=1 @@ -901,7 +1035,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateless_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateless tests (memory) + CHECK_NAME=Stateless tests (msan) REPO_COPY=${{runner.temp}}/stateless_memory/ClickHouse KILL_TIMEOUT=10800 RUN_BY_HASH_NUM=2 @@ -1134,7 +1268,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_debug REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (address) + CHECK_NAME=Stateful tests (asan) REPO_COPY=${{runner.temp}}/stateful_debug/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1171,7 +1305,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (thread) + CHECK_NAME=Stateful tests (tsan) REPO_COPY=${{runner.temp}}/stateful_tsan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1208,7 +1342,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stateful_msan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stateful tests (memory) + CHECK_NAME=Stateful tests (msan) REPO_COPY=${{runner.temp}}/stateful_msan/ClickHouse KILL_TIMEOUT=3600 EOF @@ -1322,7 +1456,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (address) + CHECK_NAME=Stress test (asan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -1362,7 +1496,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_thread REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (thread) + CHECK_NAME=Stress test (tsan) REPO_COPY=${{runner.temp}}/stress_thread/ClickHouse EOF - name: Download json reports @@ -1398,7 +1532,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' 
TEMP_PATH=${{runner.temp}}/stress_memory REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (memory) + CHECK_NAME=Stress test (msan) REPO_COPY=${{runner.temp}}/stress_memory/ClickHouse EOF - name: Download json reports @@ -1434,7 +1568,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/stress_undefined REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Stress test (undefined) + CHECK_NAME=Stress test (ubsan) REPO_COPY=${{runner.temp}}/stress_undefined/ClickHouse EOF - name: Download json reports @@ -1623,7 +1757,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=0 RUN_BY_HASH_TOTAL=4 @@ -1661,7 +1795,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=1 RUN_BY_HASH_TOTAL=4 @@ -1699,7 +1833,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=2 RUN_BY_HASH_TOTAL=4 @@ -1737,7 +1871,7 @@ jobs: cat >> "$GITHUB_ENV" << 'EOF' TEMP_PATH=${{runner.temp}}/integration_tests_tsan REPORTS_PATH=${{runner.temp}}/reports_dir - CHECK_NAME=Integration tests (thread) + CHECK_NAME=Integration tests (tsan) REPO_COPY=${{runner.temp}}/integration_tests_tsan/ClickHouse RUN_BY_HASH_NUM=3 RUN_BY_HASH_TOTAL=4 @@ -1847,6 +1981,7 @@ jobs: - DockerHubPush - DockerServerImages - BuilderReport + - BuilderSpecialReport - FunctionalStatelessTestDebug0 - FunctionalStatelessTestDebug1 - FunctionalStatelessTestDebug2 diff --git a/CMakeLists.txt b/CMakeLists.txt index dbbec2a600d..64fb870b61b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,6 +143,8 @@ include (cmake/add_warning.cmake) if (COMPILER_CLANG) # generate ranges for fast "addr2line" search if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") + # NOTE: that clang has a bug because of it does not emit .debug_aranges + # with ThinLTO, so custom ld.lld wrapper is shipped in docker images. set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") endif () diff --git a/README.md b/README.md index b173add94e3..49aed14f719 100644 --- a/README.md +++ b/README.md @@ -15,4 +15,5 @@ ClickHouse® is an open-source column-oriented database management system that a * [Contacts](https://clickhouse.com/company/contact) can help to get your questions answered if there are any. ## Upcoming events -* [**v22.8 Release Webinar**](https://clickhouse.com/company/events/v22-8-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. +* [**v22.9 Release Webinar**](https://clickhouse.com/company/events/v22-9-release-webinar) Original creator, co-founder, and CTO of ClickHouse Alexey Milovidov will walk us through the highlights of the release, provide live demos, and share vision into what is coming in the roadmap. 
+* [**ClickHouse for Analytics @ Barracuda Networks**](https://www.meetup.com/clickhouse-silicon-valley-meetup-group/events/288140358/) Join us for this in person meetup hosted by our friends at Barracuda in Bay Area. diff --git a/base/base/Decimal.h b/base/base/Decimal.h index 1efb8ba8d92..22cb577b1b2 100644 --- a/base/base/Decimal.h +++ b/base/base/Decimal.h @@ -52,15 +52,15 @@ struct Decimal constexpr Decimal(Decimal &&) noexcept = default; constexpr Decimal(const Decimal &) = default; - constexpr Decimal(const T & value_): value(value_) {} + constexpr Decimal(const T & value_): value(value_) {} // NOLINT(google-explicit-constructor) template - constexpr Decimal(const Decimal & x): value(x.value) {} + constexpr Decimal(const Decimal & x): value(x.value) {} // NOLINT(google-explicit-constructor) constexpr Decimal & operator=(Decimal &&) noexcept = default; constexpr Decimal & operator = (const Decimal &) = default; - constexpr operator T () const { return value; } + constexpr operator T () const { return value; } // NOLINT(google-explicit-constructor) template constexpr U convertTo() const @@ -111,7 +111,7 @@ public: using Base::Base; using NativeType = Base::NativeType; - constexpr DateTime64(const Base & v): Base(v) {} + constexpr DateTime64(const Base & v): Base(v) {} // NOLINT(google-explicit-constructor) }; } diff --git a/base/base/DecomposedFloat.h b/base/base/DecomposedFloat.h index 652b28966b2..f152637b94e 100644 --- a/base/base/DecomposedFloat.h +++ b/base/base/DecomposedFloat.h @@ -36,14 +36,14 @@ struct DecomposedFloat { using Traits = FloatTraits; - DecomposedFloat(T x) + explicit DecomposedFloat(T x) { memcpy(&x_uint, &x, sizeof(x)); } typename Traits::UInt x_uint; - bool is_negative() const + bool isNegative() const { return x_uint >> (Traits::bits - 1); } @@ -53,7 +53,7 @@ struct DecomposedFloat { return (exponent() == 0 && mantissa() == 0) ? 0 - : (is_negative() + : (isNegative() ? -1 : 1); } @@ -63,7 +63,7 @@ struct DecomposedFloat return (x_uint >> (Traits::mantissa_bits)) & (((1ull << (Traits::exponent_bits + 1)) - 1) >> 1); } - int16_t normalized_exponent() const + int16_t normalizedExponent() const { return int16_t(exponent()) - ((1ull << (Traits::exponent_bits - 1)) - 1); } @@ -73,20 +73,20 @@ struct DecomposedFloat return x_uint & ((1ull << Traits::mantissa_bits) - 1); } - int64_t mantissa_with_sign() const + int64_t mantissaWithSign() const { - return is_negative() ? -mantissa() : mantissa(); + return isNegative() ? -mantissa() : mantissa(); } /// NOTE Probably floating point instructions can be better. 
- bool is_integer_in_representable_range() const + bool isIntegerInRepresentableRange() const { return x_uint == 0 - || (normalized_exponent() >= 0 /// The number is not less than one + || (normalizedExponent() >= 0 /// The number is not less than one /// The number is inside the range where every integer has exact representation in float - && normalized_exponent() <= static_cast(Traits::mantissa_bits) + && normalizedExponent() <= static_cast(Traits::mantissa_bits) /// After multiplying by 2^exp, the fractional part becomes zero, means the number is integer - && ((mantissa() & ((1ULL << (Traits::mantissa_bits - normalized_exponent())) - 1)) == 0)); + && ((mantissa() & ((1ULL << (Traits::mantissa_bits - normalizedExponent())) - 1)) == 0)); } @@ -102,15 +102,15 @@ struct DecomposedFloat return sign(); /// Different signs - if (is_negative() && rhs > 0) + if (isNegative() && rhs > 0) return -1; - if (!is_negative() && rhs < 0) + if (!isNegative() && rhs < 0) return 1; /// Fractional number with magnitude less than one - if (normalized_exponent() < 0) + if (normalizedExponent() < 0) { - if (!is_negative()) + if (!isNegative()) return rhs > 0 ? -1 : 1; else return rhs >= 0 ? -1 : 1; @@ -121,11 +121,11 @@ struct DecomposedFloat { if (rhs == std::numeric_limits::lowest()) { - assert(is_negative()); + assert(isNegative()); - if (normalized_exponent() < static_cast(8 * sizeof(Int) - is_signed_v)) + if (normalizedExponent() < static_cast(8 * sizeof(Int) - is_signed_v)) return 1; - if (normalized_exponent() > static_cast(8 * sizeof(Int) - is_signed_v)) + if (normalizedExponent() > static_cast(8 * sizeof(Int) - is_signed_v)) return -1; if (mantissa() == 0) @@ -136,44 +136,44 @@ struct DecomposedFloat } /// Too large number: abs(float) > abs(rhs). Also the case with infinities and NaN. - if (normalized_exponent() >= static_cast(8 * sizeof(Int) - is_signed_v)) - return is_negative() ? -1 : 1; + if (normalizedExponent() >= static_cast(8 * sizeof(Int) - is_signed_v)) + return isNegative() ? -1 : 1; using UInt = std::conditional_t<(sizeof(Int) > sizeof(typename Traits::UInt)), make_unsigned_t, typename Traits::UInt>; UInt uint_rhs = rhs < 0 ? -rhs : rhs; /// Smaller octave: abs(rhs) < abs(float) /// FYI, TIL: octave is also called "binade", https://en.wikipedia.org/wiki/Binade - if (uint_rhs < (static_cast(1) << normalized_exponent())) - return is_negative() ? -1 : 1; + if (uint_rhs < (static_cast(1) << normalizedExponent())) + return isNegative() ? -1 : 1; /// Larger octave: abs(rhs) > abs(float) - if (normalized_exponent() + 1 < static_cast(8 * sizeof(Int) - is_signed_v) - && uint_rhs >= (static_cast(1) << (normalized_exponent() + 1))) - return is_negative() ? 1 : -1; + if (normalizedExponent() + 1 < static_cast(8 * sizeof(Int) - is_signed_v) + && uint_rhs >= (static_cast(1) << (normalizedExponent() + 1))) + return isNegative() ? 1 : -1; /// The same octave - /// uint_rhs == 2 ^ normalized_exponent + mantissa * 2 ^ (normalized_exponent - mantissa_bits) + /// uint_rhs == 2 ^ normalizedExponent + mantissa * 2 ^ (normalizedExponent - mantissa_bits) - bool large_and_always_integer = normalized_exponent() >= static_cast(Traits::mantissa_bits); + bool large_and_always_integer = normalizedExponent() >= static_cast(Traits::mantissa_bits); UInt a = large_and_always_integer - ? static_cast(mantissa()) << (normalized_exponent() - Traits::mantissa_bits) - : static_cast(mantissa()) >> (Traits::mantissa_bits - normalized_exponent()); + ? 
static_cast(mantissa()) << (normalizedExponent() - Traits::mantissa_bits) + : static_cast(mantissa()) >> (Traits::mantissa_bits - normalizedExponent()); - UInt b = uint_rhs - (static_cast(1) << normalized_exponent()); + UInt b = uint_rhs - (static_cast(1) << normalizedExponent()); if (a < b) - return is_negative() ? 1 : -1; + return isNegative() ? 1 : -1; if (a > b) - return is_negative() ? -1 : 1; + return isNegative() ? -1 : 1; /// Float has no fractional part means that the numbers are equal. - if (large_and_always_integer || (mantissa() & ((1ULL << (Traits::mantissa_bits - normalized_exponent())) - 1)) == 0) + if (large_and_always_integer || (mantissa() & ((1ULL << (Traits::mantissa_bits - normalizedExponent())) - 1)) == 0) return 0; else /// Float has fractional part means its abs value is larger. - return is_negative() ? -1 : 1; + return isNegative() ? -1 : 1; } diff --git a/base/base/JSON.h b/base/base/JSON.h index 214e9f88e9b..850b74715c6 100644 --- a/base/base/JSON.h +++ b/base/base/JSON.h @@ -38,6 +38,7 @@ */ +// NOLINTBEGIN(google-explicit-constructor) #ifdef __clang__ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wdeprecated-dynamic-exception-spec" @@ -46,6 +47,7 @@ POCO_DECLARE_EXCEPTION(Foundation_API, JSONException, Poco::Exception) #ifdef __clang__ # pragma clang diagnostic pop #endif +// NOLINTEND(google-explicit-constructor) class JSON { @@ -61,7 +63,7 @@ public: checkInit(); } - JSON(const std::string & s) : ptr_begin(s.data()), ptr_end(s.data() + s.size()), level(0) + explicit JSON(std::string_view s) : ptr_begin(s.data()), ptr_end(s.data() + s.size()), level(0) { checkInit(); } @@ -71,13 +73,7 @@ public: *this = rhs; } - JSON & operator=(const JSON & rhs) - { - ptr_begin = rhs.ptr_begin; - ptr_end = rhs.ptr_end; - level = rhs.level; - return *this; - } + JSON & operator=(const JSON & rhs) = default; const char * data() const { return ptr_begin; } const char * dataEnd() const { return ptr_end; } @@ -169,7 +165,7 @@ public: /// Перейти к следующему элементу массива или следующей name-value паре объекта. iterator & operator++(); - iterator operator++(int); + iterator operator++(int); // NOLINT(cert-dcl21-cpp) /// Есть ли в строке escape-последовательности bool hasEscapes() const; diff --git a/base/base/arithmeticOverflow.h b/base/base/arithmeticOverflow.h index 9a0e27505e1..d7242058658 100644 --- a/base/base/arithmeticOverflow.h +++ b/base/base/arithmeticOverflow.h @@ -3,6 +3,7 @@ #include #include +// NOLINTBEGIN(google-runtime-int) namespace common { @@ -206,3 +207,5 @@ namespace common return false; } } + +// NOLINTEND(google-runtime-int) diff --git a/base/base/bit_cast.h b/base/base/bit_cast.h index 5b4b0931b62..d1246b45590 100644 --- a/base/base/bit_cast.h +++ b/base/base/bit_cast.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/base/base/defines.h b/base/base/defines.h index c8c408b9c93..671253ed9e8 100644 --- a/base/base/defines.h +++ b/base/base/defines.h @@ -143,8 +143,8 @@ /// Macros for suppressing TSA warnings for specific reads/writes (instead of suppressing it for the whole function) /// Consider adding a comment before using these macros. 
-# define TSA_SUPPRESS_WARNING_FOR_READ(x) [&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> const auto & { return (x); }() -# define TSA_SUPPRESS_WARNING_FOR_WRITE(x) [&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> auto & { return (x); }() +# define TSA_SUPPRESS_WARNING_FOR_READ(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> const auto & { return (x); }()) +# define TSA_SUPPRESS_WARNING_FOR_WRITE(x) ([&]() TSA_NO_THREAD_SAFETY_ANALYSIS -> auto & { return (x); }()) /// This macro is useful when only one thread writes to a member /// and you want to read this member from the same thread without locking a mutex. diff --git a/base/base/extended_types.h b/base/base/extended_types.h index 7deb8e17b8e..b58df45a97e 100644 --- a/base/base/extended_types.h +++ b/base/base/extended_types.h @@ -5,7 +5,6 @@ #include #include - using Int128 = wide::integer<128, signed>; using UInt128 = wide::integer<128, unsigned>; using Int256 = wide::integer<256, signed>; @@ -18,7 +17,7 @@ static_assert(sizeof(UInt256) == 32); /// (std::common_type), are "set in stone". Attempting to specialize them causes undefined behavior. /// So instead of using the std type_traits, we use our own version which allows extension. template -struct is_signed +struct is_signed // NOLINT(readability-identifier-naming) { static constexpr bool value = std::is_signed_v; }; @@ -30,7 +29,7 @@ template inline constexpr bool is_signed_v = is_signed::value; template -struct is_unsigned +struct is_unsigned // NOLINT(readability-identifier-naming) { static constexpr bool value = std::is_unsigned_v; }; @@ -51,7 +50,7 @@ template concept is_integer = template concept is_floating_point = std::is_floating_point_v; template -struct is_arithmetic +struct is_arithmetic // NOLINT(readability-identifier-naming) { static constexpr bool value = std::is_arithmetic_v; }; @@ -66,9 +65,9 @@ template inline constexpr bool is_arithmetic_v = is_arithmetic::value; template -struct make_unsigned +struct make_unsigned // NOLINT(readability-identifier-naming) { - typedef std::make_unsigned_t type; + using type = std::make_unsigned_t; }; template <> struct make_unsigned { using type = UInt128; }; @@ -79,9 +78,9 @@ template <> struct make_unsigned { using type = UInt256; }; template using make_unsigned_t = typename make_unsigned::type; template -struct make_signed +struct make_signed // NOLINT(readability-identifier-naming) { - typedef std::make_signed_t type; + using type = std::make_signed_t; }; template <> struct make_signed { using type = Int128; }; @@ -92,7 +91,7 @@ template <> struct make_signed { using type = Int256; }; template using make_signed_t = typename make_signed::type; template -struct is_big_int +struct is_big_int // NOLINT(readability-identifier-naming) { static constexpr bool value = false; }; @@ -104,4 +103,3 @@ template <> struct is_big_int { static constexpr bool value = true; }; template inline constexpr bool is_big_int_v = is_big_int::value; - diff --git a/base/base/find_symbols.h b/base/base/find_symbols.h index b28749afda6..83f53773ae7 100644 --- a/base/base/find_symbols.h +++ b/base/base/find_symbols.h @@ -15,7 +15,7 @@ * * Allow to search for next character from the set of 'symbols...' in a string. 
* It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), - * but with the following differencies: + * but with the following differences: * - works with any memory ranges, including containing zero bytes; * - doesn't require terminating zero byte: end of memory range is passed explicitly; * - if not found, returns pointer to end instead of nullptr; diff --git a/base/base/iostream_debug_helpers.h b/base/base/iostream_debug_helpers.h index 3a3f1a741ad..db974c911df 100644 --- a/base/base/iostream_debug_helpers.h +++ b/base/base/iostream_debug_helpers.h @@ -120,6 +120,7 @@ Out & dumpDispatchPriorities(Out & out, T && x, std::decay_t(out, x); } +// NOLINTNEXTLINE(google-explicit-constructor) struct LowPriority { LowPriority(void *) {} }; template diff --git a/base/base/itoa.h b/base/base/itoa.h index da7c2ffc73e..5e0b18d50c0 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -91,10 +91,10 @@ template using DivisionBy10PowN = typename SelectType < N, - Division, /// divide by 10 - Division, /// divide by 100 - Division, /// divide by 10000 - Division /// divide by 100000000 + Division, /// divide by 10 + Division, /// divide by 100 + Division, /// divide by 10000 + Division /// divide by 100000000 >::Result; template @@ -352,7 +352,7 @@ static inline char * writeUIntText(T x, char * p) static_assert(is_unsigned_v); int len = digits10(x); - auto pp = p + len; + auto * pp = p + len; while (x >= 100) { const auto i = x % 100; diff --git a/base/base/scope_guard.h b/base/base/scope_guard.h index 3d8f8ba0fe0..8524beac7ea 100644 --- a/base/base/scope_guard.h +++ b/base/base/scope_guard.h @@ -5,13 +5,13 @@ #include template -class [[nodiscard]] basic_scope_guard +class [[nodiscard]] BasicScopeGuard { public: - constexpr basic_scope_guard() = default; - constexpr basic_scope_guard(basic_scope_guard && src) : function{src.release()} {} + constexpr BasicScopeGuard() = default; + constexpr BasicScopeGuard(BasicScopeGuard && src) : function{src.release()} {} // NOLINT(hicpp-noexcept-move, performance-noexcept-move-constructor) - constexpr basic_scope_guard & operator=(basic_scope_guard && src) + constexpr BasicScopeGuard & operator=(BasicScopeGuard && src) // NOLINT(hicpp-noexcept-move, performance-noexcept-move-constructor) { if (this != &src) { @@ -23,11 +23,11 @@ public: template requires std::is_convertible_v - constexpr basic_scope_guard(basic_scope_guard && src) : function{src.release()} {} + constexpr BasicScopeGuard(BasicScopeGuard && src) : function{src.release()} {} // NOLINT(google-explicit-constructor) template requires std::is_convertible_v - constexpr basic_scope_guard & operator=(basic_scope_guard && src) + constexpr BasicScopeGuard & operator=(BasicScopeGuard && src) { if (this != &src) { @@ -39,13 +39,13 @@ public: template requires std::is_convertible_v - constexpr basic_scope_guard(const G & function_) : function{function_} {} + constexpr BasicScopeGuard(const G & function_) : function{function_} {} // NOLINT(google-explicit-constructor) template requires std::is_convertible_v - constexpr basic_scope_guard(G && function_) : function{std::move(function_)} {} + constexpr BasicScopeGuard(G && function_) : function{std::move(function_)} {} // NOLINT(google-explicit-constructor, bugprone-forwarding-reference-overload, bugprone-move-forwarding-reference) - ~basic_scope_guard() { invoke(); } + ~BasicScopeGuard() { invoke(); } static constexpr bool is_nullable = std::is_constructible_v; @@ -70,7 +70,7 @@ public: template requires 
std::is_convertible_v - basic_scope_guard & join(basic_scope_guard && other) + BasicScopeGuard & join(BasicScopeGuard && other) { if (other.function) { @@ -102,14 +102,13 @@ private: F function = F{}; }; -using scope_guard = basic_scope_guard>; +using scope_guard = BasicScopeGuard>; template -inline basic_scope_guard make_scope_guard(F && function_) { return std::forward(function_); } +inline BasicScopeGuard make_scope_guard(F && function_) { return std::forward(function_); } #define SCOPE_EXIT_CONCAT(n, ...) \ const auto scope_exit##n = make_scope_guard([&] { __VA_ARGS__; }) #define SCOPE_EXIT_FWD(n, ...) SCOPE_EXIT_CONCAT(n, __VA_ARGS__) #define SCOPE_EXIT(...) SCOPE_EXIT_FWD(__LINE__, __VA_ARGS__) - diff --git a/base/base/sort.h b/base/base/sort.h index 589469fffaa..912545979dc 100644 --- a/base/base/sort.h +++ b/base/base/sort.h @@ -14,7 +14,7 @@ template class DebugLessComparator { public: - constexpr DebugLessComparator(Comparator & cmp_) + constexpr DebugLessComparator(Comparator & cmp_) // NOLINT(google-explicit-constructor) : cmp(cmp_) {} diff --git a/base/base/strong_typedef.h b/base/base/strong_typedef.h index c9ea30b73fd..2ddea6412f5 100644 --- a/base/base/strong_typedef.h +++ b/base/base/strong_typedef.h @@ -34,8 +34,10 @@ public: template ::type> Self & operator=(T && rhs) { t = std::move(rhs); return *this;} + // NOLINTBEGIN(google-explicit-constructor) operator const T & () const { return t; } operator T & () { return t; } + // NOLINTEND(google-explicit-constructor) bool operator==(const Self & rhs) const { return t == rhs.t; } bool operator<(const Self & rhs) const { return t < rhs.t; } @@ -58,7 +60,10 @@ namespace std }; } +// NOLINTBEGIN(bugprone-macro-parentheses) + #define STRONG_TYPEDEF(T, D) \ struct D ## Tag {}; \ using D = StrongTypedef; \ +// NOLINTEND(bugprone-macro-parentheses) diff --git a/base/base/unit.h b/base/base/unit.h index 682b43512fc..1fb530be1f0 100644 --- a/base/base/unit.h +++ b/base/base/unit.h @@ -10,9 +10,11 @@ constexpr size_t GiB = 1024 * MiB; # pragma clang diagnostic ignored "-Wreserved-identifier" #endif +// NOLINTBEGIN(google-runtime-int) constexpr size_t operator"" _KiB(unsigned long long val) { return val * KiB; } constexpr size_t operator"" _MiB(unsigned long long val) { return val * MiB; } constexpr size_t operator"" _GiB(unsigned long long val) { return val * GiB; } +// NOLINTEND(google-runtime-int) #ifdef HAS_RESERVED_IDENTIFIER # pragma clang diagnostic pop diff --git a/base/base/wide_integer_to_string.h b/base/base/wide_integer_to_string.h index 8b794fe9bcb..160bf599516 100644 --- a/base/base/wide_integer_to_string.h +++ b/base/base/wide_integer_to_string.h @@ -51,8 +51,8 @@ struct fmt::formatter> { constexpr auto parse(format_parse_context & ctx) { - auto it = ctx.begin(); - auto end = ctx.end(); + const auto * it = ctx.begin(); + const auto * end = ctx.end(); /// Only support {}. if (it != end && *it != '}') diff --git a/base/glibc-compatibility/memcpy/memcpy.h b/base/glibc-compatibility/memcpy/memcpy.h index 9bee26a3722..0930dfb5c67 100644 --- a/base/glibc-compatibility/memcpy/memcpy.h +++ b/base/glibc-compatibility/memcpy/memcpy.h @@ -63,7 +63,7 @@ * Very large size of memcpy typically indicates suboptimal (not cache friendly) algorithms in code or unrealistic scenarios, * so we don't pay attention to using non-temporary stores. 
* - * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most benefitial, + * On recent Intel CPUs, the presence of "erms" makes "rep movsb" the most beneficial, * even comparing to non-temporary aligned unrolled stores even with the most wide registers. * * memcpy can be written in asm, C or C++. The latter can also use inline asm. @@ -214,4 +214,3 @@ tail: return ret; } - diff --git a/base/pcg-random/pcg_extras.hpp b/base/pcg-random/pcg_extras.hpp index f5ba4d48849..78ce726d48b 100644 --- a/base/pcg-random/pcg_extras.hpp +++ b/base/pcg-random/pcg_extras.hpp @@ -49,6 +49,8 @@ #include #endif +// NOLINTBEGIN(readability-identifier-naming, modernize-use-using, bugprone-macro-parentheses, google-explicit-constructor) + /* * Abstractions for compiler-specific directives */ @@ -90,8 +92,6 @@ #define PCG_EMULATED_128BIT_MATH 1 #endif -// NOLINTBEGIN(*) - namespace pcg_extras { /* @@ -553,6 +553,6 @@ std::ostream& operator<<(std::ostream& out, printable_typename) { } // namespace pcg_extras -// NOLINTEND(*) +// NOLINTEND(readability-identifier-naming, modernize-use-using, bugprone-macro-parentheses, google-explicit-constructor) #endif // PCG_EXTRAS_HPP_INCLUDED diff --git a/base/pcg-random/pcg_random.hpp b/base/pcg-random/pcg_random.hpp index 94e43e1007b..db7c3d7f66c 100644 --- a/base/pcg-random/pcg_random.hpp +++ b/base/pcg-random/pcg_random.hpp @@ -101,7 +101,7 @@ #endif /* - * The pcg_extras namespace contains some support code that is likley to + * The pcg_extras namespace contains some support code that is likely to * be useful for a variety of RNGs, including: * - 128-bit int support for platforms where it isn't available natively * - bit twiddling operations diff --git a/base/pcg-random/pcg_uint128.hpp b/base/pcg-random/pcg_uint128.hpp index 1a1f61b9366..3452ba6f1c4 100644 --- a/base/pcg-random/pcg_uint128.hpp +++ b/base/pcg-random/pcg_uint128.hpp @@ -22,7 +22,7 @@ /* * This code provides a a C++ class that can provide 128-bit (or higher) * integers. To produce 2K-bit integers, it uses two K-bit integers, - * placed in a union that allowes the code to also see them as four K/2 bit + * placed in a union that allows the code to also see them as four K/2 bit * integers (and access them either directly name, or by index). * * It may seem like we're reinventing the wheel here, because several diff --git a/cmake/cpu_features.cmake b/cmake/cpu_features.cmake index 1fc3c2db804..218b4deedce 100644 --- a/cmake/cpu_features.cmake +++ b/cmake/cpu_features.cmake @@ -24,6 +24,23 @@ option (ENABLE_BMI "Use BMI instructions on x86_64" 0) option (ENABLE_AVX2_FOR_SPEC_OP "Use avx2 instructions for specific operations on x86_64" 0) option (ENABLE_AVX512_FOR_SPEC_OP "Use avx512 instructions for specific operations on x86_64" 0) +# X86: Allow compilation for a SSE2-only target machine. Done by a special build in CI for embedded or very old hardware. +option (NO_SSE3_OR_HIGHER "Disable SSE3 or higher on x86_64" 0) +if (NO_SSE3_OR_HIGHER) + SET(ENABLE_SSSE3 0) + SET(ENABLE_SSE41 0) + SET(ENABLE_SSE42 0) + SET(ENABLE_PCLMULQDQ 0) + SET(ENABLE_POPCNT 0) + SET(ENABLE_AVX 0) + SET(ENABLE_AVX2 0) + SET(ENABLE_AVX512 0) + SET(ENABLE_AVX512_VBMI 0) + SET(ENABLE_BMI 0) + SET(ENABLE_AVX2_FOR_SPEC_OP 0) + SET(ENABLE_AVX512_FOR_SPEC_OP 0) +endif() + option (ARCH_NATIVE "Add -march=native compiler flag. This makes your binaries non-portable but more performant code may be generated. This option overrides ENABLE_* options for specific instruction set. Highly not recommended to use." 
0) if (ARCH_NATIVE) diff --git a/cmake/ld.lld.in b/cmake/ld.lld.in new file mode 100755 index 00000000000..9736dab1bc3 --- /dev/null +++ b/cmake/ld.lld.in @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +# This is a workaround for bug in llvm/clang, +# that does not produce .debug_aranges with LTO +# +# NOTE: this is a temporary solution, that should be removed once [1] will be +# resolved. +# +# [1]: https://discourse.llvm.org/t/clang-does-not-produce-full-debug-aranges-section-with-thinlto/64898/8 + +# NOTE: only -flto=thin is supported. +# NOTE: it is not possible to check was there -gdwarf-aranges initially or not. +if [[ "$*" =~ -plugin-opt=thinlto ]]; then + exec "@LLD_PATH@" -mllvm -generate-arange-section "$@" +else + exec "@LLD_PATH@" "$@" +fi diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake index 12182ed9c20..a9c2158359a 100644 --- a/cmake/split_debug_symbols.cmake +++ b/cmake/split_debug_symbols.cmake @@ -20,7 +20,7 @@ macro(clickhouse_split_debug_symbols) COMMAND mkdir -p "${STRIP_DESTINATION_DIR}/bin" COMMAND cp "${STRIP_BINARY_PATH}" "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" # Splits debug symbols into separate file, leaves the binary untouched: - COMMAND "${OBJCOPY_PATH}" --only-keep-debug --compress-debug-sections "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" + COMMAND "${OBJCOPY_PATH}" --only-keep-debug "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" COMMAND chmod 0644 "${STRIP_DESTINATION_DIR}/lib/debug/bin/${STRIP_TARGET}.debug" # Strips binary, sections '.note' & '.comment' are removed in line with Debian's stripping policy: www.debian.org/doc/debian-policy/ch-files.html, section '.clickhouse.hash' is needed for integrity check: COMMAND "${STRIP_PATH}" --remove-section=.comment --remove-section=.note --keep-section=.clickhouse.hash "${STRIP_DESTINATION_DIR}/bin/${STRIP_TARGET}" diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 5b005a6f1f9..57d39899a40 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -94,8 +94,13 @@ if (LINKER_NAME) if (NOT LLD_PATH) message (FATAL_ERROR "Using linker ${LINKER_NAME} but can't find its path.") endif () - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}") + + # This a temporary quirk to emit .debug_aranges with ThinLTO + set (LLD_WRAPPER "${CMAKE_CURRENT_BINARY_DIR}/ld.lld") + configure_file ("${CMAKE_CURRENT_SOURCE_DIR}/cmake/ld.lld.in" "${LLD_WRAPPER}" @ONLY) + + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_WRAPPER}") else () set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") diff --git a/contrib/NuRaft b/contrib/NuRaft index bdba298189e..1be805e7cb2 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit bdba298189e29995892de78dcecf64d127444e81 +Subproject commit 1be805e7cb2494aa8170015493474379b0362dfc diff --git a/contrib/datasketches-cpp b/contrib/datasketches-cpp index 7d73d7610db..7abd49bb2e7 160000 --- a/contrib/datasketches-cpp +++ b/contrib/datasketches-cpp @@ -1 +1 @@ -Subproject commit 7d73d7610db31d4e1ecde0fb3a7ee90ef371207f +Subproject commit 7abd49bb2e72bf9a5029993d31dcb1872da88292 diff --git 
a/contrib/libcxx-cmake/CMakeLists.txt b/contrib/libcxx-cmake/CMakeLists.txt index a501c4df64f..6f42a479588 100644 --- a/contrib/libcxx-cmake/CMakeLists.txt +++ b/contrib/libcxx-cmake/CMakeLists.txt @@ -54,9 +54,8 @@ set(SRCS add_library(cxx ${SRCS}) set_target_properties(cxx PROPERTIES FOLDER "contrib/libcxx-cmake") -target_include_directories(cxx SYSTEM BEFORE PUBLIC - $ - $/src) +target_include_directories(cxx SYSTEM BEFORE PRIVATE $) +target_include_directories(cxx SYSTEM BEFORE PUBLIC $) target_compile_definitions(cxx PRIVATE -D_LIBCPP_BUILDING_LIBRARY -DLIBCXX_BUILDING_LIBCXXABI) # Enable capturing stack traces for all exceptions. diff --git a/contrib/vectorscan-cmake/CMakeLists.txt b/contrib/vectorscan-cmake/CMakeLists.txt index bc17105be99..f9f46d9a8cf 100644 --- a/contrib/vectorscan-cmake/CMakeLists.txt +++ b/contrib/vectorscan-cmake/CMakeLists.txt @@ -1,6 +1,6 @@ # We use vectorscan, a portable and API/ABI-compatible drop-in replacement for hyperscan. -if (ARCH_AMD64) +if (ARCH_AMD64 AND NOT NO_SSE3_OR_HIGHER) option (ENABLE_VECTORSCAN "Enable vectorscan library" ${ENABLE_LIBRARIES}) endif() diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index ba0a27c9801..b9b0c5c2c6c 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -83,5 +83,8 @@ RUN export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ --yes --no-install-recommends \ && apt-get clean +# for external_symbolizer_path +RUN ln -s /usr/bin/llvm-symbolizer-15 /usr/bin/llvm-symbolizer + COPY build.sh / CMD ["bash", "-c", "/build.sh 2>&1"] diff --git a/docker/packager/packager b/docker/packager/packager index 66eb568d460..591262959b4 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -130,6 +130,7 @@ def parse_env_variables( ARM_SUFFIX = "-aarch64" FREEBSD_SUFFIX = "-freebsd" PPC_SUFFIX = "-ppc64le" + AMD64_SSE2_SUFFIX = "-amd64sse2" result = [] result.append("OUTPUT_DIR=/output") @@ -141,6 +142,7 @@ def parse_env_variables( is_cross_arm = compiler.endswith(ARM_SUFFIX) is_cross_ppc = compiler.endswith(PPC_SUFFIX) is_cross_freebsd = compiler.endswith(FREEBSD_SUFFIX) + is_amd64_sse2 = compiler.endswith(AMD64_SSE2_SUFFIX) if is_cross_darwin: cc = compiler[: -len(DARWIN_SUFFIX)] @@ -186,6 +188,10 @@ def parse_env_variables( cmake_flags.append( "-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-ppc64le.cmake" ) + elif is_amd64_sse2: + cc = compiler[: -len(AMD64_SSE2_SUFFIX)] + result.append("DEB_ARCH=amd64") + cmake_flags.append("-DNO_SSE3_OR_HIGHER=1") else: cc = compiler result.append("DEB_ARCH=amd64") @@ -339,6 +345,7 @@ if __name__ == "__main__": "clang-14-darwin-aarch64", "clang-14-aarch64", "clang-14-ppc64le", + "clang-14-amd64sse2", "clang-14-freebsd", "gcc-11", ), diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index b9e8b89cd92..f4102a6ccaf 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -31,9 +31,6 @@ ARG deb_location_url="" # set non-empty single_binary_location_url to create docker image # from a single binary url (useful for non-standard builds - with sanitizers, for arm64). -# for example (run on aarch64 server): -# docker build . --network host --build-arg single_binary_location_url="https://builds.clickhouse.com/master/aarch64/clickhouse" -t altinity/clickhouse-server:master-testing-arm -# note: clickhouse-odbc-bridge is not supported there. ARG single_binary_location_url="" # user/group precreated explicitly with fixed uid/gid on purpose. 
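A side note on the SSE2-only build introduced above (the `NO_SSE3_OR_HIGHER` CMake option and the `clang-14-amd64sse2` packager compiler): it can also be exercised locally with plain CMake. A sketch, assuming clang-14 is installed and using an arbitrary build directory name:
# Configure an x86_64 build restricted to SSE2 via the NO_SSE3_OR_HIGHER option added in cmake/cpu_features.cmake.
# The compiler version and the directory name are assumptions for this sketch, not requirements of the change.
mkdir -p build-sse2 && cd build-sse2
cmake -G Ninja -DCMAKE_C_COMPILER=clang-14 -DCMAKE_CXX_COMPILER=clang++-14 -DNO_SSE3_OR_HIGHER=1 ..
ninja    # or a specific target, e.g. ninja clickhouse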
diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index d4da5f0f38c..16372230d91 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -37,7 +37,6 @@ if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)" # There could be many disks declared in config -readarray -t FILESYSTEM_CACHE_PATHS < <(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key='storage_configuration.disks.*.data_cache_path' || true) readarray -t DISKS_PATHS < <(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key='storage_configuration.disks.*.path' || true) CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" @@ -51,7 +50,6 @@ for dir in "$DATA_DIR" \ "$TMP_DIR" \ "$USER_PATH" \ "$FORMAT_SCHEMA_PATH" \ - "${FILESYSTEM_CACHE_PATHS[@]}" \ "${DISKS_PATHS[@]}" do # check if variable not empty diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 11ddb0bd2d3..93e38260395 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -1,8 +1,15 @@ #!/bin/bash # shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031 -set -eux +set -x + +# core.COMM.PID-TID +sysctl kernel.core_pattern='core.%e.%p-%P' + +set -e +set -u set -o pipefail + trap "exit" INT TERM # The watchdog is in the separate process group, so we have to kill it separately # if the script terminates earlier. @@ -87,6 +94,19 @@ function configure # TODO figure out which ones are needed cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d + + cat > db/config.d/core.xml < + + + 107374182400 + + + $PWD + +EOL } function watchdog @@ -180,7 +200,6 @@ handle SIGUSR2 nostop noprint pass handle SIG$RTMIN nostop noprint pass info signals continue -gcore backtrace full thread apply all backtrace full info registers diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index f8ecdf1aa21..ac7de9c07a2 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -3,8 +3,14 @@ # shellcheck disable=SC2086 # shellcheck disable=SC2024 +# Avoid overlaps with previous runs +dmesg --clear + set -x + +# core.COMM.PID-TID +sysctl kernel.core_pattern='core.%e.%p-%P' + # Thread Fuzzer allows to check more permutations of possible thread scheduling # and find more potential issues. @@ -38,8 +44,10 @@ function install_packages() function configure() { + export ZOOKEEPER_FAULT_INJECTION=1 # install test configs export USE_DATABASE_ORDINARY=1 + export EXPORT_S3_STORAGE_POLICIES=1 /usr/share/clickhouse-test/config/install.sh # we mount tests folder from repo to /usr/share @@ -99,6 +107,19 @@ EOL +EOL + + cat > /etc/clickhouse-server/config.d/core.xml < + + + 107374182400 + + + $PWD + EOL } @@ -155,7 +176,6 @@ handle SIGUSR2 nostop noprint pass handle SIG$RTMIN nostop noprint pass info signals continue -gcore backtrace full thread apply all backtrace full info registers @@ -183,11 +203,11 @@ install_packages package_folder configure azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log & -./setup_minio.sh stateful # to have a proper environment +./setup_minio.sh stateless # to have a proper environment start -# shellcheck disable=SC2086 # No quotes because I want to split it into words. +# shellcheck disable=SC2086 # No quotes because I want to split it into words.
/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS chmod 777 -R /var/lib/clickhouse clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary" @@ -200,12 +220,36 @@ start clickhouse-client --query "SHOW TABLES FROM datasets" clickhouse-client --query "SHOW TABLES FROM test" -clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" -clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" -clickhouse-client --query "CREATE TABLE test.hits_s3 (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'" -clickhouse-client --query "INSERT INTO test.hits_s3 SELECT * FROM test.hits" + 
+clickhouse-client --query "CREATE TABLE test.hits_s3 (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'" +clickhouse-client --query "CREATE TABLE test.hits (WatchID UInt64, JavaEnable UInt8, Title String, GoodEvent Int16, EventTime DateTime, EventDate Date, CounterID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RegionID UInt32, UserID UInt64, CounterClass Int8, OS UInt8, UserAgent UInt8, URL String, Referer String, URLDomain String, RefererDomain String, Refresh UInt8, IsRobot UInt8, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), ResolutionWidth UInt16, 
ResolutionHeight UInt16, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, FlashMinor2 String, NetMajor UInt8, NetMinor UInt8, UserAgentMajor UInt16, UserAgentMinor FixedString(2), CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, MobilePhone UInt8, MobilePhoneModel String, Params String, IPNetworkID UInt32, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, IsArtifical UInt8, WindowClientWidth UInt16, WindowClientHeight UInt16, ClientTimeZone Int16, ClientEventTime DateTime, SilverlightVersion1 UInt8, SilverlightVersion2 UInt8, SilverlightVersion3 UInt32, SilverlightVersion4 UInt16, PageCharset String, CodeVersion UInt32, IsLink UInt8, IsDownload UInt8, IsNotBounce UInt8, FUniqID UInt64, HID UInt32, IsOldCounter UInt8, IsEvent UInt8, IsParameter UInt8, DontCountHits UInt8, WithHash UInt8, HitColor FixedString(1), UTCEventTime DateTime, Age UInt8, Sex UInt8, Income UInt8, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), RemoteIP UInt32, RemoteIP6 FixedString(16), WindowName Int32, OpenerName Int32, HistoryLength Int16, BrowserLanguage FixedString(2), BrowserCountry FixedString(2), SocialNetwork String, SocialAction String, HTTPError UInt16, SendTiming Int32, DNSTiming Int32, ConnectTiming Int32, ResponseStartTiming Int32, ResponseEndTiming Int32, FetchTiming Int32, RedirectTiming Int32, DOMInteractiveTiming Int32, DOMContentLoadedTiming Int32, DOMCompleteTiming Int32, LoadEventStartTiming Int32, LoadEventEndTiming Int32, NSToDOMContentLoadedTiming Int32, FirstPaintTiming Int32, RedirectCount Int8, SocialSourceNetworkID UInt8, SocialSourcePage String, ParamPrice Int64, ParamOrderID String, ParamCurrency FixedString(3), ParamCurrencyID UInt16, GoalsReached Array(UInt32), OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, RefererHash UInt64, URLHash UInt64, CLID UInt32, YCLID UInt64, ShareService String, ShareURL String, ShareTitle String, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), IslandID FixedString(16), RequestNum UInt32, RequestTry UInt8) ENGINE = MergeTree() PARTITION BY toYYYYMM(EventDate) ORDER BY (CounterID, EventDate, intHash32(UserID)) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'" +clickhouse-client --query "CREATE TABLE test.visits (CounterID UInt32, StartDate Date, Sign Int8, IsNew UInt8, VisitID UInt64, UserID UInt64, StartTime DateTime, Duration UInt32, UTCStartTime DateTime, PageViews Int32, Hits Int32, IsBounce UInt8, Referer String, StartURL String, RefererDomain String, StartURLDomain String, EndURL String, LinkURL String, IsDownload UInt8, TraficSourceID Int8, SearchEngineID UInt16, SearchPhrase String, AdvEngineID UInt8, PlaceID Int32, RefererCategories Array(UInt16), URLCategories Array(UInt16), URLRegions Array(UInt32), RefererRegions Array(UInt32), IsYandex UInt8, GoalReachesDepth Int32, GoalReachesURL Int32, GoalReachesAny Int32, SocialSourceNetworkID UInt8, SocialSourcePage String, MobilePhoneModel String, ClientEventTime DateTime, RegionID UInt32, ClientIP UInt32, ClientIP6 FixedString(16), RemoteIP UInt32, RemoteIP6 FixedString(16), IPNetworkID UInt32, SilverlightVersion3 UInt32, CodeVersion UInt32, ResolutionWidth UInt16, ResolutionHeight UInt16, UserAgentMajor UInt16, UserAgentMinor UInt16, WindowClientWidth UInt16, 
WindowClientHeight UInt16, SilverlightVersion2 UInt8, SilverlightVersion4 UInt16, FlashVersion3 UInt16, FlashVersion4 UInt16, ClientTimeZone Int16, OS UInt8, UserAgent UInt8, ResolutionDepth UInt8, FlashMajor UInt8, FlashMinor UInt8, NetMajor UInt8, NetMinor UInt8, MobilePhone UInt8, SilverlightVersion1 UInt8, Age UInt8, Sex UInt8, Income UInt8, JavaEnable UInt8, CookieEnable UInt8, JavascriptEnable UInt8, IsMobile UInt8, BrowserLanguage UInt16, BrowserCountry UInt16, Interests UInt16, Robotness UInt8, GeneralInterests Array(UInt16), Params Array(String), Goals Nested(ID UInt32, Serial UInt32, EventTime DateTime, Price Int64, OrderID String, CurrencyID UInt32), WatchIDs Array(UInt64), ParamSumPrice Int64, ParamCurrency FixedString(3), ParamCurrencyID UInt16, ClickLogID UInt64, ClickEventID Int32, ClickGoodEvent Int32, ClickEventTime DateTime, ClickPriorityID Int32, ClickPhraseID Int32, ClickPageID Int32, ClickPlaceID Int32, ClickTypeID Int32, ClickResourceID Int32, ClickCost UInt32, ClickClientIP UInt32, ClickDomainID UInt32, ClickURL String, ClickAttempt UInt8, ClickOrderID UInt32, ClickBannerID UInt32, ClickMarketCategoryID UInt32, ClickMarketPP UInt32, ClickMarketCategoryName String, ClickMarketPPName String, ClickAWAPSCampaignName String, ClickPageName String, ClickTargetType UInt16, ClickTargetPhraseID UInt64, ClickContextType UInt8, ClickSelectType Int8, ClickOptions String, ClickGroupBannerID Int32, OpenstatServiceName String, OpenstatCampaignID String, OpenstatAdID String, OpenstatSourceID String, UTMSource String, UTMMedium String, UTMCampaign String, UTMContent String, UTMTerm String, FromTag String, HasGCLID UInt8, FirstVisit DateTime, PredLastVisit Date, LastVisit Date, TotalVisits UInt32, TraficSource Nested(ID Int8, SearchEngineID UInt16, AdvEngineID UInt8, PlaceID UInt16, SocialSourceNetworkID UInt8, Domain String, SearchPhrase String, SocialSourcePage String), Attendance FixedString(16), CLID UInt32, YCLID UInt64, NormalizedRefererHash UInt64, SearchPhraseHash UInt64, RefererDomainHash UInt64, NormalizedStartURLHash UInt64, StartURLDomainHash UInt64, NormalizedEndURLHash UInt64, TopLevelDomain UInt64, URLScheme UInt64, OpenstatServiceNameHash UInt64, OpenstatCampaignIDHash UInt64, OpenstatAdIDHash UInt64, OpenstatSourceIDHash UInt64, UTMSourceHash UInt64, UTMMediumHash UInt64, UTMCampaignHash UInt64, UTMContentHash UInt64, UTMTermHash UInt64, FromHash UInt64, WebVisorEnabled UInt8, WebVisorActivity UInt32, ParsedParams Nested(Key1 String, Key2 String, Key3 String, Key4 String, Key5 String, ValueDouble Float64), Market Nested(Type UInt8, GoalID UInt32, OrderID String, OrderPrice Int64, PP UInt32, DirectPlaceID UInt32, DirectOrderID UInt32, DirectBannerID UInt32, GoodID String, GoodName String, GoodQuantity Int32, GoodPrice Int64), IslandID FixedString(16)) ENGINE = CollapsingMergeTree(Sign) PARTITION BY toYYYYMM(StartDate) ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID) SAMPLE BY intHash32(UserID) SETTINGS index_granularity = 8192, storage_policy='s3_cache'" + +clickhouse-client --query "INSERT INTO test.hits_s3 SELECT * FROM datasets.hits_v1 SETTINGS enable_filesystem_cache_on_write_operations=0" +clickhouse-client --query "INSERT INTO test.hits SELECT * FROM datasets.hits_v1 SETTINGS enable_filesystem_cache_on_write_operations=0" +clickhouse-client --query "INSERT INTO test.visits SELECT * FROM datasets.visits_v1 SETTINGS enable_filesystem_cache_on_write_operations=0" + +clickhouse-client --query "DROP TABLE datasets.visits_v1 SYNC" +clickhouse-client --query 
"DROP TABLE datasets.hits_v1 SYNC" + clickhouse-client --query "SHOW TABLES FROM test" +clickhouse-client --query "SYSTEM STOP THREAD FUZZER" + +stop + +# Let's enable S3 storage by default +export USE_S3_STORAGE_FOR_MERGE_TREE=1 +configure + +# But we still need default disk because some tables loaded only into it +sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml | sed "s|s3|s3default|" > /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp +mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml +sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml +sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml + +start + ./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \ && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \ || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv @@ -255,6 +299,14 @@ zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-serve # Remove file logical_errors.txt if it's empty [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt +# No such key errors +zgrep -Ea "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \ + && echo -e 'S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No lost s3 keys\tOK' >> /test_output/test_results.tsv + +# Remove file no_such_key_errors.txt if it's empty +[ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt + # Crash zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \ && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ @@ -467,8 +519,7 @@ done clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%'), rowNumberInAllBlocks() LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv -# Core dumps (see gcore) -# Default filename is 'core.PROCESS_ID' +# Core dumps for core in core.*; do pigz $core mv $core.gz /test_output/ diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 64cca4beb3a..7f3f38bd8f5 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -168,7 +168,7 @@ def prepare_for_hung_check(drop_databases): for db in databases: if db == "system": continue - command = make_query_command(f"DROP DATABASE {db}") + command = make_query_command(f'DETACH DATABASE {db}') # we don't wait for drop Popen(command, shell=True) break diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 0ec3f09ab7f..683124feaa0 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -17,7 +17,7 @@ RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ python3-pip \ shellcheck \ yamllint \ - && pip3 install black boto3 codespell dohq-artifactory PyGithub unidiff pylint==2.6.2 \ + && pip3 install black==22.8.0 boto3 codespell==2.2.1 dohq-artifactory PyGithub unidiff pylint==2.6.2 \ && apt-get clean \ && rm -rf 
/root/.cache/pip diff --git a/docs/en/development/build-osx.md b/docs/en/development/build-osx.md index 97e4e4ddde1..12f74feb272 100644 --- a/docs/en/development/build-osx.md +++ b/docs/en/development/build-osx.md @@ -37,7 +37,7 @@ sudo xcode-select --install ``` bash brew update -brew install cmake ninja libtool gettext llvm gcc binutils grep findutils +brew install ccache cmake ninja libtool gettext llvm gcc binutils grep findutils ``` ## Checkout ClickHouse Sources {#checkout-clickhouse-sources} diff --git a/docs/en/development/build.md b/docs/en/development/build.md index fa04fbf2680..8712aa3e2bc 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -140,6 +140,6 @@ hash cmake ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour. -They are built for stable, prestable and testing releases as long as for every commit to master and for every pull request. +Binaries are built for stable and LTS releases, and also for every commit to `master` and for each pull request. To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”. diff --git a/docs/en/engines/database-engines/replicated.md b/docs/en/engines/database-engines/replicated.md index 554345a3c15..f0ef1e981fe 100644 --- a/docs/en/engines/database-engines/replicated.md +++ b/docs/en/engines/database-engines/replicated.md @@ -12,7 +12,7 @@ One ClickHouse server can have multiple replicated databases running and updatin ## Creating a Database {#creating-a-database} ``` sql - CREATE DATABASE testdb ENGINE = Replicated('zoo_path', 'shard_name', 'replica_name') [SETTINGS ...] +CREATE DATABASE testdb ENGINE = Replicated('zoo_path', 'shard_name', 'replica_name') [SETTINGS ...] ``` **Engine Parameters** @@ -21,9 +21,7 @@ One ClickHouse server can have multiple replicated databases running and updatin - `shard_name` — Shard name. Database replicas are grouped into shards by `shard_name`. - `replica_name` — Replica name. Replica names must be different for all replicas of the same shard. -:::warning For [ReplicatedMergeTree](../table-engines/mergetree-family/replication.md#table_engines-replication) tables if no arguments provided, then default arguments are used: `/clickhouse/tables/{uuid}/{shard}` and `{replica}`. These can be changed in the server settings [default_replica_path](../../operations/server-configuration-parameters/settings.md#default_replica_path) and [default_replica_name](../../operations/server-configuration-parameters/settings.md#default_replica_name). Macro `{uuid}` is unfolded to table's uuid, `{shard}` and `{replica}` are unfolded to values from server config, not from database engine arguments. But in the future, it will be possible to use `shard_name` and `replica_name` of Replicated database.
-::: ## Specifics and Recommendations {#specifics-and-recommendations} diff --git a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md index 0eb3331f471..73dea4b0085 100644 --- a/docs/en/engines/table-engines/integrations/embedded-rocksdb.md +++ b/docs/en/engines/table-engines/integrations/embedded-rocksdb.md @@ -16,12 +16,14 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] name1 [type1] [DEFAULT|MATERIALIZED|ALIAS expr1], name2 [type2] [DEFAULT|MATERIALIZED|ALIAS expr2], ... -) ENGINE = EmbeddedRocksDB([ttl]) PRIMARY KEY(primary_key_name) +) ENGINE = EmbeddedRocksDB([ttl, rocksdb_dir, read_only]) PRIMARY KEY(primary_key_name) ``` Engine parameters: - `ttl` - time to live for values. TTL is accepted in seconds. If TTL is 0, regular RocksDB instance is used (without TTL). +- `rocksdb_dir` - path to the directory of an existing RocksDB or the destination path of the created RocksDB. Open the table with the specified `rocksdb_dir`. +- `read_only` - when `read_only` is set to true, read-only mode is used. For storage with TTL, compaction will not be triggered (neither manual nor automatic), so no expired entries are removed. - `primary_key_name` – any column name in the column list. - `primary key` must be specified, it supports only one column in the primary key. The primary key will be serialized in binary as a `rocksdb key`. - columns other than the primary key will be serialized in binary as `rocksdb` value in corresponding order. diff --git a/docs/en/engines/table-engines/special/generate.md b/docs/en/engines/table-engines/special/generate.md index d03d6dc9d13..e42429a1b10 100644 --- a/docs/en/engines/table-engines/special/generate.md +++ b/docs/en/engines/table-engines/special/generate.md @@ -15,7 +15,7 @@ Usage examples: ## Usage in ClickHouse Server {#usage-in-clickhouse-server} ``` sql -ENGINE = GenerateRandom(random_seed, max_string_length, max_array_length) +ENGINE = GenerateRandom([random_seed] [,max_string_length] [,max_array_length]) ``` The `max_array_length` and `max_string_length` parameters specify maximum length of all diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index e74849a76e0..e31ce3de5ce 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -13,7 +13,7 @@ OpenCelliD Project is licensed under a Creative Commons Attribution-ShareAlike 4 ## Get the Dataset {#get-the-dataset} -1. Download the snapshot of the dataset from February 2021: [https://datasets.clickhouse.com/cell_towers.csv.xz] (729 MB). +1. Download the snapshot of the dataset from February 2021: [cell_towers.csv.xz](https://datasets.clickhouse.com/cell_towers.csv.xz) (729 MB). 2. Validate the integrity (optional step): ``` diff --git a/docs/en/getting-started/example-datasets/nypd_complaint_data.md b/docs/en/getting-started/example-datasets/nypd_complaint_data.md new file mode 100644 index 00000000000..8b02ac23cf9 --- /dev/null +++ b/docs/en/getting-started/example-datasets/nypd_complaint_data.md @@ -0,0 +1,654 @@ +--- +slug: /en/getting-started/example-datasets/nypd_complaint_data +sidebar_label: NYPD Complaint Data +description: "Ingest and query Tab Separated Value data in 5 steps" +title: NYPD Complaint Data +--- + +Tab separated value, or TSV, files are common and may include field headings as the first line of the file.
ClickHouse can ingest TSVs, and can also query TSVs without ingesting the files. This guide covers both of these cases. If you need to query or ingest CSV files, the same techniques work; simply substitute `TSV` with `CSV` in your format arguments. + +While working through this guide you will: +- **Investigate**: Query the structure and content of the TSV file. +- **Determine the target ClickHouse schema**: Choose proper data types and map the existing data to those types. +- **Create a ClickHouse table**. +- **Preprocess and stream** the data to ClickHouse. +- **Run some queries** against ClickHouse. + +The dataset used in this guide comes from the NYC Open Data team, and contains data about "all valid felony, misdemeanor, and violation crimes reported to the New York City Police Department (NYPD)". At the time of writing, the data file is 166MB, but it is updated regularly. + +**Source**: [data.cityofnewyork.us](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) +**Terms of use**: https://www1.nyc.gov/home/terms-of-use.page + +## Prerequisites +- Download the dataset by visiting the [NYPD Complaint Data Current (Year To Date)](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) page, clicking the Export button, and choosing **TSV for Excel**. +- Install [ClickHouse server and client](../../getting-started/install.md). +- [Launch](../../getting-started/install.md#launch) ClickHouse server, and connect with `clickhouse-client`. + +### A note about the commands described in this guide +There are two types of commands in this guide: +- Some of the commands query the TSV files; these are run at the command prompt. +- The rest of the commands query ClickHouse, and these are run in the `clickhouse-client` or Play UI. + +:::note +The examples in this guide assume that you have saved the TSV file to `${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`; please adjust the commands if needed. +::: + +## Familiarize yourself with the TSV file + +Before starting to work with the ClickHouse database, familiarize yourself with the data. + +### Look at the fields in the source TSV file + +This is an example of a command to query a TSV file, but don't run it yet. +```sh +clickhouse-local --query \ +"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" +``` + +Sample response +```response +CMPLNT_NUM Nullable(Float64) +ADDR_PCT_CD Nullable(Float64) +BORO_NM Nullable(String) +CMPLNT_FR_DT Nullable(String) +CMPLNT_FR_TM Nullable(String) +``` + +:::tip +Most of the time the above command will let you know which fields in the input data are numeric, which are strings, and which are tuples. This is not always the case. Because ClickHouse is routinely used with datasets containing billions of records, there is a default number (100) of rows examined to [infer the schema](../../guides/developer/working-with-json/json-semi-structured.md/#relying-on-schema-inference) in order to avoid parsing billions of rows to infer the schema. The response below may not match what you see, as the dataset is updated several times each year. Looking at the Data Dictionary you can see that CMPLNT_NUM is specified as text, and not numeric. By overriding the default of 100 rows for inference with the setting `SETTINGS input_format_max_rows_to_read_for_schema_inference=2000` +you can get a better idea of the content.
+ +Note: as of version 22.5 the default is now 25,000 rows for inferring the schema, so only change the setting if you are on an older version or if you need more than 25,000 rows to be sampled. +::: + +Run this command at your command prompt. You will be using `clickhouse-local` to query the data in the TSV file you downloaded. +```sh +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"describe file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')" +``` + +Result: +```response +CMPLNT_NUM Nullable(String) +ADDR_PCT_CD Nullable(Float64) +BORO_NM Nullable(String) +CMPLNT_FR_DT Nullable(String) +CMPLNT_FR_TM Nullable(String) +CMPLNT_TO_DT Nullable(String) +CMPLNT_TO_TM Nullable(String) +CRM_ATPT_CPTD_CD Nullable(String) +HADEVELOPT Nullable(String) +HOUSING_PSA Nullable(Float64) +JURISDICTION_CODE Nullable(Float64) +JURIS_DESC Nullable(String) +KY_CD Nullable(Float64) +LAW_CAT_CD Nullable(String) +LOC_OF_OCCUR_DESC Nullable(String) +OFNS_DESC Nullable(String) +PARKS_NM Nullable(String) +PATROL_BORO Nullable(String) +PD_CD Nullable(Float64) +PD_DESC Nullable(String) +PREM_TYP_DESC Nullable(String) +RPT_DT Nullable(String) +STATION_NAME Nullable(String) +SUSP_AGE_GROUP Nullable(String) +SUSP_RACE Nullable(String) +SUSP_SEX Nullable(String) +TRANSIT_DISTRICT Nullable(Float64) +VIC_AGE_GROUP Nullable(String) +VIC_RACE Nullable(String) +VIC_SEX Nullable(String) +X_COORD_CD Nullable(Float64) +Y_COORD_CD Nullable(Float64) +Latitude Nullable(Float64) +Longitude Nullable(Float64) +Lat_Lon Tuple(Nullable(Float64), Nullable(Float64)) +New Georeferenced Column Nullable(String) +``` + +At this point you should check that the columns in the TSV file match the names and types specified in the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243). The data types are not very specific, all numeric fields are set to `Nullable(Float64)`, and all other fields are `Nullable(String)`. When you create a ClickHouse table to store the data you can specify more appropriate and performant types. + +### Determine the proper schema + +In order to figure out what types should be used for the fields it is necessary to know what the data looks like. For example, the field `JURISDICTION_CODE` is a numeric: should it be a `UInt8`, or an `Enum`, or is `Float64` appropriate? + +```sql +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select JURISDICTION_CODE, count() FROM + file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') + GROUP BY JURISDICTION_CODE + ORDER BY JURISDICTION_CODE + FORMAT PrettyCompact" +``` + +Result: +```response +┌─JURISDICTION_CODE─┬─count()─┐ +│ 0 │ 188875 │ +│ 1 │ 4799 │ +│ 2 │ 13833 │ +│ 3 │ 656 │ +│ 4 │ 51 │ +│ 6 │ 5 │ +│ 7 │ 2 │ +│ 9 │ 13 │ +│ 11 │ 14 │ +│ 12 │ 5 │ +│ 13 │ 2 │ +│ 14 │ 70 │ +│ 15 │ 20 │ +│ 72 │ 159 │ +│ 87 │ 9 │ +│ 88 │ 75 │ +│ 97 │ 405 │ +└───────────────────┴─────────┘ +``` + +The query response shows that the `JURISDICTION_CODE` fits well in a `UInt8`. + +Similarly, look at some of the `String` fields and see if they are well suited to being `DateTime` or [`LowCardinality(String)`](../../sql-reference/data-types/lowcardinality.md) fields. + +For example, the field `PARKS_NM` is described as "Name of NYC park, playground or greenspace of occurrence, if applicable (state parks are not included)". 
The names of parks in New York City may be a good candidate for a `LowCardinality(String)`: + +```sh +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select count(distinct PARKS_NM) FROM + file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') + FORMAT PrettyCompact" +``` + +Result: +```response +┌─uniqExact(PARKS_NM)─┐ +│ 319 │ +└─────────────────────┘ +``` + +Have a look at some of the park names: +```sql +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select distinct PARKS_NM FROM + file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') + LIMIT 10 + FORMAT PrettyCompact" +``` + +Result: +```response +┌─PARKS_NM───────────────────┐ +│ (null) │ +│ ASSER LEVY PARK │ +│ JAMES J WALKER PARK │ +│ BELT PARKWAY/SHORE PARKWAY │ +│ PROSPECT PARK │ +│ MONTEFIORE SQUARE │ +│ SUTTON PLACE PARK │ +│ JOYCE KILMER PARK │ +│ ALLEY ATHLETIC PLAYGROUND │ +│ ASTORIA PARK │ +└────────────────────────────┘ +``` + +The dataset in use at the time of writing has only a few hundred distinct parks and playgrounds in the `PARK_NM` column. This is a small number based on the [LowCardinality](../../sql-reference/data-types/lowcardinality.md#lowcardinality-dscr) recommendation to stay below 10,000 distinct strings in a `LowCardinality(String)` field. + +### DateTime fields +Based on the **Columns in this Dataset** section of the [dataset web page](https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243) there are date and time fields for the start and end of the reported event. Looking at the min and max of the `CMPLNT_FR_DT` and `CMPLT_TO_DT` gives an idea of whether or not the fields are always populated: + +```sh title="CMPLNT_FR_DT" +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select min(CMPLNT_FR_DT), max(CMPLNT_FR_DT) FROM +file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +FORMAT PrettyCompact" +``` + +Result: +```response +┌─min(CMPLNT_FR_DT)─┬─max(CMPLNT_FR_DT)─┐ +│ 01/01/1973 │ 12/31/2021 │ +└───────────────────┴───────────────────┘ +``` + +```sh title="CMPLNT_TO_DT" +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select min(CMPLNT_TO_DT), max(CMPLNT_TO_DT) FROM +file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +FORMAT PrettyCompact" +``` + +Result: +```response +┌─min(CMPLNT_TO_DT)─┬─max(CMPLNT_TO_DT)─┐ +│ │ 12/31/2021 │ +└───────────────────┴───────────────────┘ +``` + +```sh title="CMPLNT_FR_TM" +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select min(CMPLNT_FR_TM), max(CMPLNT_FR_TM) FROM +file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +FORMAT PrettyCompact" +``` + +Result: +```response +┌─min(CMPLNT_FR_TM)─┬─max(CMPLNT_FR_TM)─┐ +│ 00:00:00 │ 23:59:00 │ +└───────────────────┴───────────────────┘ +``` + +```sh title="CMPLNT_TO_TM" +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select min(CMPLNT_TO_TM), max(CMPLNT_TO_TM) FROM +file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +FORMAT PrettyCompact" +``` + +Result: +```response +┌─min(CMPLNT_TO_TM)─┬─max(CMPLNT_TO_TM)─┐ +│ (null) │ 23:59:00 │ +└───────────────────┴───────────────────┘ +``` + +## Make a plan + +Based on the above investigation: +- `JURISDICTION_CODE` should be cast as `UInt8`. 
+- `PARKS_NM` should be cast to `LowCardinality(String)` +- `CMPLNT_FR_DT` and `CMPLNT_FR_TM` are always populated (possibly with a default time of `00:00:00`) +- `CMPLNT_TO_DT` and `CMPLNT_TO_TM` may be empty +- Dates and times are stored in separate fields in the source +- Dates are `mm/dd/yyyy` format +- Times are `hh:mm:ss` format +- Dates and times can be concatenated into DateTime types +- There are some dates before January 1st 1970, which means we need a 64 bit DateTime + +:::note +There are many more changes to be made to the types, they all can be determined by following the same investigation steps. Look at the number of distinct strings in a field, the min and max of the numerics, and make your decisions. The table schema that is given later in the guide has many low cardinality strings and unsigned integer fields and very few floating point numerics. +::: + +## Concatenate the date and time fields + +To concatenate the date and time fields `CMPLNT_FR_DT` and `CMPLNT_FR_TM` into a single `String` that can be cast to a `DateTime`, select the two fields joined by the concatenation operator: `CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM`. The `CMPLNT_TO_DT` and `CMPLNT_TO_TM` fields are handled similarly. + +```sh +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM AS complaint_begin FROM +file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +LIMIT 10 +FORMAT PrettyCompact" +``` + +Result: +```response +┌─complaint_begin─────┐ +│ 07/29/2010 00:01:00 │ +│ 12/01/2011 12:00:00 │ +│ 04/01/2017 15:00:00 │ +│ 03/26/2018 17:20:00 │ +│ 01/01/2019 00:00:00 │ +│ 06/14/2019 00:00:00 │ +│ 11/29/2021 20:00:00 │ +│ 12/04/2021 00:35:00 │ +│ 12/05/2021 12:50:00 │ +│ 12/07/2021 20:30:00 │ +└─────────────────────┘ +``` + +## Convert the date and time String to a DateTime64 type + +Earlier in the guide we discovered that there are dates in the TSV file before January 1st 1970, which means that we need a 64 bit DateTime type for the dates. The dates also need to be converted from `MM/DD/YYYY` to `YYYY/MM/DD` format. Both of these can be done with [`parseDateTime64BestEffort()`](../../sql-reference/functions/type-conversion-functions.md#parsedatetime64besteffort). + +```sh +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START, + (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END +select parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin, + parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end +FROM file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') +ORDER BY complaint_begin ASC +LIMIT 25 +FORMAT PrettyCompact" +``` + +Lines 2 and 3 above contain the concatenation from the previous step, and lines 4 and 5 above parse the strings into `DateTime64`. As the complaint end time is not guaranteed to exist `parseDateTime64BestEffortOrNull` is used. 
+ +Result: +```response +┌─────────complaint_begin─┬───────────complaint_end─┐ +│ 1925-01-01 10:00:00.000 │ 2021-02-12 09:30:00.000 │ +│ 1925-01-01 11:37:00.000 │ 2022-01-16 11:49:00.000 │ +│ 1925-01-01 15:00:00.000 │ 2021-12-31 00:00:00.000 │ +│ 1925-01-01 15:00:00.000 │ 2022-02-02 22:00:00.000 │ +│ 1925-01-01 19:00:00.000 │ 2022-04-14 05:00:00.000 │ +│ 1955-09-01 19:55:00.000 │ 2022-08-01 00:45:00.000 │ +│ 1972-03-17 11:40:00.000 │ 2022-03-17 11:43:00.000 │ +│ 1972-05-23 22:00:00.000 │ 2022-05-24 09:00:00.000 │ +│ 1972-05-30 23:37:00.000 │ 2022-05-30 23:50:00.000 │ +│ 1972-07-04 02:17:00.000 │ ᴺᵁᴸᴸ │ +│ 1973-01-01 00:00:00.000 │ ᴺᵁᴸᴸ │ +│ 1975-01-01 00:00:00.000 │ ᴺᵁᴸᴸ │ +│ 1976-11-05 00:01:00.000 │ 1988-10-05 23:59:00.000 │ +│ 1977-01-01 00:00:00.000 │ 1977-01-01 23:59:00.000 │ +│ 1977-12-20 00:01:00.000 │ ᴺᵁᴸᴸ │ +│ 1981-01-01 00:01:00.000 │ ᴺᵁᴸᴸ │ +│ 1981-08-14 00:00:00.000 │ 1987-08-13 23:59:00.000 │ +│ 1983-01-07 00:00:00.000 │ 1990-01-06 00:00:00.000 │ +│ 1984-01-01 00:01:00.000 │ 1984-12-31 23:59:00.000 │ +│ 1985-01-01 12:00:00.000 │ 1987-12-31 15:00:00.000 │ +│ 1985-01-11 09:00:00.000 │ 1985-12-31 12:00:00.000 │ +│ 1986-03-16 00:05:00.000 │ 2022-03-16 00:45:00.000 │ +│ 1987-01-07 00:00:00.000 │ 1987-01-09 00:00:00.000 │ +│ 1988-04-03 18:30:00.000 │ 2022-08-03 09:45:00.000 │ +│ 1988-07-29 12:00:00.000 │ 1990-07-27 22:00:00.000 │ +└─────────────────────────┴─────────────────────────┘ +``` +:::note +The dates shown as `1925` above are from errors in the data. There are several records in the original data with dates in the years `1019` - `1022` that should be `2019` - `2022`. They are being stored as Jan 1st 1925 as that is the earliest date with a 64 bit DateTime. +::: + +## Create a table + +The decisions made above on the data types used for the columns are reflected in the table schema +below. We also need to decide on the `ORDER BY` and `PRIMARY KEY` used for the table. At least one +of `ORDER BY` or `PRIMARY KEY` must be specified. Here are some guidelines on deciding on the +columns to includes in `ORDER BY`, and more information is in the *Next Steps* section at the end +of this document. + +### Order By and Primary Key clauses + +- The `ORDER BY` tuple should include fields that are used in query filters +- To maximize compression on disk the `ORDER BY` tuple should be ordered by ascending cardinality +- If it exists, the `PRIMARY KEY` tuple must be a subset of the `ORDER BY` tuple +- If only `ORDER BY` is specified, then the same tuple will be used as `PRIMARY KEY` +- The primary key index is created using the `PRIMARY KEY` tuple if specified, otherwise the `ORDER BY` tuple +- The `PRIMARY KEY` index is kept in main memory + +Looking at the dataset and the questions that might be answered by querying it we might +decide that we would look at the types of crimes reported over time in the five boroughs of +New York City. 
These fields might be then included in the `ORDER BY`: + +| Column | Description (from the data dictionary) | +| ----------- | --------------------------------------------------- | +| OFNS_DESC | Description of offense corresponding with key code | +| RPT_DT | Date event was reported to police | +| BORO_NM | The name of the borough in which the incident occurred | + + +Querying the TSV file for the cardinality of the three candidate columns: + +```bash +clickhouse-local --input_format_max_rows_to_read_for_schema_inference=2000 \ +--query \ +"select formatReadableQuantity(uniq(OFNS_DESC)) as cardinality_OFNS_DESC, + formatReadableQuantity(uniq(RPT_DT)) as cardinality_RPT_DT, + formatReadableQuantity(uniq(BORO_NM)) as cardinality_BORO_NM + FROM + file('${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames') + FORMAT PrettyCompact" +``` + +Result: +```response +┌─cardinality_OFNS_DESC─┬─cardinality_RPT_DT─┬─cardinality_BORO_NM─┐ +│ 60.00 │ 306.00 │ 6.00 │ +└───────────────────────┴────────────────────┴─────────────────────┘ +``` +Ordering by cardinality, the `ORDER BY` becomes: + +``` +ORDER BY ( BORO_NM, OFNS_DESC, RPT_DT ) +``` +:::note +The table below will use more easily read column names, the above names will be mapped to +``` +ORDER BY ( borough, offense_description, date_reported ) +``` +::: + +Putting together the changes to data types and the `ORDER BY` tuple gives this table structure: + +```sql +CREATE TABLE NYPD_Complaint ( + complaint_number String, + precinct UInt8, + borough LowCardinality(String), + complaint_begin DateTime64(0,'America/New_York'), + complaint_end DateTime64(0,'America/New_York'), + was_crime_completed String, + housing_authority String, + housing_level_code UInt32, + jurisdiction_code UInt8, + jurisdiction LowCardinality(String), + offense_code UInt8, + offense_level LowCardinality(String), + location_descriptor LowCardinality(String), + offense_description LowCardinality(String), + park_name LowCardinality(String), + patrol_borough LowCardinality(String), + PD_CD UInt16, + PD_DESC String, + location_type LowCardinality(String), + date_reported Date, + transit_station LowCardinality(String), + suspect_age_group LowCardinality(String), + suspect_race LowCardinality(String), + suspect_sex LowCardinality(String), + transit_district UInt8, + victim_age_group LowCardinality(String), + victim_race LowCardinality(String), + victim_sex LowCardinality(String), + NY_x_coordinate UInt32, + NY_y_coordinate UInt32, + Latitude Float64, + Longitude Float64 +) ENGINE = MergeTree + ORDER BY ( borough, offense_description, date_reported ) +``` + +### Finding the primary key of a table + +The ClickHouse `system` database, specifically `system.table` has all of the information about the table you +just created. This query shows the `ORDER BY` (sorting key), and the `PRIMARY KEY`: +```sql +SELECT + partition_key, + sorting_key, + primary_key, + table +FROM system.tables +WHERE table = 'NYPD_Complaint' +FORMAT Vertical +``` + +Response +```response +Query id: 6a5b10bf-9333-4090-b36e-c7f08b1d9e01 + +Row 1: +────── +partition_key: +sorting_key: borough, offense_description, date_reported +primary_key: borough, offense_description, date_reported +table: NYPD_Complaint + +1 row in set. Elapsed: 0.001 sec. +``` + +## Preprocess and Import Data {#preprocess-import-data} + +We will use `clickhouse-local` tool for data preprocessing and `clickhouse-client` to upload it. 
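+Before assembling the full pipeline in the next subsection, it can be useful to preview a few transformed rows. The query below is only an illustrative sketch: it assumes it is run inside `clickhouse-local` from the directory that contains the TSV file (adjust the path to your environment) and it selects just a handful of the renamed columns.
+
+```sql
+-- Illustrative preview only; run inside clickhouse-local, adjust the file path as needed
+SELECT
+    CMPLNT_NUM AS complaint_number,
+    BORO_NM AS borough,
+    parseDateTime64BestEffort(CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS complaint_begin,
+    toDate(parseDateTimeBestEffort(RPT_DT)) AS date_reported
+FROM file('NYPD_Complaint_Data_Current__Year_To_Date_.tsv', 'TSVWithNames')
+LIMIT 5
+```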
+ +### `clickhouse-local` arguments used + +:::tip +`table='input'` appears in the arguments to clickhouse-local below. clickhouse-local takes the provided input (`cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv`) and inserts the input into a table. By default the table is named `table`. In this guide the name of the table is set to `input` to make the data flow clearer. The final argument to clickhouse-local is a query that selects from the table (`FROM input`) which is then piped to `clickhouse-client` to populate the table `NYPD_Complaint`. +::: + +```sql +cat ${HOME}/NYPD_Complaint_Data_Current__Year_To_Date_.tsv \ + | clickhouse-local --table='input' --input-format='TSVWithNames' \ + --input_format_max_rows_to_read_for_schema_inference=2000 \ + --query " + WITH (CMPLNT_FR_DT || ' ' || CMPLNT_FR_TM) AS CMPLNT_START, + (CMPLNT_TO_DT || ' ' || CMPLNT_TO_TM) AS CMPLNT_END + SELECT + CMPLNT_NUM AS complaint_number, + ADDR_PCT_CD AS precinct, + BORO_NM AS borough, + parseDateTime64BestEffort(CMPLNT_START) AS complaint_begin, + parseDateTime64BestEffortOrNull(CMPLNT_END) AS complaint_end, + CRM_ATPT_CPTD_CD AS was_crime_completed, + HADEVELOPT AS housing_authority_development, + HOUSING_PSA AS housing_level_code, + JURISDICTION_CODE AS jurisdiction_code, + JURIS_DESC AS jurisdiction, + KY_CD AS offense_code, + LAW_CAT_CD AS offense_level, + LOC_OF_OCCUR_DESC AS location_descriptor, + OFNS_DESC AS offense_description, + PARKS_NM AS park_name, + PATROL_BORO AS patrol_borough, + PD_CD, + PD_DESC, + PREM_TYP_DESC AS location_type, + toDate(parseDateTimeBestEffort(RPT_DT)) AS date_reported, + STATION_NAME AS transit_station, + SUSP_AGE_GROUP AS suspect_age_group, + SUSP_RACE AS suspect_race, + SUSP_SEX AS suspect_sex, + TRANSIT_DISTRICT AS transit_district, + VIC_AGE_GROUP AS victim_age_group, + VIC_RACE AS victim_race, + VIC_SEX AS victim_sex, + X_COORD_CD AS NY_x_coordinate, + Y_COORD_CD AS NY_y_coordinate, + Latitude, + Longitude + FROM input" \ + | clickhouse-client --query='INSERT INTO NYPD_Complaint FORMAT TSV' +``` + +## Validate the Data {#validate-data} + +:::note +The dataset changes once or more per year, your counts may not match what is in this document. +::: + +Query: + +```sql +SELECT count() +FROM NYPD_Complaint +``` + +Result: + +```text +┌─count()─┐ +│ 208993 │ +└─────────┘ + +1 row in set. Elapsed: 0.001 sec. +``` + +The size of the dataset in ClickHouse is just 12% of the original TSV file, compare the size of the original TSV file with the size of the table: + +Query: + +```sql +SELECT formatReadableSize(total_bytes) +FROM system.tables +WHERE name = 'NYPD_Complaint' +``` + +Result: +```text +┌─formatReadableSize(total_bytes)─┐ +│ 8.63 MiB │ +└─────────────────────────────────┘ +``` + + +## Run Some Queries {#run-queries} + +### Query 1. 
Compare the number of complaints by month + +Query: + +```sql +SELECT + dateName('month', date_reported) AS month, + count() AS complaints, + bar(complaints, 0, 50000, 80) +FROM NYPD_Complaint +GROUP BY month +ORDER BY complaints DESC +``` + +Result: +```response +Query id: 7fbd4244-b32a-4acf-b1f3-c3aa198e74d9 + +┌─month─────┬─complaints─┬─bar(count(), 0, 50000, 80)───────────────────────────────┐ +│ March │ 34536 │ ███████████████████████████████████████████████████████▎ │ +│ May │ 34250 │ ██████████████████████████████████████████████████████▋ │ +│ April │ 32541 │ ████████████████████████████████████████████████████ │ +│ January │ 30806 │ █████████████████████████████████████████████████▎ │ +│ February │ 28118 │ ████████████████████████████████████████████▊ │ +│ November │ 7474 │ ███████████▊ │ +│ December │ 7223 │ ███████████▌ │ +│ October │ 7070 │ ███████████▎ │ +│ September │ 6910 │ ███████████ │ +│ August │ 6801 │ ██████████▊ │ +│ June │ 6779 │ ██████████▋ │ +│ July │ 6485 │ ██████████▍ │ +└───────────┴────────────┴──────────────────────────────────────────────────────────┘ + +12 rows in set. Elapsed: 0.006 sec. Processed 208.99 thousand rows, 417.99 KB (37.48 million rows/s., 74.96 MB/s.) +``` + +### Query 2. Compare total number of complaints by Borough + +Query: + +```sql +SELECT + borough, + count() AS complaints, + bar(complaints, 0, 125000, 60) +FROM NYPD_Complaint +GROUP BY borough +ORDER BY complaints DESC +``` + +Result: +```response +Query id: 8cdcdfd4-908f-4be0-99e3-265722a2ab8d + +┌─borough───────┬─complaints─┬─bar(count(), 0, 125000, 60)──┐ +│ BROOKLYN │ 57947 │ ███████████████████████████▋ │ +│ MANHATTAN │ 53025 │ █████████████████████████▍ │ +│ QUEENS │ 44875 │ █████████████████████▌ │ +│ BRONX │ 44260 │ █████████████████████▏ │ +│ STATEN ISLAND │ 8503 │ ████ │ +│ (null) │ 383 │ ▏ │ +└───────────────┴────────────┴──────────────────────────────┘ + +6 rows in set. Elapsed: 0.008 sec. Processed 208.99 thousand rows, 209.43 KB (27.14 million rows/s., 27.20 MB/s.) +``` + +## Next Steps + +[A Practical Introduction to Sparse Primary Indexes in ClickHouse](../../guides/improving-query-performance/sparse-primary-indexes/sparse-primary-indexes-intro.md) discusses the differences in ClickHouse indexing compared to traditional relational databases, how ClickHouse builds and uses a sparse primary index, and indexing best practices. diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 0bdb956f0cb..83561b07ade 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -4,10 +4,9 @@ sidebar_position: 1 keywords: [clickhouse, install, installation, docs] description: ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. slug: /en/getting-started/install +title: Installation --- -# Installation - ## System Requirements {#system-requirements} ClickHouse can run on any Linux, FreeBSD, or Mac OS X with x86_64, AArch64, or PowerPC64LE CPU architecture. @@ -59,7 +58,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password. -You can replace `stable` with `lts` or `testing` to use different [release trains](../faq/operations/production.md) based on your needs. +You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs. You can also download and install packages manually from [here](https://packages.clickhouse.com/deb/pool/stable). 
@@ -106,7 +105,7 @@ clickhouse-client # or "clickhouse-client --password" if you set up a password. -If you want to use the most recent version, replace `stable` with `testing` (this is recommended for your testing environments). `prestable` is sometimes also available. +You can replace `stable` with `lts` to use different [release kinds](../faq/operations/production.md) based on your needs. Then run these commands to install packages: @@ -221,7 +220,7 @@ For non-Linux operating systems and for AArch64 CPU architecture, ClickHouse bui curl -O 'https://builds.clickhouse.com/master/aarch64/clickhouse' && chmod a+x ./clickhouse ``` -Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. +Run `sudo ./clickhouse install` to install ClickHouse system-wide (also with needed configuration files, configuring users etc.). Then run `sudo clickhouse start` commands to start the clickhouse-server and `clickhouse-client` to connect to it. Use the `clickhouse client` to connect to the server, or `clickhouse local` to process local data. diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 036fcde6d7a..c980bc65152 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -175,6 +175,10 @@ You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP - `br` - `deflate` - `xz` +- `zstd` +- `lz4` +- `bz2` +- `snappy` To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. diff --git a/docs/en/operations/access-rights.md b/docs/en/operations/access-rights.md index 1919aa49ab9..fc39d8b6dc7 100644 --- a/docs/en/operations/access-rights.md +++ b/docs/en/operations/access-rights.md @@ -151,4 +151,3 @@ Management queries: By default, SQL-driven access control and account management is disabled for all users. You need to configure at least one user in the `users.xml` configuration file and set the value of the [access_management](../operations/settings/settings-users.md#access_management-user-setting) setting to 1. -[Original article](https://clickhouse.com/docs/en/operations/access_rights/) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 2faa23908e4..d26d8f27820 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -2,10 +2,9 @@ slug: /en/operations/backup sidebar_position: 49 sidebar_label: Data backup and restore +title: Data backup and restore --- -# Data backup and restore - While [replication](../engines/table-engines/mergetree-family/replication.md) provides protection from hardware failures, it does not protect against human errors: accidental deletion of data, deletion of the wrong table or a table on the wrong cluster, and software bugs that result in incorrect data processing or data corruption. In many cases mistakes like these will affect all replicas. 
ClickHouse has built-in safeguards to prevent some types of mistakes — for example, by default [you can’t just drop tables with a MergeTree-like engine containing more than 50 Gb of data](server-configuration-parameters/settings.md#max-table-size-to-drop). However, these safeguards do not cover all possible cases and can be circumvented. In order to effectively mitigate possible human errors, you should carefully prepare a strategy for backing up and restoring your data **in advance**. diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 910f57ec56b..3aeae7d1c9d 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -20,6 +20,7 @@ Additional cache types: - [Avro format](../interfaces/formats.md#data-format-avro) schemas cache. - [Dictionaries](../sql-reference/dictionaries/index.md) data cache. - Schema inference cache. +- [Filesystem cache](storing-data.md) over S3, Azure, Local and other disks. Indirectly used: diff --git a/docs/en/operations/quotas.md b/docs/en/operations/quotas.md index f35bf44fcd0..05355e615fd 100644 --- a/docs/en/operations/quotas.md +++ b/docs/en/operations/quotas.md @@ -2,10 +2,9 @@ slug: /en/operations/quotas sidebar_position: 51 sidebar_label: Quotas +title: Quotas --- -# Quotas - Quotas allow you to limit resource usage over a period of time or track the use of resources. Quotas are set up in the user config, which is usually ‘users.xml’. @@ -118,4 +117,3 @@ For distributed query processing, the accumulated amounts are stored on the requ When the server is restarted, quotas are reset. -[Original article](https://clickhouse.com/docs/en/operations/quotas/) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 24e08fe1fcd..b7fe7d49b7b 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1452,7 +1452,7 @@ Port for communicating with clients over MySQL protocol. **Possible values** -Positive integer. +Positive integer to specify the port number to listen to or empty value to disable. Example @@ -1466,7 +1466,7 @@ Port for communicating with clients over PostgreSQL protocol. **Possible values** -Positive integer. +Positive integer to specify the port number to listen to or empty value to disable. Example diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b4d44547328..3869168becd 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1176,8 +1176,9 @@ Enables the quorum writes. - If `insert_quorum < 2`, the quorum writes are disabled. - If `insert_quorum >= 2`, the quorum writes are enabled. +- If `insert_quorum = 'auto'`, use majority number (`number_of_replicas / 2 + 1`) as quorum number. -Default value: 0. +Default value: 0 - disabled. Quorum writes @@ -1259,7 +1260,7 @@ Possible values: Default value: 1. -By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). +By default, blocks inserted into replicated tables by the `INSERT` statement are deduplicated (see [Data Replication](../../engines/table-engines/mergetree-family/replication.md)). 
For the replicated tables by default the only 100 of the most recent blocks for each partition are deduplicated (see [replicated_deduplication_window](merge-tree-settings.md#replicated-deduplication-window), [replicated_deduplication_window_seconds](merge-tree-settings.md/#replicated-deduplication-window-seconds)). For not replicated tables see [non_replicated_deduplication_window](merge-tree-settings.md/#non-replicated-deduplication-window). diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index fab78366892..546e3d7b7a6 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -112,6 +112,119 @@ Example of disk configuration: ``` +## Using local cache {#using-local-cache} + +It is possible to configure local cache over disks in storage configuration starting from version 22.3. For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc. Cache uses `LRU` cache policy. + +Example of configuration for versions later or equal to 22.8: + +``` xml + + + + + s3 + ... + ... s3 configuration ... + + + cache + s3 + /s3_cache/ + 10000000 + + + +``` + +Example of configuration for versions earlier than 22.8: + +``` xml + + + + + s3 + ... + ... s3 configuration ... + 1 + 10000000 + + + +``` + +Cache **configuration settings**: + +- `path` - path to the directory with cache. Default: None, this setting is obligatory. + +- `max_size` - maximum size of the cache in bytes. When the limit is reached, cache files are evicted according to the cache eviction policy. Default: None, this setting is obligatory. + +- `cache_on_write_operations` - allow to turn on `write-through` cache (caching data on any write operations: `INSERT` queries, background merges). Default: `false`. The `write-through` cache can be disabled per query using setting `enable_filesystem_cache_on_write_operations` (data is cached only if both cache config settings and corresponding query setting are enabled). + +- `enable_filesystem_query_cache_limit` - allow to limit the size of cache which is downloaded within each query (depends on user setting `max_query_cache_size`). Default: `false`. + +- `enable_cache_hits_threshold` - a number, which defines how many times some data needs to be read before it will be cached. Default: `0`, e.g. the data is cached at the first attempt to read it. + +- `do_not_evict_index_and_mark_files` - do not evict small frequently used files according to cache policy. Default: `true`. + +- `max_file_segment_size` - a maximum size of a single cache file. Default: `104857600` (100 Mb). + +- `max_elements` - a limit for a number of cache files. Default: `1048576`. + +Cache **query settings**: + +- `enable_filesystem_cache` - allows to disable cache per query even if storage policy was configured with `cache` disk type. Default: `true`. + +- `read_from_filesystem_cache_if_exists_otherwise_bypass_cache` - allows to use cache in query only if it already exists, otherwise query data will not be written to local cache storage. Default: `false`. + +- `enable_filesystem_cache_on_write_operations` - turn on `write-through` cache. This setting works only if setting `cache_on_write_operations` in cache configuration is turned on. + +- `enable_filesystem_cache_log` - turn on logging to `system.filesystem_cache_log` table. Gives a detailed view of cache usage per query. Default: `false`. 
+ +- `max_query_cache_size` - a limit for the cache size, which can be written to local cache storage. Requires enabled `enable_filesystem_query_cache_limit` in cache configuration. Default: `false`. + +- `skip_download_if_exceeds_query_cache` - allows to change the behaviour of setting `max_query_cache_size`. Default: `true`. If this setting is turned on and cache download limit during query was reached, no more cache will be downloaded to cache storage. If this setting is turned off and cache download limit during query was reached, cache will still be written by cost of evicting previously downloaded (within current query) data, e.g. second behaviour allows to preserve `last recentltly used` behaviour while keeping query cache limit. + +** Warning ** +Cache configuration settings and cache query settings correspond to the latest ClickHouse version, for earlier versions something might not be supported. + +Cache **system tables**: + +- `system.filesystem_cache` - system tables which shows current state of cache. + +- `system.filesystem_cache_log` - system table which shows detailed cache usage per query. Requires `enable_filesystem_cache_log` setting to be `true`. + +Cache **commands**: + +- `SYSTEM DROP FILESYSTEM CACHE () (ON CLUSTER)` + +- `SHOW CACHES` -- show list of caches which were configured on the server. + +- `DESCRIBE CACHE ''` - show cache configuration and some general statistics for a specific cache. Cache name can be taken from `SHOW CACHES` command. + +Cache current metrics: + +- `FilesystemCacheSize` + +- `FilesystemCacheElements` + +Cache asynchronous metrics: + +- `FilesystemCacheBytes` + +- `FilesystemCacheFiles` + +Cache profile events: + +- `CachedReadBufferReadFromSourceBytes`, `CachedReadBufferReadFromCacheBytes,` + +- `CachedReadBufferReadFromSourceMicroseconds`, `CachedReadBufferReadFromCacheMicroseconds` + +- `CachedReadBufferCacheWriteBytes`, `CachedReadBufferCacheWriteMicroseconds` + +- `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds` + ## Storing Data on Web Server {#storing-data-on-webserver} There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. diff --git a/docs/en/operations/tips.md b/docs/en/operations/tips.md index 85927cd0e05..facf78c85bf 100644 --- a/docs/en/operations/tips.md +++ b/docs/en/operations/tips.md @@ -74,13 +74,16 @@ Make sure that [`fstrim`](https://en.wikipedia.org/wiki/Trim_(computing)) is ena ## File System {#file-system} -Ext4 is the most reliable option. Set the mount options `noatime`. -XFS should be avoided. It works mostly fine but there are some reports about lower performance. +Ext4 is the most reliable option. Set the mount options `noatime`. XFS works well too. Most other file systems should also work fine. +FAT-32 and exFAT are not supported due to lack of hard links. + Do not use compressed filesystems, because ClickHouse does compression on its own and better. It's not recommended to use encrypted filesystems, because you can use builtin encryption in ClickHouse, which is better. +While ClickHouse can work over NFS, it is not the best idea. + ## Linux Kernel {#linux-kernel} Don’t use an outdated Linux kernel. 
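+As a supplement to the filesystem cache documentation added above, the current cache state can also be inspected from SQL. The statements below are only a sketch: `s3_cache` is the cache name from the example configuration earlier in that section, and the exact column set of `system.filesystem_cache` may differ between versions.
+
+```sql
+-- Sketch: list configured caches and inspect one of them ('s3_cache' comes from the example configuration)
+SHOW CACHES;
+DESCRIBE CACHE 's3_cache';
+
+-- Rough view of how much data is currently cached
+SELECT count() AS file_segments, formatReadableSize(sum(size)) AS cached_size
+FROM system.filesystem_cache;
+```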
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index 433300eefa4..6e4c8c4b94e 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -94,6 +94,21 @@ It is also possible for `Flat`, `Hashed`, `ComplexKeyHashed` dictionaries to onl - If the source is HTTP then `update_field` will be added as a query parameter with the last update time as the parameter value. - If the source is Executable then `update_field` will be added as an executable script argument with the last update time as the argument value. - If the source is ClickHouse, MySQL, PostgreSQL, ODBC there will be an additional part of `WHERE`, where `update_field` is compared as greater or equal with the last update time. + - Per default, this `WHERE`-condition is checked at the highest level of the SQL-Query. Alternatively, the condition can be checked in any other `WHERE`-clause within the query using the `{condition}`-keyword. Example: + ```sql + ... + SOURCE(CLICKHOUSE(... + update_field 'added_time' + QUERY ' + SELECT my_arr.1 AS x, my_arr.2 AS y, creation_time + FROM ( + SELECT arrayZip(x_arr, y_arr) AS my_arr, creation_time + FROM dictionary_source + WHERE {condition} + )' + )) + ... + ``` If `update_field` option is set, additional option `update_lag` can be set. Value of `update_lag` option is subtracted from previous update time before request updated data. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 52f9a06df72..ced96078ce1 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -267,7 +267,7 @@ Result: └────────────────┘ ``` -:::Attention +:::note The return type of `toStartOf*`, `toLastDayOfMonth`, `toMonday` functions described below is `Date` or `DateTime`. Though these functions can take values of the extended types `Date32` and `DateTime64` as an argument, passing them a time outside the normal range (year 1970 to 2149 for `Date` / 2106 for `DateTime`) will produce wrong results. In case argument is out of normal range: @@ -640,7 +640,8 @@ Result: ## date\_diff -Returns the difference between two dates or dates with time values. +Returns the difference between two dates or dates with time values. +The difference is calculated using relative units, e.g. the difference between `2022-01-01` and `2021-12-29` is 3 days for day unit (see [toRelativeDayNum](#torelativedaynum)), 1 month for month unit (see [toRelativeMonthNum](#torelativemonthnum)), 1 year for year unit (see [toRelativeYearNum](#torelativeyearnum)). **Syntax** @@ -692,6 +693,25 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` +Query: + +``` sql +SELECT + toDate('2022-01-01') AS e, + toDate('2021-12-29') AS s, + dateDiff('day', s, e) AS day_diff, + dateDiff('month', s, e) AS month__diff, + dateDiff('year', s, e) AS year_diff; +``` + +Result: + +``` text +┌──────────e─┬──────────s─┬─day_diff─┬─month__diff─┬─year_diff─┐ +│ 2022-01-01 │ 2021-12-29 │ 3 │ 1 │ 1 │ +└────────────┴────────────┴──────────┴─────────────┴───────────┘ +``` + ## date\_sub Subtracts the time interval or date interval from the provided date or date with time. 
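+For example, a minimal sketch (the full syntax and supported units are described in the rest of this section):
+
+```sql
+SELECT date_sub(YEAR, 3, toDate('2018-01-01')) AS three_years_earlier;
+-- expected result: 2015-01-01
+```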
diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index c555a838927..52023df4d72 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -430,5 +430,119 @@ Result: └────────────────────────────┘ ``` +## mapApply + +**Syntax** + +```sql +mapApply(func, map) +``` + +**Parameters** + +- `func` - [Lamda function](../../sql-reference/functions/index.md#higher-order-functions---operator-and-lambdaparams-expr-function). +- `map` — [Map](../../sql-reference/data-types/map.md). + +**Returned value** + +- Returns a map obtained from the original map by application of `func(map1[i], …, mapN[i])` for each element. + +**Example** + +Query: + +```sql +SELECT mapApply((k, v) -> (k, v * 10), _map) AS r +FROM +( + SELECT map('key1', number, 'key2', number * 2) AS _map + FROM numbers(3) +) +``` + +Result: + +```text +┌─r─────────────────────┐ +│ {'key1':0,'key2':0} │ +│ {'key1':10,'key2':20} │ +│ {'key1':20,'key2':40} │ +└───────────────────────┘ +``` + +## mapFilter + +**Syntax** + +```sql +mapFilter(func, map) +``` + +**Parameters** + +- `func` - [Lamda function](../../sql-reference/functions/index.md#higher-order-functions---operator-and-lambdaparams-expr-function). +- `map` — [Map](../../sql-reference/data-types/map.md). + +**Returned value** + +- Returns a map containing only the elements in `map` for which `func(map1[i], …, mapN[i])` returns something other than 0. + + +**Example** + +Query: + +```sql +SELECT mapFilter((k, v) -> ((v % 2) = 0), _map) AS r +FROM +( + SELECT map('key1', number, 'key2', number * 2) AS _map + FROM numbers(3) +) +``` + +Result: + +```text +┌─r───────────────────┐ +│ {'key1':0,'key2':0} │ +│ {'key2':2} │ +│ {'key1':2,'key2':4} │ +└─────────────────────┘ +``` + + +## mapUpdate + +**Syntax** + +```sql +mapUpdate(map1, map2) +``` + +**Parameters** + +- `map1` [Map](../../sql-reference/data-types/map.md). +- `map2` [Map](../../sql-reference/data-types/map.md). + +**Returned value** + +- Returns a map1 with values updated of values for the corresponding keys in map2. + +**Example** + +Query: + +```sql +SELECT mapUpdate(map('key1', 0, 'key3', 0), map('key1', 10, 'key2', 10)) AS map; +``` + +Result: + +```text +┌─map────────────────────────────┐ +│ {'key3':0,'key1':10,'key2':10} │ +└────────────────────────────────┘ +``` [Original article](https://clickhouse.com/docs/en/sql-reference/functions/tuple-map-functions/) diff --git a/docs/en/sql-reference/functions/uniqtheta-functions.md b/docs/en/sql-reference/functions/uniqtheta-functions.md new file mode 100644 index 00000000000..b2d3712abfc --- /dev/null +++ b/docs/en/sql-reference/functions/uniqtheta-functions.md @@ -0,0 +1,94 @@ +--- +slug: /en/sql-reference/functions/uniqtheta-functions +--- + +# uniqTheta Functions + +uniqTheta functions work for two uniqThetaSketch objects to do set operation calculations such as ∪ / ∩ / × (union/intersect/not), it is to return a new uniqThetaSketch object contain the result. + +A uniqThetaSketch object is to be constructed by aggregation function uniqTheta with -State. + +UniqThetaSketch is a data structure storage of approximate values set. +For more information on RoaringBitmap, see: [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html). + +## uniqThetaUnion + +Two uniqThetaSketch objects to do union calculation(set operation ∪), the result is a new uniqThetaSketch. 
+ +``` sql +uniqThetaUnion(uniqThetaSketch,uniqThetaSketch) +``` + +**Arguments** + +- `uniqThetaSketch` – uniqThetaSketch object. + +**Example** + +``` sql +select finalizeAggregation(uniqThetaUnion(a, b)) as a_union_b, finalizeAggregation(a) as a_cardinality, finalizeAggregation(b) as b_cardinality +from +(select arrayReduce('uniqThetaState',[1,2]) as a, arrayReduce('uniqThetaState',[2,3,4]) as b ); +``` + +``` text +┌─a_union_b─┬─a_cardinality─┬─b_cardinality─┐ +│ 4 │ 2 │ 3 │ +└───────────┴───────────────┴───────────────┘ +``` + +## uniqThetaIntersect + +Two uniqThetaSketch objects to do intersect calculation(set operation ∩), the result is a new uniqThetaSketch. + +``` sql +uniqThetaIntersect(uniqThetaSketch,uniqThetaSketch) +``` + +**Arguments** + +- `uniqThetaSketch` – uniqThetaSketch object. + +**Example** + +``` sql +select finalizeAggregation(uniqThetaIntersect(a, b)) as a_intersect_b, finalizeAggregation(a) as a_cardinality, finalizeAggregation(b) as b_cardinality +from +(select arrayReduce('uniqThetaState',[1,2]) as a, arrayReduce('uniqThetaState',[2,3,4]) as b ); +``` + +``` text +┌─a_intersect_b─┬─a_cardinality─┬─b_cardinality─┐ +│ 1 │ 2 │ 3 │ +└───────────────┴───────────────┴───────────────┘ +``` + +## uniqThetaNot + +Two uniqThetaSketch objects to do a_not_b calculation(set operation ×), the result is a new uniqThetaSketch. + +``` sql +uniqThetaNot(uniqThetaSketch,uniqThetaSketch) +``` + +**Arguments** + +- `uniqThetaSketch` – uniqThetaSketch object. + +**Example** + +``` sql +select finalizeAggregation(uniqThetaNot(a, b)) as a_not_b, finalizeAggregation(a) as a_cardinality, finalizeAggregation(b) as b_cardinality +from +(select arrayReduce('uniqThetaState',[2,3,4]) as a, arrayReduce('uniqThetaState',[1,2]) as b ); +``` + +``` text +┌─a_not_b─┬─a_cardinality─┬─b_cardinality─┐ +│ 2 │ 3 │ 2 │ +└─────────┴───────────────┴───────────────┘ +``` + +**See Also** + +- [uniqThetaSketch](../../sql-reference/aggregate-functions/reference/uniqthetasketch.md#agg_function-uniqthetasketch) diff --git a/docs/en/sql-reference/statements/alter/constraint.md b/docs/en/sql-reference/statements/alter/constraint.md index 15bd27e1a95..844b24d7374 100644 --- a/docs/en/sql-reference/statements/alter/constraint.md +++ b/docs/en/sql-reference/statements/alter/constraint.md @@ -9,8 +9,8 @@ sidebar_label: CONSTRAINT Constraints could be added or deleted using following syntax: ``` sql -ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; -ALTER TABLE [db].name DROP CONSTRAINT constraint_name; +ALTER TABLE [db].name [ON CLUSTER cluster] ADD CONSTRAINT constraint_name CHECK expression; +ALTER TABLE [db].name [ON CLUSTER cluster] DROP CONSTRAINT constraint_name; ``` See more on [constraints](../../../sql-reference/statements/create/table.md#constraints). 
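+For example, a sketch of the extended syntax (the cluster name `my_cluster`, the table `db.my_table`, and the constraint are placeholders):
+
+```sql
+ALTER TABLE db.my_table ON CLUSTER my_cluster
+    ADD CONSTRAINT amount_is_positive CHECK amount > 0;
+
+ALTER TABLE db.my_table ON CLUSTER my_cluster
+    DROP CONSTRAINT amount_is_positive;
+```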
diff --git a/docs/en/sql-reference/statements/alter/ttl.md b/docs/en/sql-reference/statements/alter/ttl.md index 3e9846ba1ab..a312e8cad91 100644 --- a/docs/en/sql-reference/statements/alter/ttl.md +++ b/docs/en/sql-reference/statements/alter/ttl.md @@ -11,7 +11,7 @@ sidebar_label: TTL You can change [table TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl) with a request of the following form: ``` sql -ALTER TABLE table_name MODIFY TTL ttl_expression; +ALTER TABLE [db.]table_name [ON CLUSTER cluster] MODIFY TTL ttl_expression; ``` ## REMOVE TTL @@ -19,7 +19,7 @@ ALTER TABLE table_name MODIFY TTL ttl_expression; TTL-property can be removed from table with the following query: ```sql -ALTER TABLE table_name REMOVE TTL +ALTER TABLE [db.]table_name [ON CLUSTER cluster] REMOVE TTL ``` **Example** diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 0721f17e9e2..00347d9cb5b 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -303,7 +303,7 @@ SHOW USERS ## SHOW ROLES -Returns a list of [roles](../../operations/access-rights.md#role-management). To view another parameters, see system tables [system.roles](../../operations/system-tables/roles.md#system_tables-roles) and [system.role-grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). +Returns a list of [roles](../../operations/access-rights.md#role-management). To view another parameters, see system tables [system.roles](../../operations/system-tables/roles.md#system_tables-roles) and [system.role_grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). ### Syntax diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 63386bb32b9..1c623cd1dab 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -267,7 +267,7 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp; └────────────────┘ ``` -:::Attention +:::note Тип возвращаемого описанными далее функциями `toStartOf*`, `toMonday` значения - `Date` или `DateTime`. Хотя эти функции могут принимать значения типа `Date32` или `DateTime64` в качестве аргумента, при обработке аргумента вне нормального диапазона значений (`1970` - `2148` для `Date` и `1970-01-01 00:00:00`-`2106-02-07 08:28:15` для `DateTime`) будет получен некорректный результат. Возвращаемые значения для значений вне нормального диапазона: @@ -277,7 +277,7 @@ SELECT toUnixTimestamp('2017-11-05 08:07:47', 'Asia/Tokyo') AS unix_timestamp; * `2149-05-31` будет результатом функции `toLastDayOfMonth` при обработке аргумента больше `2149-05-31`. ::: -:::Attention +:::note Тип возвращаемого описанными далее функциями `toStartOf*`, `toLastDayOfMonth`, `toMonday` значения - `Date` или `DateTime`. Хотя эти функции могут принимать значения типа `Date32` или `DateTime64` в качестве аргумента, при обработке аргумента вне нормального диапазона значений (`1970` - `2148` для `Date` и `1970-01-01 00:00:00`-`2106-02-07 08:28:15` для `DateTime`) будет получен некорректный результат. 
Возвращаемые значения для значений вне нормального диапазона: diff --git a/docs/ru/sql-reference/statements/alter/constraint.md b/docs/ru/sql-reference/statements/alter/constraint.md index cb380bc6a60..bc65b0bbf27 100644 --- a/docs/ru/sql-reference/statements/alter/constraint.md +++ b/docs/ru/sql-reference/statements/alter/constraint.md @@ -11,8 +11,8 @@ sidebar_label: "Манипуляции с ограничениями" Добавить или удалить ограничение можно с помощью запросов ``` sql -ALTER TABLE [db].name ADD CONSTRAINT constraint_name CHECK expression; -ALTER TABLE [db].name DROP CONSTRAINT constraint_name; +ALTER TABLE [db].name [ON CLUSTER cluster] ADD CONSTRAINT constraint_name CHECK expression; +ALTER TABLE [db].name [ON CLUSTER cluster] DROP CONSTRAINT constraint_name; ``` Запросы выполняют добавление или удаление метаданных об ограничениях таблицы `[db].name`, поэтому выполняются мгновенно. diff --git a/docs/ru/sql-reference/statements/alter/ttl.md b/docs/ru/sql-reference/statements/alter/ttl.md index 855a651ffc2..2917e96fd34 100644 --- a/docs/ru/sql-reference/statements/alter/ttl.md +++ b/docs/ru/sql-reference/statements/alter/ttl.md @@ -11,7 +11,7 @@ sidebar_label: TTL Вы можете изменить [TTL для таблицы](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl) запросом следующего вида: ``` sql -ALTER TABLE table-name MODIFY TTL ttl-expression +ALTER TABLE [db.]table-name [ON CLUSTER cluster] MODIFY TTL ttl-expression ``` ## REMOVE TTL {#remove-ttl} @@ -19,7 +19,7 @@ ALTER TABLE table-name MODIFY TTL ttl-expression Удалить табличный TTL можно запросом следующего вида: ```sql -ALTER TABLE table_name REMOVE TTL +ALTER TABLE [db.]table_name [ON CLUSTER cluster] REMOVE TTL ``` **Пример** @@ -83,4 +83,4 @@ SELECT * FROM table_with_ttl; ### Смотрите также - Подробнее о [свойстве TTL](../../../engines/table-engines/mergetree-family/mergetree.md#mergetree-column-ttl). -- Изменить столбец [с TTL](../../../sql-reference/statements/alter/column.md#alter_modify-column). \ No newline at end of file +- Изменить столбец [с TTL](../../../sql-reference/statements/alter/column.md#alter_modify-column). diff --git a/docs/ru/sql-reference/statements/show.md b/docs/ru/sql-reference/statements/show.md index 1d072c9d5de..59f33c691ae 100644 --- a/docs/ru/sql-reference/statements/show.md +++ b/docs/ru/sql-reference/statements/show.md @@ -305,7 +305,7 @@ SHOW USERS ## SHOW ROLES {#show-roles-statement} -Выводит список [ролей](../../operations/access-rights.md#role-management). Для просмотра параметров ролей, см. системные таблицы [system.roles](../../operations/system-tables/roles.md#system_tables-roles) и [system.role-grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). +Выводит список [ролей](../../operations/access-rights.md#role-management). Для просмотра параметров ролей, см. системные таблицы [system.roles](../../operations/system-tables/roles.md#system_tables-roles) и [system.role_grants](../../operations/system-tables/role-grants.md#system_tables-role_grants). 
### Синтаксис {#show-roles-syntax} diff --git a/docs/zh/development/tests.md b/docs/zh/development/tests.md index ca9300597c7..e6d5cf66de9 100644 --- a/docs/zh/development/tests.md +++ b/docs/zh/development/tests.md @@ -1,338 +1,297 @@ --- -slug: /zh/development/tests +slug: /en/development/tests +sidebar_position: 70 +sidebar_label: Testing +title: ClickHouse Testing +description: Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. --- -# ClickHouse 测试 {#clickhouse-testing} -## 功能测试 {#functional-tests} +## Functional Tests -功能测试使用起来最简单方便. 大多数 ClickHouse 特性都可以通过功能测试进行测试, 并且对于可以通过功能测试进行测试的 ClickHouse 代码的每一个更改, 都必须使用这些特性 +Functional tests are the most simple and convenient to use. Most of ClickHouse features can be tested with functional tests and they are mandatory to use for every change in ClickHouse code that can be tested that way. -每个功能测试都会向正在运行的 ClickHouse 服务器发送一个或多个查询, 并将结果与参考进行比较. +Each functional test sends one or multiple queries to the running ClickHouse server and compares the result with reference. -测试位于 `查询` 目录中. 有两个子目录: `无状态` 和 `有状态`. 无状态测试在没有任何预加载测试数据的情况下运行查询 - 它们通常在测试本身内即时创建小型合成数据集. 状态测试需要来自 Yandex.Metrica 的预加载测试数据, 它对公众开放. +Tests are located in `queries` directory. There are two subdirectories: `stateless` and `stateful`. Stateless tests run queries without any preloaded test data - they often create small synthetic datasets on the fly, within the test itself. Stateful tests require preloaded test data from ClickHouse and it is available to general public. -每个测试可以是两种类型之一: `.sql` 和 `.sh`. `.sql` 测试是简单的 SQL 脚本, 它通过管道传输到 `clickhouse-client --multiquery --testmode`. `.sh` 测试是一个自己运行的脚本. SQL 测试通常比 `.sh` 测试更可取. 仅当您必须测试某些无法从纯 SQL 中执行的功能时才应使用 `.sh` 测试, 例如将一些输入数据传送到 `clickhouse-client` 或测试 `clickhouse-local`. +Each test can be one of two types: `.sql` and `.sh`. `.sql` test is the simple SQL script that is piped to `clickhouse-client --multiquery`. `.sh` test is a script that is run by itself. SQL tests are generally preferable to `.sh` tests. You should use `.sh` tests only when you have to test some feature that cannot be exercised from pure SQL, such as piping some input data into `clickhouse-client` or testing `clickhouse-local`. -### 在本地运行测试 {#functional-test-locally} +### Running a Test Locally {#functional-test-locally} -在本地启动ClickHouse服务器, 监听默认端口(9000). 例如, 要运行测试 `01428_hash_set_nan_key`, 请切换到存储库文件夹并运行以下命令: +Start the ClickHouse server locally, listening on the default port (9000). To +run, for example, the test `01428_hash_set_nan_key`, change to the repository +folder and run the following command: ``` PATH=$PATH: tests/clickhouse-test 01428_hash_set_nan_key ``` -有关更多选项, 请参阅`tests/clickhouse-test --help`. 您可以简单地运行所有测试或运行由测试名称中的子字符串过滤的测试子集:`./clickhouse-test substring`. 还有并行或随机顺序运行测试的选项. +For more options, see `tests/clickhouse-test --help`. You can simply run all tests or run subset of tests filtered by substring in test name: `./clickhouse-test substring`. There are also options to run tests in parallel or in randomized order. -### 添加新测试 {#adding-new-test} +### Adding a New Test -添加新的测试, 在 `queries/0_stateless` 目录下创建 `.sql` 或 `.sh` 文件, 手动检查, 然后通过以下方式生成`.reference`文件:`clickhouse-client -n --testmode < 00000_test.sql > 00000_test.reference` 或 `./00000_test.sh > ./00000_test.reference`. 
+To add new test, create a `.sql` or `.sh` file in `queries/0_stateless` directory, check it manually and then generate `.reference` file in the following way: `clickhouse-client --multiquery < 00000_test.sql > 00000_test.reference` or `./00000_test.sh > ./00000_test.reference`. -测试应仅使用(创建、删除等)`test` 数据库中假定已预先创建的表; 测试也可以使用临时表. +Tests should use (create, drop, etc) only tables in `test` database that is assumed to be created beforehand; also tests can use temporary tables. -### 选择测试名称 {#choosing-test-name} +### Choosing the Test Name -测试名称以五位数前缀开头, 后跟描述性名称, 例如 `00422_hash_function_constexpr.sql`. 要选择前缀, 请找到目录中已存在的最大前缀, 并将其加一. 在此期间, 可能会添加一些具有相同数字前缀的其他测试, 但这没关系并且不会导致任何问题, 您以后不必更改它. +The name of the test starts with a five-digit prefix followed by a descriptive name, such as `00422_hash_function_constexpr.sql`. To choose the prefix, find the largest prefix already present in the directory, and increment it by one. In the meantime, some other tests might be added with the same numeric prefix, but this is OK and does not lead to any problems, you don't have to change it later. -一些测试的名称中标有 `zookeeper`、`shard` 或 `long` . `zookeeper` 用于使用 ZooKeeper 的测试. `shard` 用于需要服务器监听 `127.0.0.*` 的测试; `distributed` 或 `global` 具有相同的含义. `long` 用于运行时间稍长于一秒的测试. Yo你可以分别使用 `--no-zookeeper`、`--no-shard` 和 `--no-long` 选项禁用这些测试组. 如果需要 ZooKeeper 或分布式查询,请确保为您的测试名称添加适当的前缀. +Some tests are marked with `zookeeper`, `shard` or `long` in their names. `zookeeper` is for tests that are using ZooKeeper. `shard` is for tests that requires server to listen `127.0.0.*`; `distributed` or `global` have the same meaning. `long` is for tests that run slightly longer that one second. You can disable these groups of tests using `--no-zookeeper`, `--no-shard` and `--no-long` options, respectively. Make sure to add a proper prefix to your test name if it needs ZooKeeper or distributed queries. -### 检查必须发生的错误 {#checking-error-must-occur} +### Checking for an Error that Must Occur -有时您想测试是否因不正确的查询而发生服务器错误. 我们支持在 SQL 测试中对此进行特殊注释, 形式如下: +Sometimes you want to test that a server error occurs for an incorrect query. We support special annotations for this in SQL tests, in the following form: ``` select x; -- { serverError 49 } ``` -此测试确保服务器返回关于未知列“x”的错误代码为 49. 如果没有错误, 或者错误不同, 则测试失败. 如果您想确保错误发生在客户端, 请改用 `clientError` 注释. +This test ensures that the server returns an error with code 49 about unknown column `x`. If there is no error, or the error is different, the test will fail. If you want to ensure that an error occurs on the client side, use `clientError` annotation instead. -不要检查错误消息的特定措辞, 它将来可能会发生变化, 并且测试将不必要地中断. 只检查错误代码. 如果现有的错误代码不足以满足您的需求, 请考虑添加一个新的. +Do not check for a particular wording of error message, it may change in the future, and the test will needlessly break. Check only the error code. If the existing error code is not precise enough for your needs, consider adding a new one. -### 测试分布式查询 {#testing-distributed-query} +### Testing a Distributed Query -如果你想在功能测试中使用分布式查询, 你可以使用 `127.0.0.{1..2}` 的地址, 以便服务器查询自己; 或者您可以在服务器配置文件中使用预定义的测试集群, 例如`test_shard_localhost`. 请记住在测试名称中添加 `shard` 或 `distributed` 字样, 以便它以正确的配置在 CI 中运行, 其中服务器配置为支持分布式查询. +If you want to use distributed queries in functional tests, you can leverage `remote` table function with `127.0.0.{1..2}` addresses for the server to query itself; or you can use predefined test clusters in server configuration file like `test_shard_localhost`. 
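+For example, a minimal self-query sketch using the `remote` table function:
+
+```sql
+-- The server queries itself through the loopback addresses
+SELECT count() FROM remote('127.0.0.{1,2}', system.one);
+```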
Remember to add the words `shard` or `distributed` to the test name, so that it is run in CI in correct configurations, where the server is configured to support distributed queries. -## 已知错误 {#known-bugs} +## Known Bugs {#known-bugs} -如果我们知道一些可以通过功能测试轻松重现的错误, 我们将准备好的功能测试放在 `tests/queries/bugs` 目录中. 修复错误后, 这些测试将移至 `tests/queries/0_stateless` . +If we know some bugs that can be easily reproduced by functional tests, we place prepared functional tests in `tests/queries/bugs` directory. These tests will be moved to `tests/queries/0_stateless` when bugs are fixed. -## 集成测试 {#integration-tests} +## Integration Tests {#integration-tests} -集成测试允许在集群配置中测试 ClickHouse 以及 ClickHouse 与其他服务器(如 MySQL、Postgres、MongoDB)的交互. 它们可以用来模拟网络分裂、丢包等情况. 这些测试在Docker下运行, 并使用各种软件创建多个容器. +Integration tests allow testing ClickHouse in clustered configuration and ClickHouse interaction with other servers like MySQL, Postgres, MongoDB. They are useful to emulate network splits, packet drops, etc. These tests are run under Docker and create multiple containers with various software. -有关如何运行这些测试, 请参阅 `tests/integration/README.md` . +See `tests/integration/README.md` on how to run these tests. -注意, ClickHouse与第三方驱动程序的集成没有经过测试. 另外, 我们目前还没有JDBC和ODBC驱动程序的集成测试. +Note that integration of ClickHouse with third-party drivers is not tested. Also, we currently do not have integration tests with our JDBC and ODBC drivers. -## 单元测试 {#unit-tests} +## Unit Tests {#unit-tests} -当您想测试的不是 ClickHouse 整体, 而是单个独立库或类时,单元测试很有用. 您可以使用 `ENABLE_TESTS` CMake 选项启用或禁用测试构建. 单元测试(和其他测试程序)位于代码中的 `tests` 子目录中. 要运行单元测试, 请键入 `ninja test` 。有些测试使用 `gtest` , 但有些程序在测试失败时会返回非零退出码. +Unit tests are useful when you want to test not the ClickHouse as a whole, but a single isolated library or class. You can enable or disable build of tests with `ENABLE_TESTS` CMake option. Unit tests (and other test programs) are located in `tests` subdirectories across the code. To run unit tests, type `ninja test`. Some tests use `gtest`, but some are just programs that return non-zero exit code on test failure. -如果代码已经被功能测试覆盖了, 就没有必要进行单元测试(而且功能测试通常更易于使用). +It’s not necessary to have unit tests if the code is already covered by functional tests (and functional tests are usually much more simple to use). -例如, 您可以通过直接调用可执行文件来运行单独的 gtest 检查: +You can run individual gtest checks by calling the executable directly, for example: ```bash $ ./src/unit_tests_dbms --gtest_filter=LocalAddress* ``` -## 性能测试 {#performance-tests} +## Performance Tests {#performance-tests} -性能测试允许测量和比较 ClickHouse 的某些孤立部分在合成查询上的性能. 测试位于 `tests/performance`. 每个测试都由带有测试用例描述的 `.xml` 文件表示. 测试使用 `docker/tests/performance-comparison` 工具运行. 请参阅自述文件以进行调用. +Performance tests allow to measure and compare performance of some isolated part of ClickHouse on synthetic queries. Performance tests are located at `tests/performance/`. Each test is represented by an `.xml` file with a description of the test case. Tests are run with `docker/test/performance-comparison` tool . See the readme file for invocation. -每个测试在循环中运行一个或多个查询(可能带有参数组合). 一些测试可以包含预加载测试数据集的先决条件. +Each test run one or multiple queries (possibly with combinations of parameters) in a loop. -如果您希望在某些场景中提高ClickHouse的性能,并且如果可以在简单的查询中观察到改进,那么强烈建议编写性能测试。在测试期间使用 `perf top` 或其他perf工具总是有意义的. +If you want to improve performance of ClickHouse in some scenario, and if improvements can be observed on simple queries, it is highly recommended to write a performance test. 
Also, it is recommended to write performance tests when you add or modify SQL functions which are relatively isolated and not too obscure. It always makes sense to use `perf top` or other `perf` tools during your tests. -## 测试工具和脚本 {#test-tools-and-scripts} +## Test Tools and Scripts {#test-tools-and-scripts} - `tests` 目录中的一些程序不是准备好的测试,而是测试工具. 例如, 对于 `Lexer`, 有一个工具 `src/Parsers/tests/lexer` , 它只是对标准输入进行标记化并将着色结果写入标准输出. 您可以将这些类型的工具用作代码示例以及用于探索和手动测试. +Some programs in `tests` directory are not prepared tests, but are test tools. For example, for `Lexer` there is a tool `src/Parsers/tests/lexer` that just do tokenization of stdin and writes colorized result to stdout. You can use these kind of tools as a code examples and for exploration and manual testing. -## 其他测试 {#miscellaneous-tests} +## Miscellaneous Tests {#miscellaneous-tests} -在 `tests/external_models` 中有机器学习模型的测试. 这些测试不会更新, 必须转移到集成测试. +There are tests for machine learned models in `tests/external_models`. These tests are not updated and must be transferred to integration tests. -仲裁插入有单独的测试. 该测试在不同的服务器上运行 ClickHouse 集群并模拟各种故障情况:网络分裂、丢包(ClickHouse 节点之间、ClickHouse 和 ZooKeeper 之间、ClickHouse 服务器和客户端之间等)、`kill -9`、`kill -STOP` 和 `kill -CONT` , 比如 [Jepsen](https://aphyr.com/tags/Jepsen). 然后测试检查所有已确认的插入是否已写入并且所有被拒绝的插入均未写入. +There is separate test for quorum inserts. This test run ClickHouse cluster on separate servers and emulate various failure cases: network split, packet drop (between ClickHouse nodes, between ClickHouse and ZooKeeper, between ClickHouse server and client, etc.), `kill -9`, `kill -STOP` and `kill -CONT` , like [Jepsen](https://aphyr.com/tags/Jepsen). Then the test checks that all acknowledged inserts was written and all rejected inserts was not. -在 ClickHouse 开源之前, Quorum 测试是由单独的团队编写的. 这个团队不再与ClickHouse合作. 测试碰巧是用Java编写的. 由于这些原因, 必须重写仲裁测试并将其转移到集成测试. +Quorum test was written by separate team before ClickHouse was open-sourced. This team no longer work with ClickHouse. Test was accidentally written in Java. For these reasons, quorum test must be rewritten and moved to integration tests. -## 手动测试 {#manual-testing} +## Manual Testing {#manual-testing} -当您开发一个新特性时, 手动测试它也是合理的. 您可以按照以下步骤进行操作: +When you develop a new feature, it is reasonable to also test it manually. You can do it with the following steps: -构建 ClickHouse. 从终端运行 ClickHouse:将目录更改为 `programs/clickhouse-server` 并使用 `./clickhouse-server` 运行它. 默认情况下, 它将使用当前目录中的配置(`config.xml`、`users.xml` 和`config.d` 和`users.d` 目录中的文件). 要连接到 ClickHouse 服务器, 请运行 `programs/clickhouse-client/clickhouse-client` . +Build ClickHouse. Run ClickHouse from the terminal: change directory to `programs/clickhouse-server` and run it with `./clickhouse-server`. It will use configuration (`config.xml`, `users.xml` and files within `config.d` and `users.d` directories) from the current directory by default. To connect to ClickHouse server, run `programs/clickhouse-client/clickhouse-client`. -请注意, 所有 clickhouse 工具(服务器、客户端等)都只是指向名为 `clickhouse` 的单个二进制文件的符号链接. 你可以在 `programs/clickhouse` 找到这个二进制文件. 所有工具也可以作为 `clickhouse tool` 而不是 `clickhouse-tool` 调用. +Note that all clickhouse tools (server, client, etc) are just symlinks to a single binary named `clickhouse`. You can find this binary at `programs/clickhouse`. All tools can also be invoked as `clickhouse tool` instead of `clickhouse-tool`. -或者, 您可以安装 ClickHouse 包: 从 Yandex 存储库稳定发布, 或者您可以在 ClickHouse 源根目录中使用 `./release` 为自己构建包. 然后使用 `sudo service clickhouse-server start` 启动服务器(或停止以停止服务器). 在 `/etc/clickhouse-server/clickhouse-server.log` 中查找日志. 
+Alternatively you can install ClickHouse package: either stable release from ClickHouse repository or you can build package for yourself with `./release` in ClickHouse sources root. Then start the server with `sudo clickhouse start` (or stop to stop the server). Look for logs at `/etc/clickhouse-server/clickhouse-server.log`. -当您的系统上已经安装了 ClickHouse 时,您可以构建一个新的 `clickhouse` 二进制文件并替换现有的二进制文件: +When ClickHouse is already installed on your system, you can build a new `clickhouse` binary and replace the existing binary: ``` bash -$ sudo service clickhouse-server stop +$ sudo clickhouse stop $ sudo cp ./clickhouse /usr/bin/ -$ sudo service clickhouse-server start +$ sudo clickhouse start ``` -您也可以停止系统 clickhouse-server 并使用相同的配置运行您自己的服务器, 但登录到终端: +Also you can stop system clickhouse-server and run your own with the same configuration but with logging to terminal: ``` bash -$ sudo service clickhouse-server stop +$ sudo clickhouse stop $ sudo -u clickhouse /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` -使用 gdb 的示例: +Example with gdb: ``` bash $ sudo -u clickhouse gdb --args /usr/bin/clickhouse server --config-file /etc/clickhouse-server/config.xml ``` -如果系统 clickhouse-server 已经在运行并且你不想停止它, 你可以在你的 `config.xml` 中更改端口号(或在 `config.d` 目录中的文件中覆盖它们), 提供适当的数据路径, 并运行它. +If the system clickhouse-server is already running and you do not want to stop it, you can change port numbers in your `config.xml` (or override them in a file in `config.d` directory), provide appropriate data path, and run it. -`clickhouse` 二进制文件几乎没有依赖关系, 可以在广泛的 Linux 发行版中使用. 要在服务器上快速而肮脏地测试您的更改, 您可以简单地将新构建的 `clickhouse` 二进制文件 `scp` 到您的服务器, 然后按照上面的示例运行它. +`clickhouse` binary has almost no dependencies and works across wide range of Linux distributions. To quick and dirty test your changes on a server, you can simply `scp` your fresh built `clickhouse` binary to your server and then run it as in examples above. -## 测试环境 {#testing-environment} +## Build Tests {#build-tests} -在发布稳定版之前, 我们将其部署在测试环境中.测试环境是一个集群,处理 [Yandex.Metrica](https://metrica.yandex.com/) 数据的 1/39 部分. 我们与 Yandex.Metrica 团队共享我们的测试环境. ClickHouse无需在现有数据上停机即可升级. 我们首先看到的是, 数据被成功地处理了, 没有滞后于实时, 复制继续工作, Yandex.Metrica 团队没有发现任何问题. 第一次检查可以通过以下方式进行: +Build tests allow to check that build is not broken on various alternative configurations and on some foreign systems. These tests are automated as well. -``` sql -SELECT hostName() AS h, any(version()), any(uptime()), max(UTCEventTime), count() FROM remote('example01-01-{1..3}t', merge, hits) WHERE EventDate >= today() - 2 GROUP BY h ORDER BY h; -``` +Examples: +- cross-compile for Darwin x86_64 (Mac OS X) +- cross-compile for FreeBSD x86_64 +- cross-compile for Linux AArch64 +- build on Ubuntu with libraries from system packages (discouraged) +- build with shared linking of libraries (discouraged) -在某些情况下, 我们还会部署到 Yandex 中我们朋友团队的测试环境:Market、Cloud 等. 此外, 我们还有一些用于开发目的的硬件服务器. +For example, build with system packages is bad practice, because we cannot guarantee what exact version of packages a system will have. But this is really needed by Debian maintainers. For this reason we at least have to support this variant of build. Another example: shared linking is a common source of trouble, but it is needed for some enthusiasts. -## 负载测试 {#load-testing} +Though we cannot run all tests on all variant of builds, we want to check at least that various build variants are not broken. For this purpose we use build tests. -部署到测试环境后, 我们使用来自生产集群的查询运行负载测试. 这是手动完成的. 
+We also test that there are no translation units that are too long to compile or require too much RAM.
-确保您在生产集群上启用了 `query_log`.
+We also test that there are no overly large stack frames.
-收集一天或更长时间的查询日志:
+## Testing for Protocol Compatibility {#testing-for-protocol-compatibility}
-``` bash
-$ clickhouse-client --query="SELECT DISTINCT query FROM system.query_log WHERE event_date = today() AND query LIKE '%ym:%' AND query NOT LIKE '%system.query_log%' AND type = 2 AND is_initial_query" > queries.tsv
-```
+When we extend the ClickHouse network protocol, we test manually that the old clickhouse-client works with the new clickhouse-server and the new clickhouse-client works with the old clickhouse-server (simply by running binaries from the corresponding packages).
-这是一个复杂的例子. `type = 2` 将过滤成功执行的查询. `query LIKE '%ym:%'` 是从 Yandex.Metrica 中选择相关查询. `is_initial_query` 是只选择客户端发起的查询, 而不是 ClickHouse 本身(作为分布式查询处理的一部分).
+We also test some cases automatically with integration tests:
+- whether data written by an old version of ClickHouse can be successfully read by the new version;
+- whether distributed queries work in a cluster with different ClickHouse versions.
-`scp` 将此日志记录到您的测试集群并按如下方式运行它:
+## Help from the Compiler {#help-from-the-compiler}
-``` bash
-$ clickhouse benchmark --concurrency 16 < queries.tsv
-```
+The main ClickHouse code (located in the `dbms` directory) is built with `-Wall -Wextra -Werror` and with some additional warnings enabled. These options are not enabled for third-party libraries, though.
-(可能你还想指定一个 `--user`)
+Clang has even more useful warnings - you can look for them with `-Weverything` and pick some of them for the default build.
-然后把它留到晚上或周末, 去休息一下.
+For production builds, clang is used, but we also test gcc builds. For development, clang is usually more convenient to use. You can build on your own machine in debug mode (to save the battery of your laptop), but please note that the compiler is able to generate more warnings with `-O3` due to better control-flow and inter-procedural analysis. When building with clang in debug mode, the debug version of `libc++` is used, which allows catching more errors at runtime.
-您应该检查 `clickhouse-server` 没有崩溃, 内存占用是有限的, 且性能不会随着时间的推移而降低.
+## Sanitizers {#sanitizers}
-由于查询和环境的高度可变性, 没有记录和比较精确的查询执行时间.
+### Address sanitizer
+We run functional, integration, stress and unit tests under ASan on a per-commit basis.
-## 构建测试 {#build-tests}
+### Thread sanitizer
+We run functional, integration, stress and unit tests under TSan on a per-commit basis.
-构建测试允许检查在各种可选配置和一些外部系统上的构建是否被破坏. 这些测试也是自动化的.
+### Memory sanitizer
+We run functional, integration, stress and unit tests under MSan on a per-commit basis.
-示例:
-- Darwin x86_64 (Mac OS X) 交叉编译
-- FreeBSD x86_64 交叉编译
-- Linux AArch64 交叉编译
-- 使用系统包中的库在 Ubuntu 上构建(不鼓励)
-- 使用库的共享链接构建(不鼓励)
-
-例如, 使用系统包构建是不好的做法, 因为我们无法保证系统将拥有哪个确切版本的包. 但这确实是 Debian 维护者所需要的. 出于这个原因, 我们至少必须支持这种构建变体. 另一个例子: 共享链接是一个常见的麻烦来源, 但对于一些爱好者来说是需要的.
-
-虽然我们无法对所有构建变体运行所有测试, 但我们希望至少检查各种构建变体没有被破坏. 为此, 我们使用构建测试.
-
-我们还测试了那些太长而无法编译或需要太多RAM的没有翻译单元.
-
-我们还测试没有太大的堆栈帧.
-
-## 协议兼容性测试 {#testing-for-protocol-compatibility}
-
-当我们扩展 ClickHouse 网络协议时, 我们手动测试旧的 clickhouse-client 与新的 clickhouse-server 一起工作, 而新的 clickhouse-client 与旧的 clickhouse-server 一起工作(只需从相应的包中运行二进制文件).
-
-我们还使用集成测试自动测试一些案例:
-- 旧版本ClickHouse写入的数据是否可以被新版本成功读取;
-- 在具有不同 ClickHouse 版本的集群中执行分布式查询.
-
-## 编译器的帮助 {#help-from-the-compiler}
-
-主要的 ClickHouse 代码(位于 `dbms` 目录中)是用 `-Wall -Wextra -Werror` 和一些额外的启用警告构建的. 虽然没有为第三方库启用这些选项.
-
-Clang 有更多有用的警告 - 你可以用 `-Weverything` 寻找它们并选择一些东西来默认构建.
-
-对于生产构建, 使用 clang, 但我们也测试 make gcc 构建.
对于开发, clang 通常使用起来更方便. 您可以使用调试模式在自己的机器上构建(以节省笔记本电脑的电池), 但请注意, 由于更好的控制流和过程间分析, 编译器能够使用 `-O3` 生成更多警告. 在调试模式下使用 clang 构建时, 使用调试版本的 `libc++` 允许在运行时捕获更多错误.
-
-## 地址清理器 {#sanitizers}
-
-### 地址清理器
-我们在ASan上运行功能测试、集成测试、压力测试和单元测试.
-
-### 线程清理器
-我们在TSan下运行功能测试、集成测试、压力测试和单元测试.
-
-### 内存清理器
-我们在MSan上运行功能测试、集成测试、压力测试和单元测试.
-
-### 未定义的行为清理器
-我们在UBSan下运行功能测试、集成测试、压力测试和单元测试. 某些第三方库的代码未针对 UB 进行清理.
+### Undefined behaviour sanitizer
+We run functional, integration, stress and unit tests under UBSan on a per-commit basis. The code of some third-party libraries is not sanitized for UB.
### Valgrind (Memcheck)
-我们曾经在 Valgrind 下通宵运行功能测试, 但不再这样做了. 这需要几个小时. 目前在`re2`库中有一个已知的误报, 见[这篇文章](https://research.swtch.com/sparse).
+We used to run functional tests under Valgrind overnight, but don't do it anymore. It takes multiple hours. Currently there is one known false positive in the `re2` library, see [this article](https://research.swtch.com/sparse).
-## 模糊测试 {#fuzzing}
+## Fuzzing {#fuzzing}
-ClickHouse 模糊测试是使用 [libFuzzer](https://llvm.org/docs/LibFuzzer.html) 和随机 SQL 查询实现的. 所有模糊测试都应使用sanitizers(地址和未定义)进行.
+ClickHouse fuzzing is implemented using both [libFuzzer](https://llvm.org/docs/LibFuzzer.html) and random SQL queries.
+All the fuzz testing should be performed with sanitizers (Address and Undefined).
-LibFuzzer 用于库代码的隔离模糊测试. Fuzzer 作为测试代码的一部分实现, 并具有 `_fuzzer` 名称后缀.
-Fuzzer 示例可以在 `src/Parsers/tests/lexer_fuzzer.cpp` 中找到. LibFuzzer 特定的配置、字典和语料库存储在 `tests/fuzz` 中.
-我们鼓励您为处理用户输入的每个功能编写模糊测试.
+LibFuzzer is used for isolated fuzz testing of library code. Fuzzers are implemented as part of the test code and have “_fuzzer” name suffixes.
+A fuzzer example can be found at `src/Parsers/fuzzers/lexer_fuzzer.cpp`. LibFuzzer-specific configs, dictionaries and the corpus are stored at `tests/fuzz`.
+We encourage you to write fuzz tests for every functionality that handles user input.
-默认情况下不构建模糊器. 要构建模糊器, 应设置` -DENABLE_FUZZING=1` 和 `-DENABLE_TESTS=1` 选项.
-我们建议在构建模糊器时禁用 Jemalloc. 用于将 ClickHouse fuzzing 集成到 Google OSS-Fuzz 的配置可以在 `docker/fuzz` 中找到.
+Fuzzers are not built by default. To build fuzzers, both the `-DENABLE_FUZZING=1` and `-DENABLE_TESTS=1` options should be set.
+We recommend disabling Jemalloc while building fuzzers. The configuration used to integrate ClickHouse fuzzing into
+Google OSS-Fuzz can be found at `docker/fuzz`.
-我们还使用简单的模糊测试来生成随机SQL查询, 并检查服务器在执行这些查询时是否会死亡.
-你可以在 `00746_sql_fuzzy.pl` 中找到它. 这个测试应该连续运行(通宵或更长时间).
+We also use a simple fuzz test to generate random SQL queries and to check that the server does not die while executing them.
+You can find it in `00746_sql_fuzzy.pl`. This test should be run continuously (overnight and longer).
-我们还使用复杂的基于 AST 的查询模糊器, 它能够找到大量的极端情况. 它在查询 AST 中进行随机排列和替换. 它会记住先前测试中的 AST 节点, 以使用它们对后续测试进行模糊测试, 同时以随机顺序处理它们. 您可以在 [这篇博客文章](https://clickhouse.com/blog/en/2021/fuzzing-clickhouse/) 中了解有关此模糊器的更多信息.
+We also use a sophisticated AST-based query fuzzer that is able to find a huge number of corner cases. It does random permutations and substitutions in the query AST. It remembers AST nodes from previous tests to use them for fuzzing of subsequent tests while processing them in random order. You can learn more about this fuzzer in [this blog article](https://clickhouse.com/blog/en/2021/fuzzing-clickhouse/).
-## 压力测试 {#stress-test}
+## Stress test {#stress-test}
-压力测试是另一种模糊测试. 它使用单个服务器以随机顺序并行运行所有功能测试. 不检查测试结果.
+Stress tests are another case of fuzzing. They run all functional tests in parallel in random order against a single server. Results of the tests are not checked.
-经检查:
- 服务器不会崩溃,不会触发调试或清理程序陷阱;
- 没有死锁;
- 数据库结构一致;
- 服务器可以在测试后成功停止并重新启动,没有异常;
+It is checked that:
+- the server does not crash and no debug or sanitizer traps are triggered;
+- there are no deadlocks;
+- the database structure is consistent;
+- the server can successfully stop after the test and start again without exceptions.
-有五种变体 (Debug, ASan, TSan, MSan, UBSan).
+There are five variants (Debug, ASan, TSan, MSan, UBSan).
-## 线程模糊器 {#thread-fuzzer}
+## Thread Fuzzer {#thread-fuzzer}
-Thread Fuzzer(请不要与 Thread Sanitizer 混淆)是另一种允许随机化线程执行顺序的模糊测试. 它有助于找到更多特殊情况.
+Thread Fuzzer (not to be confused with Thread Sanitizer) is another kind of fuzzing that randomizes the thread order of execution. It helps to find even more special cases.
-## 安全审计 {#security-audit}
+## Security Audit {#security-audit}
-Yandex安全团队的人员从安全的角度对ClickHouse的功能做了一些基本的概述.
+Our Security Team did a basic overview of ClickHouse capabilities from the security standpoint.
-## 静态分析仪 {#static-analyzers}
+## Static Analyzers {#static-analyzers}
-我们在每次提交的基础上运行 `clang-tidy`. `clang-static-analyzer` 检查也被启用. `clang-tidy` 也用于一些样式检查.
+We run `clang-tidy` on a per-commit basis. `clang-static-analyzer` checks are also enabled. `clang-tidy` is also used for some style checks.
-我们已经评估了 `clang-tidy`、`Coverity`、`cppcheck`、`PVS-Studio`、`tscancode`、`CodeQL`. 您将在 `tests/instructions/` 目录中找到使用说明. 你也可以阅读[俄文文章](https://habr.com/company/yandex/blog/342018/).
+We have evaluated `clang-tidy`, `Coverity`, `cppcheck`, `PVS-Studio`, `tscancode`, `CodeQL`. You will find instructions for usage in the `tests/instructions/` directory.
-如果你使用 `CLion` 作为 IDE, 你可以利用一些开箱即用的 `clang-tidy` 检查
+If you use `CLion` as an IDE, you can leverage some `clang-tidy` checks out of the box.
-我们还使用 `shellcheck` 对shell脚本进行静态分析.
+We also use `shellcheck` for static analysis of shell scripts.
-## 硬化 {#hardening}
+## Hardening {#hardening}
-在调试版本中, 我们使用自定义分配器执行用户级分配的 ASLR.
+In debug builds we use a custom allocator that does ASLR of user-level allocations.
-我们还手动保护在分配后预期为只读的内存区域.
+We also manually protect memory regions that are expected to be readonly after allocation.
-在调试构建中, 我们还需要对libc进行自定义, 以确保不会调用 "有害的" (过时的、不安全的、非线程安全的)函数.
+In debug builds we also use a customization of libc that ensures that no "harmful" (obsolete, insecure, not thread-safe) functions are called.
-Debug 断言被广泛使用.
+Debug assertions are used extensively.
-在调试版本中,如果抛出带有 "逻辑错误" 代码(暗示错误)的异常, 则程序会过早终止. 它允许在发布版本中使用异常, 但在调试版本中使其成为断言.
+In debug builds, if an exception with a "logical error" code (which implies a bug) is thrown, the program is terminated prematurely. This allows using exceptions in release builds while turning them into assertions in debug builds.
-jemalloc 的调试版本用于调试版本.
-libc++ 的调试版本用于调试版本.
+The debug version of jemalloc is used for debug builds.
+The debug version of libc++ is used for debug builds.
-## 运行时完整性检查
+## Runtime Integrity Checks
-对存储在磁盘上的数据是校验和. MergeTree 表中的数据同时以三种方式进行校验和*(压缩数据块、未压缩数据块、跨块的总校验和). 客户端和服务器之间或服务器之间通过网络传输的数据也会进行校验和. 复制确保副本上的数据位相同.
+Data stored on disk is checksummed. Data in MergeTree tables is checksummed in three ways simultaneously* (compressed data blocks, uncompressed data blocks, the total checksum across blocks). Data transferred over the network between client and server or between servers is also checksummed. Replication ensures bit-identical data on replicas.
-需要防止硬件故障(存储介质上的位腐烂、服务器上 RAM 中的位翻转、网络控制器 RAM 中的位翻转、网络交换机 RAM 中的位翻转、客户端 RAM 中的位翻转、线路上的位翻转). 请注意,比特位操作很常见, 即使对于 ECC RAM 和 TCP 校验和(如果您每天设法运行数千台处理 PB 数据的服务器, 也可能发生比特位操作. [观看视频(俄语)](https://www.youtube.com/watch?v=ooBAQIe0KlQ).
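From the user's side, these integrity checks can also be exercised manually. The sketch below is hedged: it presumes a locally running server and a MergeTree table named `default.hits_test`, which is made up for the example, and `system.parts` column names may vary between versions. `CHECK TABLE` asks the server to verify the data against the stored checksums.

``` bash
# Hedged illustration of the runtime integrity checks described above.
# The table name is hypothetical; adjust it to an existing MergeTree table.
clickhouse-client --query "CHECK TABLE default.hits_test"
# system.parts exposes per-part checksum-related metadata maintained by the server.
clickhouse-client --query "SELECT name, rows, hash_of_all_files FROM system.parts WHERE table = 'hits_test' AND active"
```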
+It is required to protect from faulty hardware (bit rot on storage media, bit flips in RAM on the server, bit flips in RAM of the network controller, bit flips in RAM of the network switch, bit flips in RAM of the client, bit flips on the wire). Note that bit flips are common and likely to occur even for ECC RAM and in the presence of TCP checksums (if you manage to run thousands of servers processing petabytes of data each day). [See the video (Russian)](https://www.youtube.com/watch?v=ooBAQIe0KlQ).
-ClickHouse 提供诊断功能, 可帮助运维工程师找到故障硬件.
+ClickHouse provides diagnostics that will help ops engineers find faulty hardware.
-\* 它并不慢.
+\* and it is not slow.
-## 代码风格 {#code-style}
+## Code Style {#code-style}
-[此处](style.md)描述了代码样式规则.
+Code style rules are described [here](style.md).
-要检查一些常见的样式违规,您可以使用 `utils/check-style` 脚本.
+To check for some common style violations, you can use the `utils/check-style` script.
-要强制使用正确的代码样式, 您可以使用 `clang-format`. 文件 `.clang-format` 位于源根目录. 它大多与我们的实际代码风格相对应. 但是不建议将 `clang-format` 应用于现有文件, 因为它会使格式变得更糟. 您可以使用可以在 clang 源代码库中找到的 `clang-format-diff` 工具.
+To enforce the proper style of your code, you can use `clang-format`. The file `.clang-format` is located at the sources root. It mostly corresponds to our actual code style. But it’s not recommended to apply `clang-format` to existing files because it makes formatting worse. You can use the `clang-format-diff` tool that you can find in the clang source repository.
-或者, 您可以尝试使用 `uncrustify` 工具来重新格式化您的代码. 配置位于源根目录中的 `uncrustify.cfg` 中. 它比 `clang-format` 测试更少.
+Alternatively, you can try the `uncrustify` tool to reformat your code. The configuration is in `uncrustify.cfg` in the sources root. It is less tested than `clang-format`.
-`CLion` 有自己的代码格式化程序, 必须根据我们的代码风格进行调整.
+`CLion` has its own code formatter that has to be tuned for our code style.
-我们还使用 `codespell` 来查找代码中的拼写错误.它也是自动化的.
+We also use `codespell` to find typos in code. It is automated as well.
-## Metrica B2B 测试 {#metrica-b2b-tests}
+## Test Coverage {#test-coverage}
-每个 ClickHouse 版本都使用 Yandex Metrica 和 AppMetrica 引擎进行测试. ClickHouse 的测试版和稳定版部署在 VM 上, 并使用 Metrica 引擎的小副本运行, 该引擎处理输入数据的固定样本. 然后将两个 Metrica 引擎实例的结果放在一起比较.
-
-这些测试由单独的团队自动化. 由于移动部件数量众多, 测试在大多数情况下都因完全不相关的原因而失败, 这些原因很难弄清楚. 这些测试很可能对我们有负面价值. 尽管如此, 这些测试在数百次中被证明是有用的.
-
-## 测试覆盖率 {#test-coverage}
-
-我们还跟踪测试覆盖率, 但仅针对功能测试和 clickhouse-server. 它每天进行.
+We also track test coverage but only for functional tests and only for clickhouse-server. It is performed on a daily basis.
## Tests for Tests
-有自动检测薄片测试. 它运行所有新测试100次(用于功能测试)或10次(用于集成测试). 如果至少有一次测试失败,它就被认为是脆弱的.
+There is an automated check for flaky tests. It runs all new tests 100 times (for functional tests) or 10 times (for integration tests). If the test failed at least once, it is considered flaky.
## Testflows
-[Testflows](https://testflows.com/) 是一个企业级的测试框架. Altinity 使用它进行一些测试, 我们在 CI 中运行这些测试.
+[Testflows](https://testflows.com/) is an enterprise-grade open-source testing framework, which is used to test a subset of ClickHouse.
-## Yandex 检查 (only for Yandex employees)
+## Test Automation {#test-automation}
-这些检查将ClickHouse代码导入到Yandex内部的单一存储库中, 所以ClickHouse代码库可以被Yandex的其他产品(YT和YDB)用作库. 请注意, clickhouse-server本身并不是由内部回购构建的, Yandex应用程序使用的是未经修改的开源构建的.
+We run tests with [GitHub Actions](https://github.com/features/actions).
-## 测试自动化 {#test-automation}
+Build jobs and tests are run in Sandbox on a per-commit basis. Resulting packages and test results are published in GitHub and can be downloaded via direct links. Artifacts are stored for several months.
When you send a pull request on GitHub, we tag it as “can be tested” and our CI system will build ClickHouse packages (release, debug, with address sanitizer, etc) for you. -我们使用 Yandex 内部 CI 和名为 "Sandbox" 的作业自动化系统运行测试. +We do not use Travis CI due to the limit on time and computational power. +We do not use Jenkins. It was used before and now we are happy we are not using Jenkins. -在每次提交的基础上, 构建作业和测试都在沙箱中运行. 生成的包和测试结果发布在GitHub上, 可以通过直接链接下载. 产物要保存几个月. 当你在GitHub上发送一个pull请求时, 我们会把它标记为 "可以测试" , 我们的CI系统会为你构建ClickHouse包(发布、调试、使用地址清理器等). - -由于时间和计算能力的限制, 我们不使用 Travis CI. -我们不用Jenkins. 以前用过, 现在我们很高兴不用Jenkins了. - -[原始文章](https://clickhouse.com/docs/en/development/tests/) +[Original article](https://clickhouse.com/docs/en/development/tests/) diff --git a/docs/zh/sql-reference/data-types/lowcardinality.md b/docs/zh/sql-reference/data-types/lowcardinality.md index e089a7f9d41..717c3c979a4 100644 --- a/docs/zh/sql-reference/data-types/lowcardinality.md +++ b/docs/zh/sql-reference/data-types/lowcardinality.md @@ -55,6 +55,5 @@ ORDER BY id ## 参考 -- [高效低基数类型](https://www.altinity.com/blog/2019/3/27/low-cardinality). - [使用低基数类型减少ClickHouse的存储成本 – 来自Instana工程师的分享](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/). -- [字符优化 (俄语视频分享)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [英语分享](https://github.com/ClickHouse/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf). \ No newline at end of file +- [字符优化 (俄语视频分享)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [英语分享](https://github.com/ClickHouse/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf). diff --git a/docs/zh/sql-reference/statements/create.md b/docs/zh/sql-reference/statements/create.md index aef21a704b5..af77f4750b5 100644 --- a/docs/zh/sql-reference/statements/create.md +++ b/docs/zh/sql-reference/statements/create.md @@ -121,8 +121,6 @@ ENGINE = ... ``` -如果指定了编解ec,则默认编解码器不适用。 编解码器可以组合在一个流水线中,例如, `CODEC(Delta, ZSTD)`. 要为您的项目选择最佳的编解码器组合,请通过类似于Altinity中描述的基准测试 [新编码提高ClickHouse效率](https://www.altinity.com/blog/2019/7/new-encodings-to-improve-clickhouse) 文章. - !!! warning "警告" 您无法使用外部实用程序解压缩ClickHouse数据库文件,如 `lz4`. 相反,使用特殊的 [ツ环板compressorョツ嘉ッツ偲](https://github.com/ClickHouse/ClickHouse/tree/master/programs/compressor) 实用程序。 diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index e27845de184..6506c23428a 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -723,7 +723,7 @@ bool Client::processWithFuzzing(const String & full_query) // queries, for lack of a better solution. // There is also a problem that fuzzer substitutes positive Int64 // literals or Decimal literals, which are then parsed back as - // UInt64, and suddenly duplicate alias substitition starts or stops + // UInt64, and suddenly duplicate alias substitution starts or stops // working (ASTWithAlias::formatImpl) or something like that. // So we compare not even the first and second formatting of the // query, but second and third. 
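The next hunk adds a new `mkdir` command to the `clickhouse-disks` utility. A hedged usage sketch follows, with the command and option names taken from `CommandMkDir` below; how the tool is launched and how the target disk is selected are assumptions and may differ.

``` bash
# Hypothetical invocation of the command introduced below: "mkdir" and "--recursive"
# come from CommandMkDir, and the --disk selection is assumed from the
# config.getString("disk", "default") lookup in the same file.
clickhouse-disks --disk default mkdir --recursive data/new_directory
```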
diff --git a/programs/disks/CommandMkDir.cpp b/programs/disks/CommandMkDir.cpp new file mode 100644 index 00000000000..11a940028a3 --- /dev/null +++ b/programs/disks/CommandMkDir.cpp @@ -0,0 +1,67 @@ +#pragma once + +#include "ICommand.h" +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + +class CommandMkDir : public ICommand +{ +public: + CommandMkDir() + { + command_name = "mkdir"; + command_option_description.emplace(createOptionsDescription("Allowed options", getTerminalWidth())); + description = "Create directory or directories recursively"; + usage = "mkdir [OPTION]... "; + command_option_description->add_options() + ("recursive", "recursively create directories") + ; + } + + void processOptions( + Poco::Util::LayeredConfiguration & config, + po::variables_map & options) const override + { + if (options.count("recursive")) + config.setBool("recursive", true); + } + + void execute( + const std::vector & command_arguments, + DB::ContextMutablePtr & global_context, + Poco::Util::LayeredConfiguration & config) override + { + if (command_arguments.size() != 1) + { + printHelpMessage(); + throw DB::Exception("Bad Arguments", DB::ErrorCodes::BAD_ARGUMENTS); + } + + String disk_name = config.getString("disk", "default"); + + String path = command_arguments[0]; + + DiskPtr disk = global_context->getDisk(disk_name); + + String full_path = fullPathWithValidate(disk, path); + bool recursive = config.getBool("recursive", false); + + if (recursive) + disk->createDirectories(full_path); + else + disk->createDirectory(full_path); + } +}; +} + +std::unique_ptr makeCommandMkDir() +{ + return std::make_unique(); +} diff --git a/programs/disks/DisksApp.cpp b/programs/disks/DisksApp.cpp index 08768386808..b662921a3b1 100644 --- a/programs/disks/DisksApp.cpp +++ b/programs/disks/DisksApp.cpp @@ -63,7 +63,7 @@ void DisksApp::addOptions( positional_options_description.add("command_name", 1); - supported_commands = {"list-disks", "list", "move", "remove", "link", "copy", "write", "read"}; + supported_commands = {"list-disks", "list", "move", "remove", "link", "copy", "write", "read", "mkdir"}; command_descriptions.emplace("list-disks", makeCommandListDisks()); command_descriptions.emplace("list", makeCommandList()); @@ -73,6 +73,7 @@ void DisksApp::addOptions( command_descriptions.emplace("copy", makeCommandCopy()); command_descriptions.emplace("write", makeCommandWrite()); command_descriptions.emplace("read", makeCommandRead()); + command_descriptions.emplace("mkdir", makeCommandMkDir()); } void DisksApp::processOptions() diff --git a/programs/disks/DisksApp.h b/programs/disks/DisksApp.h index cbb3a7dfcc9..24fa9c3d9eb 100644 --- a/programs/disks/DisksApp.h +++ b/programs/disks/DisksApp.h @@ -4,6 +4,7 @@ #include "CommandLink.cpp" #include "CommandList.cpp" #include "CommandListDisks.cpp" +#include "CommandMkDir.cpp" #include "CommandMove.cpp" #include "CommandRead.cpp" #include "CommandRemove.cpp" diff --git a/programs/disks/ICommand.h b/programs/disks/ICommand.h index 9cde55dbb6b..f57f74a880e 100644 --- a/programs/disks/ICommand.h +++ b/programs/disks/ICommand.h @@ -65,3 +65,4 @@ std::unique_ptr makeCommandMove(); std::unique_ptr makeCommandRead(); std::unique_ptr makeCommandRemove(); std::unique_ptr makeCommandWrite(); +std::unique_ptr makeCommandMkDir(); diff --git a/programs/git-import/git-import.cpp b/programs/git-import/git-import.cpp index 54f5d7f75ea..030ddd263fa 100644 --- a/programs/git-import/git-import.cpp +++ b/programs/git-import/git-import.cpp @@ 
-67,7 +67,7 @@ Run this tool inside your git repository. It will create .tsv files that can be The tool can process large enough repositories in a reasonable time. It has been tested on: - ClickHouse: 31 seconds; 3 million rows; -- LLVM: 8 minues; 62 million rows; +- LLVM: 8 minutes; 62 million rows; - Linux - 12 minutes; 85 million rows; - Chromium - 67 minutes; 343 million rows; (the numbers as of Sep 2020) diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 6891d2113a5..95bf89b0255 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -1239,7 +1241,6 @@ try if (options.count("help") || !options.count("seed") - || !options.count("structure") || !options.count("input-format") || !options.count("output-format")) { @@ -1259,7 +1260,11 @@ try UInt64 seed = sipHash64(options["seed"].as()); - std::string structure = options["structure"].as(); + std::string structure; + + if (options.count("structure")) + structure = options["structure"].as(); + std::string input_format = options["input-format"].as(); std::string output_format = options["output-format"].as(); @@ -1287,32 +1292,51 @@ try markov_model_params.determinator_sliding_window_size = options["determinator-sliding-window-size"].as(); /// Create the header block - std::vector structure_vals; - boost::split(structure_vals, structure, boost::algorithm::is_any_of(" ,"), boost::algorithm::token_compress_on); - - if (structure_vals.size() % 2 != 0) - throw Exception("Odd number of elements in section structure: must be a list of name type pairs", ErrorCodes::LOGICAL_ERROR); + SharedContextHolder shared_context = Context::createShared(); + auto context = Context::createGlobal(shared_context.get()); + auto context_const = WithContext(context).getContext(); + context->makeGlobalContext(); Block header; - const DataTypeFactory & data_type_factory = DataTypeFactory::instance(); - for (size_t i = 0, size = structure_vals.size(); i < size; i += 2) + ColumnsDescription schema_columns; + + if (structure.empty()) + { + ReadBufferIterator read_buffer_iterator = [&](ColumnsDescription &) + { + auto file = std::make_unique(STDIN_FILENO); + + /// stdin must be seekable + auto res = lseek(file->getFD(), 0, SEEK_SET); + if (-1 == res) + throwFromErrno("Input must be seekable file (it will be read twice).", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); + + return file; + }; + + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const); + } + else + { + schema_columns = parseColumnsListFromString(structure, context_const); + } + + auto schema_columns_info = schema_columns.getOrdinary(); + + for (auto & info : schema_columns_info) { ColumnWithTypeAndName column; - column.name = structure_vals[i]; - column.type = data_type_factory.get(structure_vals[i + 1]); + column.name = info.name; + column.type = info.type; column.column = column.type->createColumn(); header.insert(std::move(column)); } - SharedContextHolder shared_context = Context::createShared(); - auto context = Context::createGlobal(shared_context.get()); - context->makeGlobalContext(); - ReadBufferFromFileDescriptor file_in(STDIN_FILENO); WriteBufferFromFileDescriptor file_out(STDOUT_FILENO); - if (load_from_file.empty()) + if (load_from_file.empty() || structure.empty()) { /// stdin must be seekable auto res = lseek(file_in.getFD(), 
0, SEEK_SET); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index d788270ecf9..93df877ab8e 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1036,7 +1036,7 @@ int Server::main(const std::vector & /*args*/) try { LOG_DEBUG( - log, "Initiailizing merge tree metadata cache lru_cache_size:{} continue_if_corrupted:{}", size, continue_if_corrupted); + log, "Initializing merge tree metadata cache lru_cache_size:{} continue_if_corrupted:{}", size, continue_if_corrupted); global_context->initializeMergeTreeMetadataCache(path_str + "/" + "rocksdb", size); } catch (...) @@ -1089,7 +1089,7 @@ int Server::main(const std::vector & /*args*/) } } - LOG_DEBUG(log, "Initiailizing interserver credentials."); + LOG_DEBUG(log, "Initializing interserver credentials."); global_context->updateInterserverCredentials(config()); if (config().has("macros")) diff --git a/src/Access/AccessControl.cpp b/src/Access/AccessControl.cpp index 4f62c4ca203..89292fe9272 100644 --- a/src/Access/AccessControl.cpp +++ b/src/Access/AccessControl.cpp @@ -79,7 +79,7 @@ public: /// No user, probably the user has been dropped while it was in the cache. cache.remove(params); } - auto res = ContextAccess::make(access_control, params); + auto res = std::make_shared(access_control, params); res->initialize(); cache.add(params, res); return res; diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 59c0a692a84..4e409946666 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -410,7 +410,7 @@ std::shared_ptr ContextAccess::getFullAccess() { static const std::shared_ptr res = [] { - auto full_access = ContextAccess::make(); + auto full_access = std::make_shared(); full_access->is_full_access = true; full_access->access = std::make_shared(AccessRights::getFullAccess()); full_access->access_with_implicit = full_access->access; diff --git a/src/Access/ContextAccess.h b/src/Access/ContextAccess.h index 331b449125e..ce1ea2d1220 100644 --- a/src/Access/ContextAccess.h +++ b/src/Access/ContextAccess.h @@ -166,12 +166,6 @@ public: /// without any limitations. This is used for the global context. static std::shared_ptr getFullAccess(); - template - static std::shared_ptr make(Args &&... 
args) - { - return std::make_shared(std::forward(args)...); - } - ~ContextAccess(); private: diff --git a/src/AggregateFunctions/ThetaSketchData.h b/src/AggregateFunctions/ThetaSketchData.h index f46836ad189..cd17719a45a 100644 --- a/src/AggregateFunctions/ThetaSketchData.h +++ b/src/AggregateFunctions/ThetaSketchData.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include namespace DB @@ -80,6 +82,58 @@ public: u->update(rhs.sk_union->get_result()); } + void intersect(const ThetaSketchData & rhs) + { + datasketches::theta_union * u = getSkUnion(); + + if (sk_update) + { + u->update(*sk_update); + sk_update.reset(nullptr); + } + + datasketches::theta_intersection theta_intersection; + + theta_intersection.update(u->get_result()); + + if (rhs.sk_update) + theta_intersection.update(*rhs.sk_update); + else if (rhs.sk_union) + theta_intersection.update(rhs.sk_union->get_result()); + + sk_union.reset(nullptr); + u = getSkUnion(); + u->update(theta_intersection.get_result()); + } + + void aNotB(const ThetaSketchData & rhs) + { + datasketches::theta_union * u = getSkUnion(); + + if (sk_update) + { + u->update(*sk_update); + sk_update.reset(nullptr); + } + + datasketches::theta_a_not_b a_not_b; + + if (rhs.sk_update) + { + datasketches::compact_theta_sketch result = a_not_b.compute(u->get_result(), *rhs.sk_update); + sk_union.reset(nullptr); + u = getSkUnion(); + u->update(result); + } + else if (rhs.sk_union) + { + datasketches::compact_theta_sketch result = a_not_b.compute(u->get_result(), rhs.sk_union->get_result()); + sk_union.reset(nullptr); + u = getSkUnion(); + u->update(result); + } + } + /// You can only call for an empty object. void read(DB::ReadBuffer & in) { diff --git a/src/Backups/BackupEntryWrappedWith.h b/src/Backups/BackupEntryWrappedWith.h new file mode 100644 index 00000000000..97244650b6b --- /dev/null +++ b/src/Backups/BackupEntryWrappedWith.h @@ -0,0 +1,37 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Wraps another backup entry and a value of any type. 
+template +class BackupEntryWrappedWith : public IBackupEntry +{ +public: + BackupEntryWrappedWith(BackupEntryPtr entry_, const T & custom_value_) : entry(entry_), custom_value(custom_value_) { } + BackupEntryWrappedWith(BackupEntryPtr entry_, T && custom_value_) : entry(entry_), custom_value(std::move(custom_value_)) { } + ~BackupEntryWrappedWith() override = default; + + UInt64 getSize() const override { return entry->getSize(); } + std::optional getChecksum() const override { return entry->getChecksum(); } + std::unique_ptr getReadBuffer() const override { return entry->getReadBuffer(); } + String getFilePath() const override { return entry->getFilePath(); } + DiskPtr tryGetDiskIfExists() const override { return entry->tryGetDiskIfExists(); } + DataSourceDescription getDataSourceDescription() const override { return entry->getDataSourceDescription(); } + +private: + BackupEntryPtr entry; + T custom_value; +}; + +template +void wrapBackupEntriesWith(std::vector> & backup_entries, const T & custom_value) +{ + for (auto & [_, backup_entry] : backup_entries) + backup_entry = std::make_shared>(std::move(backup_entry), custom_value); +} + +} diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index f6442545f48..ffd20e02dd3 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -537,7 +537,7 @@ SizeAndChecksum BackupImpl::getFileSizeAndChecksum(const String & file_name) con if (!info) throw Exception( ErrorCodes::BACKUP_ENTRY_NOT_FOUND, "Backup {}: Entry {} not found in the backup", backup_name, quoteString(file_name)); - return std::pair(info->size, info->checksum); + return {info->size, info->checksum}; } BackupEntryPtr BackupImpl::readFile(const String & file_name) const @@ -625,7 +625,7 @@ CheckBackupResult checkBaseBackupForFile(const SizeAndChecksum & base_backup_inf { /// We cannot reuse base backup because our file is smaller /// than file stored in previous backup - if (new_entry_info.size > base_backup_info.first) + if (new_entry_info.size < base_backup_info.first) return CheckBackupResult::HasNothing; if (base_backup_info.first == new_entry_info.size) @@ -682,8 +682,6 @@ ChecksumsForNewEntry calculateNewEntryChecksumsIfNeeded(BackupEntryPtr entry, si void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) { - - std::lock_guard lock{mutex}; if (open_mode != OpenMode::WRITE) throw Exception("Backup is not opened for writing", ErrorCodes::LOGICAL_ERROR); @@ -802,7 +800,12 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) /// or have only prefix of it in previous backup. Let's go long path. info.data_file_name = info.file_name; - info.archive_suffix = current_archive_suffix; + + if (use_archives) + { + std::lock_guard lock{mutex}; + info.archive_suffix = current_archive_suffix; + } bool is_data_file_required; coordination->addFileInfo(info, is_data_file_required); @@ -818,9 +821,11 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) /// if source and destination are compatible if (!use_archives && info.base_size == 0 && writer->supportNativeCopy(reader_description)) { - + /// Should be much faster than writing data through server. LOG_TRACE(log, "Will copy file {} using native copy", adjusted_path); - /// Should be much faster than writing data through server + + /// NOTE: `mutex` must be unlocked here otherwise writing will be in one thread maximum and hence slow. 
+ writer->copyFileNative(entry->tryGetDiskIfExists(), entry->getFilePath(), info.data_file_name); } else @@ -838,6 +843,11 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) if (use_archives) { LOG_TRACE(log, "Adding file {} to archive", adjusted_path); + + /// An archive must be written strictly in one thread, so it's correct to lock the mutex for all the time we're writing the file + /// to the archive. + std::lock_guard lock{mutex}; + String archive_suffix = current_archive_suffix; bool next_suffix = false; if (current_archive_suffix.empty() && is_internal_backup) @@ -859,6 +869,7 @@ void BackupImpl::writeFile(const String & file_name, BackupEntryPtr entry) } else { + /// NOTE: `mutex` must be unlocked here otherwise writing will be in one thread maximum and hence slow. writer->copyFileThroughBuffer(std::move(read_buffer), info.data_file_name); } } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index 525aec2fcd6..e539239d3ef 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -130,7 +130,7 @@ private: std::pair> archive_writers[2]; String current_archive_suffix; String lock_file_name; - size_t num_files_written = 0; + std::atomic num_files_written = 0; bool writing_finalized = false; const Poco::Logger * log; }; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fd8771c1529..3dc42746d67 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -42,6 +42,14 @@ endif () # See `src/Common/TargetSpecific.h` option(ENABLE_MULTITARGET_CODE "Enable platform-dependent code" ON) +if (NO_SSE3_OR_HIGHER) + # Optimized x86 code in DECLARE_*_SPECIFIC_CODE blocks (see `src/Common/TargetSpecific.h`) is sometimes marked FORCE_INLINE. As a + # result, its instruction set requirements (e.g. SSE4.2) leak into generic code. This is normally not a problem for standard x86 builds + # because generic code is compiled with SSE 4.2 anyways. But it breaks SSE2-only builds. Therefore disabling the multitarget code + # machinery and always use generic code. (The cleaner alternative is removing FORCE_INLINE but that impacts performance too much.) 
+ set(ENABLE_MULTITARGET_CODE OFF) +endif() + if (ENABLE_MULTITARGET_CODE) add_definitions(-DENABLE_MULTITARGET_CODE=1) else() diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 5047f958a10..7a663195655 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -24,6 +24,7 @@ #include #include "Core/Block.h" #include +#include #include #include #include @@ -483,6 +484,22 @@ void Connection::sendQuery( bool with_pending_data, std::function) { + OpenTelemetry::SpanHolder span("Connection::sendQuery()"); + span.addAttribute("clickhouse.query_id", query_id_); + span.addAttribute("clickhouse.query", query); + span.addAttribute("target", [this] () { return this->getHost() + ":" + std::to_string(this->getPort()); }); + + ClientInfo new_client_info; + const auto ¤t_trace_context = OpenTelemetry::CurrentContext(); + if (client_info && current_trace_context.isTraceEnabled()) + { + // use current span as the parent of remote span + new_client_info = *client_info; + new_client_info.client_trace_context = current_trace_context; + + client_info = &new_client_info; + } + if (!connected) connect(timeouts); @@ -540,7 +557,7 @@ void Connection::sendQuery( /// Send correct hash only for !INITIAL_QUERY, due to: /// - this will avoid extra protocol complexity for simplest cases /// - there is no need in hash for the INITIAL_QUERY anyway - /// (since there is no secure/unsecure changes) + /// (since there is no secure/non-secure changes) if (client_info && !cluster_secret.empty() && client_info->query_kind != ClientInfo::QueryKind::INITIAL_QUERY) { #if USE_SSL diff --git a/src/Client/ConnectionEstablisher.cpp b/src/Client/ConnectionEstablisher.cpp index 3ad9f6ba95c..757927d70bc 100644 --- a/src/Client/ConnectionEstablisher.cpp +++ b/src/Client/ConnectionEstablisher.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int ATTEMPT_TO_READ_AFTER_EOF; extern const int NETWORK_ERROR; extern const int SOCKET_TIMEOUT; + extern const int DNS_ERROR; } ConnectionEstablisher::ConnectionEstablisher( @@ -90,6 +91,7 @@ void ConnectionEstablisher::run(ConnectionEstablisher::TryResult & result, std:: catch (const Exception & e) { if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT + && e.code() != ErrorCodes::DNS_ERROR && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) throw; diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index a1b816deecb..81067f51d29 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -41,7 +41,7 @@ HedgedConnectionsFactory::HedgedConnectionsFactory( HedgedConnectionsFactory::~HedgedConnectionsFactory() { /// Stop anything that maybe in progress, - /// to avoid interfer with the subsequent connections. + /// to avoid interference with the subsequent connections. /// /// I.e. 
some replcas may be in the establishing state, /// this means that hedged connection is waiting for TablesStatusResponse, diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 93bcc3eb611..fabdc03ace9 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -50,7 +50,7 @@ ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && if (!offsets_concrete) throw Exception("offsets_column must be a ColumnUInt64", ErrorCodes::LOGICAL_ERROR); - if (!offsets_concrete->empty() && data) + if (!offsets_concrete->empty() && data && !data->empty()) { Offset last_offset = offsets_concrete->getData().back(); diff --git a/src/Common/CaresPTRResolver.cpp b/src/Common/CaresPTRResolver.cpp index e5d48b864c8..a02909309b6 100644 --- a/src/Common/CaresPTRResolver.cpp +++ b/src/Common/CaresPTRResolver.cpp @@ -15,8 +15,8 @@ namespace DB static void callback(void * arg, int status, int, struct hostent * host) { - auto * ptr_records = reinterpret_cast*>(arg); - if (status == ARES_SUCCESS && host->h_aliases) + auto * ptr_records = static_cast*>(arg); + if (ptr_records && status == ARES_SUCCESS) { /* * In some cases (e.g /etc/hosts), hostent::h_name is filled and hostent::h_aliases is empty. @@ -28,11 +28,14 @@ namespace DB ptr_records->insert(ptr_record); } - int i = 0; - while (auto * ptr_record = host->h_aliases[i]) + if (host->h_aliases) { - ptr_records->insert(ptr_record); - i++; + int i = 0; + while (auto * ptr_record = host->h_aliases[i]) + { + ptr_records->insert(ptr_record); + i++; + } } } } diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 6f37bb45c84..72bf9899618 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -250,7 +250,7 @@ private: } } - SlotCount available(std::unique_lock &) + SlotCount available(std::unique_lock &) const { if (cur_concurrency < max_concurrency) return max_concurrency - cur_concurrency; diff --git a/src/Common/Config/ConfigProcessor.cpp b/src/Common/Config/ConfigProcessor.cpp index 8d7c26e967f..d3462321e83 100644 --- a/src/Common/Config/ConfigProcessor.cpp +++ b/src/Common/Config/ConfigProcessor.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -729,7 +730,11 @@ void ConfigProcessor::savePreprocessedConfig(const LoadedConfig & loaded_config, if (!preprocessed_path_parent.empty()) fs::create_directories(preprocessed_path_parent); } - DOMWriter().writeNode(preprocessed_path, loaded_config.preprocessed_xml); + DOMWriter writer; + writer.setNewLine("\n"); + writer.setIndent(" "); + writer.setOptions(Poco::XML::XMLWriter::PRETTY_PRINT); + writer.writeNode(preprocessed_path, loaded_config.preprocessed_xml); LOG_DEBUG(log, "Saved preprocessed configuration to '{}'.", preprocessed_path); } catch (Poco::Exception & e) diff --git a/src/Common/Config/YAMLParser.cpp b/src/Common/Config/YAMLParser.cpp index bb83563ecc9..a34b539ee81 100644 --- a/src/Common/Config/YAMLParser.cpp +++ b/src/Common/Config/YAMLParser.cpp @@ -26,114 +26,107 @@ namespace ErrorCodes extern const int CANNOT_PARSE_YAML; } -/// A prefix symbol in yaml key -/// We add attributes to nodes by using a prefix symbol in the key part. -/// Currently we use @ as a prefix symbol. Note, that @ is reserved -/// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value -const char YAML_ATTRIBUTE_PREFIX = '@'; - namespace { + /// A prefix symbol in yaml key + /// We add attributes to nodes by using a prefix symbol in the key part. 
+ /// Currently we use @ as a prefix symbol. Note, that @ is reserved + /// by YAML standard, so we need to write a key-value pair like this: "@attribute": attr_value + const char YAML_ATTRIBUTE_PREFIX = '@'; -Poco::AutoPtr createCloneNode(Poco::XML::Element & original_node) -{ - Poco::AutoPtr clone_node = original_node.ownerDocument()->createElement(original_node.nodeName()); - original_node.parentNode()->appendChild(clone_node); - return clone_node; -} - -void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_element) -{ - auto * xml_document = parent_xml_element.ownerDocument(); - switch (node.Type()) + Poco::AutoPtr cloneXMLNode(const Poco::XML::Element & original_node) { - case YAML::NodeType::Scalar: - { - std::string value = node.as(); - Poco::AutoPtr xml_value = xml_document->createTextNode(value); - parent_xml_element.appendChild(xml_value); - break; - } + Poco::AutoPtr clone_node = original_node.ownerDocument()->createElement(original_node.nodeName()); + original_node.parentNode()->appendChild(clone_node); + return clone_node; + } - /// We process YAML Sequences as a - /// list of value tags with same key and different values. - /// For example, we translate this sequence - /// seq: - /// - val1 - /// - val2 - /// - /// into this: - /// val1 - /// val2 - case YAML::NodeType::Sequence: + void processNode(const YAML::Node & node, Poco::XML::Element & parent_xml_node) + { + auto * xml_document = parent_xml_node.ownerDocument(); + switch (node.Type()) { - for (const auto & child_node : node) - /// For sequences it depends how we want to process them. - /// Sequences of key-value pairs such as: - /// seq: - /// - k1: val1 - /// - k2: val2 - /// into xml like this: - /// - /// val1 - /// val2 - /// - /// - /// But, if the sequence is just a list, the root-node needs to be repeated, such as: - /// seq: - /// - val1 - /// - val2 - /// into xml like this: - /// val1 - /// val2 - /// - /// Therefore check what type the child is, for further processing. - /// Mixing types (values list or map) will lead to strange results but should not happen. - if (parent_xml_element.hasChildNodes() && !child_node.IsMap()) - { - /// Create a new parent node with same tag for each child node - processNode(child_node, *createCloneNode(parent_xml_element)); - } - else - { - /// Map, so don't recreate the parent node but add directly - processNode(child_node, parent_xml_element); - } - break; - } - case YAML::NodeType::Map: - { - for (const auto & key_value_pair : node) + case YAML::NodeType::Scalar: { - const auto & key_node = key_value_pair.first; - const auto & value_node = key_value_pair.second; - std::string key = key_node.as(); - bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar()); - if (is_attribute) - { - /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key - auto attribute_name = key.substr(1); - std::string value = value_node.as(); - parent_xml_element.setAttribute(attribute_name, value); - } - else - { - Poco::AutoPtr xml_key = xml_document->createElement(key); - parent_xml_element.appendChild(xml_key); - processNode(value_node, *xml_key); - } + std::string value = node.as(); + Poco::AutoPtr xml_value = xml_document->createTextNode(value); + parent_xml_node.appendChild(xml_value); + break; + } + + /// For sequences we repeat the parent xml node. 
For example, + /// seq: + /// - val1 + /// - val2 + /// is converted into the following xml: + /// val1 + /// val2 + /// + /// A sequence of mappings is converted in the same way: + /// seq: + /// - k1: val1 + /// k2: val2 + /// - k3: val3 + /// is converted into the following xml: + /// val1val2 + /// val3 + case YAML::NodeType::Sequence: + { + size_t i = 0; + for (auto it = node.begin(); it != node.end(); ++it, ++i) + { + const auto & child_node = *it; + + bool need_clone_parent_xml_node = (i > 0); + + if (need_clone_parent_xml_node) + { + /// Create a new parent node with same tag for each child node + processNode(child_node, *cloneXMLNode(parent_xml_node)); + } + else + { + /// Map, so don't recreate the parent node but add directly + processNode(child_node, parent_xml_node); + } + } + break; + } + + case YAML::NodeType::Map: + { + for (const auto & key_value_pair : node) + { + const auto & key_node = key_value_pair.first; + const auto & value_node = key_value_pair.second; + std::string key = key_node.as(); + bool is_attribute = (key.starts_with(YAML_ATTRIBUTE_PREFIX) && value_node.IsScalar()); + if (is_attribute) + { + /// we use substr(1) here to remove YAML_ATTRIBUTE_PREFIX from key + auto attribute_name = key.substr(1); + std::string value = value_node.as(); + parent_xml_node.setAttribute(attribute_name, value); + } + else + { + Poco::AutoPtr xml_key = xml_document->createElement(key); + parent_xml_node.appendChild(xml_key); + processNode(value_node, *xml_key); + } + } + break; + } + + case YAML::NodeType::Null: break; + case YAML::NodeType::Undefined: + { + throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file"); } - break; - } - case YAML::NodeType::Null: break; - case YAML::NodeType::Undefined: - { - throw Exception(ErrorCodes::CANNOT_PARSE_YAML, "YAMLParser has encountered node with undefined type and cannot continue parsing of the file"); } } } -} Poco::AutoPtr YAMLParser::parse(const String& path) { diff --git a/src/Common/CurrentMemoryTracker.cpp b/src/Common/CurrentMemoryTracker.cpp index 921c244da21..720df07efb9 100644 --- a/src/Common/CurrentMemoryTracker.cpp +++ b/src/Common/CurrentMemoryTracker.cpp @@ -52,15 +52,10 @@ void CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded) if (current_thread) { Int64 will_be = current_thread->untracked_memory + size; - Int64 limit = current_thread->untracked_memory_limit + current_thread->untracked_memory_limit_increase; - if (will_be > limit) + if (will_be > current_thread->untracked_memory_limit) { - /// Increase limit before track. If tracker throws out-of-limit we would be able to alloc up to untracked_memory_limit bytes - /// more. It could be useful to enlarge Exception message in rethrow logic. - current_thread->untracked_memory_limit_increase = current_thread->untracked_memory_limit; memory_tracker->allocImpl(will_be, throw_if_memory_exceeded); - current_thread->untracked_memory_limit_increase = 0; current_thread->untracked_memory = 0; } else diff --git a/src/Common/Elf.cpp b/src/Common/Elf.cpp index b735367b179..0515cc0765a 100644 --- a/src/Common/Elf.cpp +++ b/src/Common/Elf.cpp @@ -22,13 +22,13 @@ Elf::Elf(const std::string & path) /// Check if it's an elf. 
elf_size = in.buffer().size(); if (elf_size < sizeof(ElfEhdr)) - throw Exception("The size of supposedly ELF file is too small", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The size of supposedly ELF file '{}' is too small", path); mapped = in.buffer().begin(); header = reinterpret_cast(mapped); if (memcmp(header->e_ident, "\x7F""ELF", 4) != 0) - throw Exception("The file is not ELF according to magic", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The file '{}' is not ELF according to magic", path); /// Get section header. ElfOff section_header_offset = header->e_shoff; @@ -37,7 +37,7 @@ Elf::Elf(const std::string & path) if (!section_header_offset || !section_header_num_entries || section_header_offset + section_header_num_entries * sizeof(ElfShdr) > elf_size) - throw Exception("The ELF is truncated (section header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section header points after end of file)", path); section_headers = reinterpret_cast(mapped + section_header_offset); @@ -48,11 +48,11 @@ Elf::Elf(const std::string & path) }); if (!section_names_strtab) - throw Exception("The ELF doesn't have string table with section names", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' doesn't have string table with section names", path); ElfOff section_names_offset = section_names_strtab->header.sh_offset; if (section_names_offset >= elf_size) - throw Exception("The ELF is truncated (section names string table points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (section names string table points after end of file)", path); section_names = reinterpret_cast(mapped + section_names_offset); @@ -64,7 +64,7 @@ Elf::Elf(const std::string & path) if (!program_header_offset || !program_header_num_entries || program_header_offset + program_header_num_entries * sizeof(ElfPhdr) > elf_size) - throw Exception("The ELF is truncated (program header points after end of file)", ErrorCodes::CANNOT_PARSE_ELF); + throw Exception(ErrorCodes::CANNOT_PARSE_ELF, "The ELF '{}' is truncated (program header points after end of file)", path); program_headers = reinterpret_cast(mapped + program_header_offset); } diff --git a/src/Common/FieldVisitorToString.cpp b/src/Common/FieldVisitorToString.cpp index 7d93cfba78f..e0e138d744c 100644 --- a/src/Common/FieldVisitorToString.cpp +++ b/src/Common/FieldVisitorToString.cpp @@ -145,5 +145,11 @@ String FieldVisitorToString::operator() (const Object & x) const } +String convertFieldToString(const Field & field) +{ + if (field.getType() == Field::Types::Which::String) + return field.get(); + return applyVisitor(FieldVisitorToString(), field); } +} diff --git a/src/Common/FieldVisitorToString.h b/src/Common/FieldVisitorToString.h index 324a4aa73d5..cca29a8f7e0 100644 --- a/src/Common/FieldVisitorToString.h +++ b/src/Common/FieldVisitorToString.h @@ -31,5 +31,8 @@ public: String operator() (const bool & x) const; }; -} +/// Get value from field and convert it to string. +/// Also remove quotes from strings. 
+String convertFieldToString(const Field & field); +} diff --git a/src/Common/IntervalKind.h b/src/Common/IntervalKind.h index 59e8d32f3e7..b46805655b1 100644 --- a/src/Common/IntervalKind.h +++ b/src/Common/IntervalKind.h @@ -64,7 +64,7 @@ struct IntervalKind const char * toNameOfFunctionExtractTimePart() const; /// Converts the string representation of an interval kind to its IntervalKind equivalent. - /// Returns false if the conversion unsucceeded. + /// Returns false if the conversion did not succeed. /// For example, `IntervalKind::tryParseString('second', result)` returns `result` equals `IntervalKind::Kind::Second`. static bool tryParseString(const std::string & kind, IntervalKind::Kind & result); }; diff --git a/src/Common/MemoryTracker.cpp b/src/Common/MemoryTracker.cpp index da3ce3cc3d2..29dbcdd28b0 100644 --- a/src/Common/MemoryTracker.cpp +++ b/src/Common/MemoryTracker.cpp @@ -166,27 +166,7 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT } } - std::bernoulli_distribution fault(fault_probability); - if (unlikely(fault_probability && fault(thread_local_rng)) && memoryTrackerCanThrow(level, true) && throw_if_memory_exceeded) - { - /// Revert - amount.fetch_sub(size, std::memory_order_relaxed); - - /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc - MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); - - ProfileEvents::increment(ProfileEvents::QueryMemoryLimitExceeded); - const auto * description = description_ptr.load(std::memory_order_relaxed); - throw DB::Exception( - DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED, - "Memory tracker{}{}: fault injected. Would use {} (attempt to allocate chunk of {} bytes), maximum: {}", - description ? " " : "", - description ? description : "", - formatReadableSizeWithBinarySuffix(will_be), - size, - formatReadableSizeWithBinarySuffix(current_hard_limit)); - } - + bool memory_limit_exceeded_ignored = false; bool allocation_traced = false; if (unlikely(current_profiler_limit && will_be > current_profiler_limit)) @@ -205,54 +185,92 @@ void MemoryTracker::allocImpl(Int64 size, bool throw_if_memory_exceeded, MemoryT allocation_traced = true; } - if (unlikely(current_hard_limit && will_be > current_hard_limit) && memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) + std::bernoulli_distribution fault(fault_probability); + if (unlikely(fault_probability && fault(thread_local_rng))) { - OvercommitResult overcommit_result = OvercommitResult::NONE; - if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed); overcommit_tracker_ptr != nullptr && query_tracker != nullptr) - overcommit_result = overcommit_tracker_ptr->needToStopQuery(query_tracker, size); - - if (overcommit_result != OvercommitResult::MEMORY_FREED) + if (memoryTrackerCanThrow(level, true) && throw_if_memory_exceeded) { /// Revert amount.fetch_sub(size, std::memory_order_relaxed); /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); + ProfileEvents::increment(ProfileEvents::QueryMemoryLimitExceeded); const auto * description = description_ptr.load(std::memory_order_relaxed); throw DB::Exception( DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED, - "Memory limit{}{} exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}. OvercommitTracker decision: {}.", + "Memory tracker{}{}: fault injected. 
Would use {} (attempt to allocate chunk of {} bytes), maximum: {}", description ? " " : "", description ? description : "", formatReadableSizeWithBinarySuffix(will_be), size, - formatReadableSizeWithBinarySuffix(current_hard_limit), - toDescription(overcommit_result)); + formatReadableSizeWithBinarySuffix(current_hard_limit)); + } + else + memory_limit_exceeded_ignored = true; + } + + + if (unlikely(current_hard_limit && will_be > current_hard_limit)) + { + if (memoryTrackerCanThrow(level, false) && throw_if_memory_exceeded) + { + OvercommitResult overcommit_result = OvercommitResult::NONE; + if (auto * overcommit_tracker_ptr = overcommit_tracker.load(std::memory_order_relaxed); overcommit_tracker_ptr != nullptr && query_tracker != nullptr) + overcommit_result = overcommit_tracker_ptr->needToStopQuery(query_tracker, size); + + if (overcommit_result != OvercommitResult::MEMORY_FREED) + { + /// Revert + amount.fetch_sub(size, std::memory_order_relaxed); + + /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc + MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); + ProfileEvents::increment(ProfileEvents::QueryMemoryLimitExceeded); + const auto * description = description_ptr.load(std::memory_order_relaxed); + throw DB::Exception( + DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED, + "Memory limit{}{} exceeded: would use {} (attempt to allocate chunk of {} bytes), maximum: {}. OvercommitTracker decision: {}.", + description ? " " : "", + description ? description : "", + formatReadableSizeWithBinarySuffix(will_be), + size, + formatReadableSizeWithBinarySuffix(current_hard_limit), + toDescription(overcommit_result)); + } + else + { + // If OvercommitTracker::needToStopQuery returned false, it guarantees that enough memory is freed. + // This memory is already counted in variable `amount` in the moment of `will_be` initialization. + // Now we just need to update value stored in `will_be`, because it should have changed. + will_be = amount.load(std::memory_order_relaxed); + } + } + else + memory_limit_exceeded_ignored = true; + } + + bool peak_updated = false; + /// In case of MEMORY_LIMIT_EXCEEDED was ignored, will_be may include + /// memory of other allocations, that may fail but not reverted yet, and so + /// updating peak will be inaccurate. + if (!memory_limit_exceeded_ignored) + { + if (throw_if_memory_exceeded) + { + /// Prevent recursion. Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc + MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); + bool log_memory_usage = true; + peak_updated = updatePeak(will_be, log_memory_usage); } else { - // If OvercommitTracker::needToStopQuery returned false, it guarantees that enough memory is freed. - // This memory is already counted in variable `amount` in the moment of `will_be` initialization. - // Now we just need to update value stored in `will_be`, because it should have changed. - will_be = amount.load(std::memory_order_relaxed); + bool log_memory_usage = false; + peak_updated = updatePeak(will_be, log_memory_usage); } } - bool peak_updated; - if (throw_if_memory_exceeded) - { - /// Prevent recursion. 
Exception::ctor -> std::string -> new[] -> MemoryTracker::alloc - MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); - bool log_memory_usage = true; - peak_updated = updatePeak(will_be, log_memory_usage); - } - else - { - bool log_memory_usage = false; - peak_updated = updatePeak(will_be, log_memory_usage); - } - if (peak_updated && allocation_traced) { MemoryTrackerBlockerInThread untrack_lock(VariableContext::Global); diff --git a/src/Common/OpenTelemetryTraceContext.cpp b/src/Common/OpenTelemetryTraceContext.cpp new file mode 100644 index 00000000000..7a1f94926d5 --- /dev/null +++ b/src/Common/OpenTelemetryTraceContext.cpp @@ -0,0 +1,374 @@ +#include "Interpreters/OpenTelemetrySpanLog.h" + +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace OpenTelemetry +{ + +thread_local TracingContextOnThread current_thread_trace_context; + +void Span::addAttribute(std::string_view name, UInt64 value) +{ + if (!this->isTraceEnabled() || name.empty()) + return; + + this->attributes.push_back(Tuple{name, toString(value)}); +} + +void Span::addAttributeIfNotZero(std::string_view name, UInt64 value) +{ + if (value != 0) + addAttribute(name, value); +} + +void Span::addAttribute(std::string_view name, std::string_view value) +{ + if (!this->isTraceEnabled() || name.empty()) + return; + + this->attributes.push_back(Tuple{name, value}); +} + +void Span::addAttributeIfNotEmpty(std::string_view name, std::string_view value) +{ + if (!this->isTraceEnabled() || name.empty() || value.empty()) + return; + + this->attributes.push_back(Tuple{name, value}); +} + +void Span::addAttribute(std::string_view name, std::function value_supplier) +{ + if (!this->isTraceEnabled() || !value_supplier) + return; + + String value = value_supplier(); + if (value.empty()) + return; + + this->attributes.push_back(Tuple{name, value}); +} + +void Span::addAttribute(const Exception & e) noexcept +{ + if (!this->isTraceEnabled()) + return; + + try + { + this->attributes.push_back(Tuple{"clickhouse.exception", getExceptionMessage(e, false)}); + } + catch (...) + { + /// Ignore exceptions + } +} + +void Span::addAttribute(std::exception_ptr e) noexcept +{ + if (!this->isTraceEnabled() || e == nullptr) + return; + + try + { + this->attributes.push_back(Tuple{"clickhouse.exception", getExceptionMessage(e, false)}); + } + catch (...) + { + /// Ignore exceptions + } +} + +SpanHolder::SpanHolder(std::string_view _operation_name) +{ + if (!current_thread_trace_context.isTraceEnabled()) + { + return; + } + + /// Use try-catch to make sure the ctor is exception safe. + try + { + this->trace_id = current_thread_trace_context.trace_id; + this->parent_span_id = current_thread_trace_context.span_id; + this->span_id = thread_local_rng(); // create a new id for this span + this->operation_name = _operation_name; + this->start_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + /// Add new initialization here + } + catch (...) + { + tryLogCurrentException(__FUNCTION__); + + /// Clear related fields to make sure the span won't be recorded. + this->trace_id = UUID(); + return; + } + + /// Set current span as parent of other spans created later on this thread. + current_thread_trace_context.span_id = this->span_id; +} + +void SpanHolder::finish() noexcept +{ + if (!this->isTraceEnabled()) + return; + + // First of all, restore old value of current span. 
+ assert(current_thread_trace_context.span_id == span_id); + current_thread_trace_context.span_id = parent_span_id; + + try + { + auto log = current_thread_trace_context.span_log.lock(); + if (!log) + { + // The log might be disabled. + return; + } + + this->finish_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + log->add(OpenTelemetrySpanLogElement(*this)); + } + catch (...) + { + tryLogCurrentException(__FUNCTION__); + } + + trace_id = UUID(); +} + +SpanHolder::~SpanHolder() +{ + finish(); +} + +bool TracingContext::parseTraceparentHeader(std::string_view traceparent, String & error) +{ + trace_id = 0; + + // Version 00, which is the only one we can parse, is fixed width. Use this + // fact for an additional sanity check. + const int expected_length = strlen("xx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxx-xx"); + if (traceparent.length() != expected_length) + { + error = fmt::format("unexpected length {}, expected {}", traceparent.length(), expected_length); + return false; + } + + const char * data = traceparent.data(); + + uint8_t version = unhex2(data); + data += 2; + + if (version != 0) + { + error = fmt::format("unexpected version {}, expected 00", version); + return false; + } + + if (*data != '-') + { + error = fmt::format("Malformed traceparant header: {}", traceparent); + return false; + } + + ++data; + UInt64 trace_id_higher_64 = unhexUInt(data); + UInt64 trace_id_lower_64 = unhexUInt(data + 16); + data += 32; + + if (*data != '-') + { + error = fmt::format("Malformed traceparant header: {}", traceparent); + return false; + } + + ++data; + UInt64 span_id_64 = unhexUInt(data); + data += 16; + + if (*data != '-') + { + error = fmt::format("Malformed traceparant header: {}", traceparent); + return false; + } + + ++data; + this->trace_flags = unhex2(data); + this->trace_id.toUnderType().items[0] = trace_id_higher_64; + this->trace_id.toUnderType().items[1] = trace_id_lower_64; + this->span_id = span_id_64; + return true; +} + +String TracingContext::composeTraceparentHeader() const +{ + // This span is a parent for its children, so we specify this span_id as a + // parent id. + return fmt::format( + "00-{:016x}{:016x}-{:016x}-{:02x}", + trace_id.toUnderType().items[0], + trace_id.toUnderType().items[1], + span_id, + // This cast is needed because fmt is being weird and complaining that + // "mixing character types is not allowed". + static_cast(trace_flags)); +} + +const TracingContextOnThread & CurrentContext() +{ + return current_thread_trace_context; +} + +void TracingContextOnThread::reset() noexcept +{ + this->trace_id = UUID(); + this->span_id = 0; + this->trace_flags = TRACE_FLAG_NONE; + this->tracestate = ""; + this->span_log.reset(); +} + +TracingContextHolder::TracingContextHolder( + std::string_view _operation_name, + TracingContext _parent_trace_context, + const Settings * settings_ptr, + const std::weak_ptr & _span_log) +{ + /// Use try-catch to make sure the ctor is exception safe. + /// If any exception is raised during the construction, the tracing is not enabled on current thread. + try + { + if (current_thread_trace_context.isTraceEnabled()) + { + /// + /// This is not the normal case, + /// it means that construction of current object is not at the start of current thread. + /// Usually this is due to: + /// 1. bad design + /// 2. 
right design but code changes so that original point where this object is constructing is not the new start execution of current thread + /// + /// In such case, we should use current context as parent of this new constructing object, + /// So this branch ensures this class can be instantiated multiple times on one same thread safely. + /// + this->is_context_owner = false; + this->root_span.trace_id = current_thread_trace_context.trace_id; + this->root_span.parent_span_id = current_thread_trace_context.span_id; + this->root_span.span_id = thread_local_rng(); + this->root_span.operation_name = _operation_name; + this->root_span.start_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + /// Set the root span as parent of other spans created on current thread + current_thread_trace_context.span_id = this->root_span.span_id; + return; + } + + if (!_parent_trace_context.isTraceEnabled()) + { + if (settings_ptr == nullptr) + /// Skip tracing context initialization on current thread + return; + + // Start the trace with some configurable probability. + std::bernoulli_distribution should_start_trace{settings_ptr->opentelemetry_start_trace_probability}; + if (!should_start_trace(thread_local_rng)) + /// skip tracing context initialization on current thread + return; + + while (_parent_trace_context.trace_id == UUID()) + { + // Make sure the random generated trace_id is not 0 which is an invalid id. + _parent_trace_context.trace_id.toUnderType().items[0] = thread_local_rng(); //-V656 + _parent_trace_context.trace_id.toUnderType().items[1] = thread_local_rng(); //-V656 + } + _parent_trace_context.span_id = 0; + } + + this->root_span.trace_id = _parent_trace_context.trace_id; + this->root_span.parent_span_id = _parent_trace_context.span_id; + this->root_span.span_id = thread_local_rng(); + this->root_span.operation_name = _operation_name; + this->root_span.start_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + /// Add new initialization here + } + catch (...) + { + tryLogCurrentException(__FUNCTION__); + + /// Clear related fields to make sure the tracing is not enabled. + this->root_span.trace_id = UUID(); + return; + } + + /// Set up trace context on current thread only when the root span is successfully initialized. + current_thread_trace_context = _parent_trace_context; + current_thread_trace_context.span_id = this->root_span.span_id; + current_thread_trace_context.trace_flags = TRACE_FLAG_SAMPLED; + current_thread_trace_context.span_log = _span_log; +} + +TracingContextHolder::~TracingContextHolder() +{ + if (!this->root_span.isTraceEnabled()) + { + return; + } + + try + { + auto shared_span_log = current_thread_trace_context.span_log.lock(); + if (shared_span_log) + { + try + { + /// This object is created to initialize tracing context on a new thread, + /// it's helpful to record the thread_id so that we know the thread switching from the span log + this->root_span.addAttribute("clickhouse.thread_id", getThreadId()); + } + catch (...) + { + /// It's acceptable that the attribute is not recorded in case of any exception, + /// so the exception is ignored to try to log the span. + } + + this->root_span.finish_time_us + = std::chrono::duration_cast(std::chrono::system_clock::now().time_since_epoch()).count(); + + shared_span_log->add(OpenTelemetrySpanLogElement(this->root_span)); + } + } + catch (...) 
+ { + tryLogCurrentException(__FUNCTION__); + } + + this->root_span.trace_id = UUID(); + + if (this->is_context_owner) + { + /// Clear the context on current thread + current_thread_trace_context.reset(); + } + else + { + current_thread_trace_context.span_id = this->root_span.parent_span_id; + } +} + +} +} diff --git a/src/Common/OpenTelemetryTraceContext.h b/src/Common/OpenTelemetryTraceContext.h index 4d2fc656100..63136f8731d 100644 --- a/src/Common/OpenTelemetryTraceContext.h +++ b/src/Common/OpenTelemetryTraceContext.h @@ -1,24 +1,161 @@ #pragma once -#include -#include +#include namespace DB { -// The runtime info we need to create new OpenTelemetry spans. -struct OpenTelemetryTraceContext +struct Settings; +class OpenTelemetrySpanLog; + +namespace OpenTelemetry +{ + +struct Span +{ + UUID trace_id{}; + UInt64 span_id = 0; + UInt64 parent_span_id = 0; + String operation_name; + UInt64 start_time_us = 0; + UInt64 finish_time_us = 0; + Map attributes; + + void addAttribute(std::string_view name, UInt64 value); + void addAttributeIfNotZero(std::string_view name, UInt64 value); + void addAttribute(std::string_view name, std::string_view value); + void addAttributeIfNotEmpty(std::string_view name, std::string_view value); + void addAttribute(std::string_view name, std::function value_supplier); + + /// Following two methods are declared as noexcept to make sure they're exception safe + /// This is because they're usually called in exception handler + void addAttribute(const Exception & e) noexcept; + void addAttribute(std::exception_ptr e) noexcept; + + bool isTraceEnabled() const + { + return trace_id != UUID(); + } +}; + +/// See https://www.w3.org/TR/trace-context/ for trace_flags definition +enum TraceFlags : UInt8 +{ + TRACE_FLAG_NONE = 0, + TRACE_FLAG_SAMPLED = 1, +}; + +/// The runtime info we need to create new OpenTelemetry spans. +struct TracingContext { UUID trace_id{}; UInt64 span_id = 0; // The incoming tracestate header and the trace flags, we just pass them // downstream. See https://www.w3.org/TR/trace-context/ String tracestate; - UInt8 trace_flags = 0; + UInt8 trace_flags = TRACE_FLAG_NONE; // Parse/compose OpenTelemetry traceparent header. - bool parseTraceparentHeader(const std::string & traceparent, std::string & error); - std::string composeTraceparentHeader() const; + bool parseTraceparentHeader(std::string_view traceparent, String & error); + String composeTraceparentHeader() const; + + bool isTraceEnabled() const + { + return trace_id != UUID(); + } +}; + +/// Tracing context kept on each thread +struct TracingContextOnThread : TracingContext +{ + TracingContextOnThread& operator =(const TracingContext& context) + { + *(static_cast(this)) = context; + return *this; + } + + void reset() noexcept; + + /// Use weak_ptr instead of shared_ptr to hold a reference to the underlying system.opentelemetry_span_log table + /// Since this object is kept on threads and passed across threads, a weak_ptr is more safe to prevent potential leak + std::weak_ptr span_log; +}; + +/// Get tracing context on current thread +const TracingContextOnThread& CurrentContext(); + +/// Holder of tracing context. +/// It should be initialized at the beginning of each thread execution. +/// And once it's destructed, it clears the context automatically. +/// +/// It's also the root of all spans on current thread execution. +/// +/// Although it's SAFE to construct this object multiple times on one same thread, it should be created at the beginning of one thread execution. 
+struct TracingContextHolder +{ + /// Copy ctor and assignment are forbidden to make the destructor safe + TracingContextHolder(const TracingContextHolder& scope) = delete; + TracingContextHolder& operator =(const TracingContextHolder& scope) = delete; + + TracingContextHolder(std::string_view _operation_name, + const TracingContext& _parent_trace_context, + const std::weak_ptr<OpenTelemetrySpanLog>& _log) + : TracingContextHolder(_operation_name, + _parent_trace_context, + nullptr, + _log) + { + } + + /// Initialize a tracing context on a child thread based on the context from the parent thread + TracingContextHolder(std::string_view _operation_name, const TracingContextOnThread & _parent_thread_trace_context) + : TracingContextHolder(_operation_name, + _parent_thread_trace_context, + nullptr, + _parent_thread_trace_context.span_log) + { + } + + /// For servers like HTTP/TCP/GRPC to initialize tracing context on threads that process requests from clients + TracingContextHolder(std::string_view _operation_name, + TracingContext _parent_trace_context, + const Settings & _settings, + const std::weak_ptr<OpenTelemetrySpanLog> & _log) + : TracingContextHolder(_operation_name, + _parent_trace_context, + &_settings, + _log) + { + } + + TracingContextHolder(std::string_view _operation_name, + TracingContext _parent_trace_context, + const Settings* settings_ptr, + const std::weak_ptr<OpenTelemetrySpanLog> & _log); + + ~TracingContextHolder(); + + Span root_span; + +private: + bool is_context_owner = true; +}; + +using TracingContextHolderPtr = std::unique_ptr<TracingContextHolder>; + +/// A span holder that creates a span automatically in a (function) scope if tracing is enabled. +/// Once it's created or destructed, it automatically maintains the tracing context on the thread where it lives. +struct SpanHolder : public Span +{ + SpanHolder(std::string_view); + ~SpanHolder(); + + /// Finish a span explicitly if needed. + /// It's safe to call it multiple times + void finish() noexcept; }; } + +} + diff --git a/src/Common/OvercommitTracker.h b/src/Common/OvercommitTracker.h index 64fb6cdc926..598b877ef3c 100644 --- a/src/Common/OvercommitTracker.h +++ b/src/Common/OvercommitTracker.h @@ -61,7 +61,7 @@ enum class QueryCancellationState // Usually it's hard to set some reasonable hard memory limit // (especially, the default value). This class introduces new -// mechanisim for the limiting of memory usage. +// mechanism for the limiting of memory usage. // Soft limit represents guaranteed amount of memory query/user // may use. It's allowed to exceed this limit. But if hard limit // is reached, query with the biggest overcommit ratio @@ -82,7 +82,7 @@ protected: virtual void pickQueryToExcludeImpl() = 0; // This mutex is used to disallow concurrent access - // to picked_tracker and cancelation_state variables. + // to picked_tracker and cancellation_state variables.
std::mutex overcommit_m; std::condition_variable cv; diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 183a06b6610..2997a0c0d08 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -289,6 +289,18 @@ The server successfully detected this situation and will download merged part fr M(S3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to S3 storage.") \ M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \ \ + M(DiskS3ReadMicroseconds, "Time of GET and HEAD requests to DiskS3 storage.") \ + M(DiskS3ReadRequestsCount, "Number of GET and HEAD requests to DiskS3 storage.") \ + M(DiskS3ReadRequestsErrors, "Number of non-throttling errors in GET and HEAD requests to DiskS3 storage.") \ + M(DiskS3ReadRequestsThrottling, "Number of 429 and 503 errors in GET and HEAD requests to DiskS3 storage.") \ + M(DiskS3ReadRequestsRedirects, "Number of redirects in GET and HEAD requests to DiskS3 storage.") \ + \ + M(DiskS3WriteMicroseconds, "Time of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ + M(DiskS3WriteRequestsCount, "Number of POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ + M(DiskS3WriteRequestsErrors, "Number of non-throttling errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ + M(DiskS3WriteRequestsThrottling, "Number of 429 and 503 errors in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ + M(DiskS3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to DiskS3 storage.") \ + \ M(ReadBufferFromS3Microseconds, "Time spend in reading from S3.") \ M(ReadBufferFromS3Bytes, "Bytes read from S3.") \ M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.") \ diff --git a/src/Common/SLRUCachePolicy.h b/src/Common/SLRUCachePolicy.h index 10b043ebaca..8d4709c66a7 100644 --- a/src/Common/SLRUCachePolicy.h +++ b/src/Common/SLRUCachePolicy.h @@ -33,7 +33,7 @@ public: * max_protected_size shows how many of the most frequently used entries will not be evicted after a sequential scan. * max_protected_size == 0 means that the default protected size is equal to half of the total max size. */ - /// TODO: construct from special struct with cache policy parametrs (also with max_protected_size). + /// TODO: construct from special struct with cache policy parameters (also with max_protected_size). SLRUCachePolicy(size_t max_size_, size_t max_elements_size_ = 0, double size_ratio = 0.5, OnWeightLossFunction on_weight_loss_function_ = {}) : max_protected_size(max_size_ * std::min(1.0, size_ratio)) , max_size(max_size_) diff --git a/src/Common/Stopwatch.h b/src/Common/Stopwatch.h index 2b60bbde196..cabc6d8ba1e 100644 --- a/src/Common/Stopwatch.h +++ b/src/Common/Stopwatch.h @@ -31,7 +31,7 @@ inline UInt64 clock_gettime_ns_adjusted(UInt64 prev_time, clockid_t clock_type = } /** Differs from Poco::Stopwatch only by using 'clock_gettime' instead of 'gettimeofday', - * returns nanoseconds instead of microseconds, and also by other minor differencies. + * returns nanoseconds instead of microseconds, and also by other minor differences. */ class Stopwatch { @@ -152,4 +152,3 @@ private: /// Most significant bit is a lock. When it is set, compareAndRestartDeferred method will return false. 
UInt64 nanoseconds(UInt64 prev_time) const { return clock_gettime_ns_adjusted(prev_time, clock_type) & 0x7FFFFFFFFFFFFFFFULL; } }; - diff --git a/src/Common/SymbolIndex.cpp b/src/Common/SymbolIndex.cpp index 46d9e8dbd5c..e217d23cc27 100644 --- a/src/Common/SymbolIndex.cpp +++ b/src/Common/SymbolIndex.cpp @@ -37,7 +37,7 @@ But because ClickHouse is linked with most of the symbols exported (-rdynamic fl It allows to get source file names and line numbers from addresses. Only available if you use -g option for compiler. It is also used by default for ClickHouse builds, but because of its weight (about two gigabytes) it is split to separate binary and provided in clickhouse-common-static-dbg package. -This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse and is loaded automatically by tools like gdb, addr2line. +This separate binary is placed in /usr/lib/debug/usr/bin/clickhouse.debug and is loaded automatically by tools like gdb, addr2line. When you build ClickHouse by yourself, debug info is not split and present in a single huge binary. What ClickHouse is using to provide good stack traces? @@ -391,10 +391,22 @@ void collectSymbolsFromELF( std::filesystem::path local_debug_info_path = canonical_path.parent_path() / canonical_path.stem(); local_debug_info_path += ".debug"; std::filesystem::path debug_info_path = std::filesystem::path("/usr/lib/debug") / canonical_path.relative_path(); + debug_info_path += ".debug"; - if (std::filesystem::exists(local_debug_info_path)) + /// NOTE: This is a workaround for current package system. + /// + /// Since nfpm cannot copy file only if it exists, + /// and so in cmake empty .debug file is created instead, + /// but if we will try to load empty Elf file, then the CANNOT_PARSE_ELF + /// exception will be thrown from the Elf::Elf. + auto exists_not_empty = [](const std::filesystem::path & path) + { + return std::filesystem::exists(path) && !std::filesystem::is_empty(path); + }; + + if (exists_not_empty(local_debug_info_path)) object_name = local_debug_info_path; - else if (std::filesystem::exists(debug_info_path)) + else if (exists_not_empty(debug_info_path)) object_name = debug_info_path; else if (build_id.size() >= 2) { @@ -412,7 +424,7 @@ void collectSymbolsFromELF( std::filesystem::path build_id_debug_info_path( fmt::format("/usr/lib/debug/.build-id/{}/{}.debug", build_id_hex.substr(0, 2), build_id_hex.substr(2))); - if (std::filesystem::exists(build_id_debug_info_path)) + if (exists_not_empty(build_id_debug_info_path)) object_name = build_id_debug_info_path; else object_name = canonical_path; diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index 3f5091af0c9..0b89139fa53 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -86,7 +87,7 @@ void ThreadPoolImpl::setQueueSize(size_t value) template template -ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::optional wait_microseconds) +ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::optional wait_microseconds, bool propagate_opentelemetry_tracing_context) { auto on_error = [&](const std::string & reason) { @@ -149,7 +150,11 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::opti } } - jobs.emplace(std::move(job), priority); + jobs.emplace(std::move(job), + priority, + /// Tracing context on this thread is used as parent context for the sub-thread that runs the job + propagate_opentelemetry_tracing_context ? 
DB::OpenTelemetry::CurrentContext() : DB::OpenTelemetry::TracingContextOnThread()); + ++scheduled_jobs; new_job_or_shutdown.notify_one(); } @@ -170,9 +175,9 @@ bool ThreadPoolImpl::trySchedule(Job job, int priority, uint64_t wait_mi } template -void ThreadPoolImpl::scheduleOrThrow(Job job, int priority, uint64_t wait_microseconds) +void ThreadPoolImpl::scheduleOrThrow(Job job, int priority, uint64_t wait_microseconds, bool propagate_opentelemetry_tracing_context) { - scheduleImpl(std::move(job), priority, wait_microseconds); + scheduleImpl(std::move(job), priority, wait_microseconds, propagate_opentelemetry_tracing_context); } template @@ -250,6 +255,9 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ Job job; bool need_shutdown = false; + /// A copy of parent trace context + DB::OpenTelemetry::TracingContextOnThread parent_thead_trace_context; + { std::unique_lock lock(mutex); new_job_or_shutdown.wait(lock, [this] { return shutdown || !jobs.empty(); }); @@ -260,6 +268,7 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ /// boost::priority_queue does not provide interface for getting non-const reference to an element /// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority::job. job = std::move(const_cast(jobs.top().job)); + parent_thead_trace_context = std::move(const_cast(jobs.top().thread_trace_context)); jobs.pop(); } else @@ -272,22 +281,40 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ if (!need_shutdown) { + ALLOW_ALLOCATIONS_IN_SCOPE; + + /// Set up tracing context for this thread by its parent context + DB::OpenTelemetry::TracingContextHolder thread_trace_context("ThreadPool::worker()", parent_thead_trace_context); + try { - ALLOW_ALLOCATIONS_IN_SCOPE; CurrentMetrics::Increment metric_active_threads( std::is_same_v ? CurrentMetrics::GlobalThreadActive : CurrentMetrics::LocalThreadActive); job(); + + if (thread_trace_context.root_span.isTraceEnabled()) + { + /// Use the thread name as operation name so that the tracing log will be more clear. + /// The thread name is usually set in the jobs, we can only get the name after the job finishes + std::string thread_name = getThreadName(); + if (!thread_name.empty()) + thread_trace_context.root_span.operation_name = thread_name; + } + /// job should be reset before decrementing scheduled_jobs to /// ensure that the Job destroyed before wait() returns. job = {}; + parent_thead_trace_context.reset(); } catch (...) { + thread_trace_context.root_span.addAttribute(std::current_exception()); + /// job should be reset before decrementing scheduled_jobs to /// ensure that the Job destroyed before wait() returns. job = {}; + parent_thead_trace_context.reset(); { std::lock_guard lock(mutex); @@ -323,7 +350,8 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ template class ThreadPoolImpl; -template class ThreadPoolImpl; +template class ThreadPoolImpl>; +template class ThreadFromGlobalPoolImpl; std::unique_ptr GlobalThreadPool::the_instance; diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index eb3f631b92a..fc5377b3783 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -14,6 +14,7 @@ #include #include +#include #include /** Very simple thread pool similar to boost::threadpool. @@ -55,7 +56,7 @@ public: bool trySchedule(Job job, int priority = 0, uint64_t wait_microseconds = 0) noexcept; /// Similar to scheduleOrThrowOnError(...). 
Wait for specified amount of time and schedule a job or throw an exception. - void scheduleOrThrow(Job job, int priority = 0, uint64_t wait_microseconds = 0); + void scheduleOrThrow(Job job, int priority = 0, uint64_t wait_microseconds = 0, bool propagate_opentelemetry_tracing_context = true); /// Wait for all currently active jobs to be done. /// You may call schedule and wait many times in arbitrary order. @@ -96,9 +97,10 @@ private: { Job job; int priority; + DB::OpenTelemetry::TracingContextOnThread thread_trace_context; - JobWithPriority(Job job_, int priority_) - : job(job_), priority(priority_) {} + JobWithPriority(Job job_, int priority_, const DB::OpenTelemetry::TracingContextOnThread& thread_trace_context_) + : job(job_), priority(priority_), thread_trace_context(thread_trace_context_) {} bool operator< (const JobWithPriority & rhs) const { @@ -111,7 +113,7 @@ private: std::exception_ptr first_exception; template - ReturnType scheduleImpl(Job job, int priority, std::optional wait_microseconds); + ReturnType scheduleImpl(Job job, int priority, std::optional wait_microseconds, bool propagate_opentelemetry_tracing_context = true); void worker(typename std::list::iterator thread_it); @@ -154,14 +156,18 @@ public: /** Looks like std::thread but allocates threads in GlobalThreadPool. * Also holds ThreadStatus for ClickHouse. + * + * NOTE: User code should use 'ThreadFromGlobalPool' declared below instead of directly using this class. + * */ -class ThreadFromGlobalPool : boost::noncopyable +template +class ThreadFromGlobalPoolImpl : boost::noncopyable { public: - ThreadFromGlobalPool() = default; + ThreadFromGlobalPoolImpl() = default; template - explicit ThreadFromGlobalPool(Function && func, Args &&... args) + explicit ThreadFromGlobalPoolImpl(Function && func, Args &&... args) : state(std::make_shared()) { /// NOTE: @@ -185,15 +191,19 @@ public: /// before sending signal that permits to join this thread. DB::ThreadStatus thread_status; std::apply(function, arguments); - }); + }, + 0, // default priority + 0, // default wait_microseconds + propagate_opentelemetry_context + ); } - ThreadFromGlobalPool(ThreadFromGlobalPool && rhs) noexcept + ThreadFromGlobalPoolImpl(ThreadFromGlobalPoolImpl && rhs) noexcept { *this = std::move(rhs); } - ThreadFromGlobalPool & operator=(ThreadFromGlobalPool && rhs) noexcept + ThreadFromGlobalPoolImpl & operator=(ThreadFromGlobalPoolImpl && rhs) noexcept { if (initialized()) abort(); @@ -201,7 +211,7 @@ public: return *this; } - ~ThreadFromGlobalPool() + ~ThreadFromGlobalPoolImpl() { if (initialized()) abort(); @@ -233,7 +243,7 @@ public: return true; } -private: +protected: struct State { /// Should be atomic() because of possible concurrent access between @@ -254,6 +264,19 @@ private: } }; - /// Recommended thread pool for the case when multiple thread pools are created and destroyed. -using ThreadPool = ThreadPoolImpl; +/// +/// The template parameter of ThreadFromGlobalPool is set to false to disable tracing context propagation to underlying worker. +/// Because ThreadFromGlobalPool schedules a job upon GlobalThreadPool, this means there will be two workers to schedule a job in 'ThreadPool', +/// one is at GlobalThreadPool level, the other is at ThreadPool level, so tracing context will be initialized on the same thread twice. +/// +/// Once the worker on ThreadPool gains the control of execution, it won't return until it's shutdown, +/// which means the tracing context initialized at underlying worker level won't be delete for a very long time. 
+/// This would cause a wrong context for further jobs scheduled in ThreadPool. +/// +/// To make sure the tracing context is correctly propagated, we explicitly disable context propagation (including initialization and de-initialization) at the underlying worker level. +/// +using ThreadPool = ThreadPoolImpl<ThreadFromGlobalPoolImpl<false>>; + +/// An alias for user code to execute a job in the global thread pool +using ThreadFromGlobalPool = ThreadFromGlobalPoolImpl<true>; diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 0cbc6f4ce0f..b62a7af6c71 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 2a4ffd229f2..b414a9bccf5 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include @@ -33,7 +32,6 @@ class ThreadStatus; class QueryProfilerReal; class QueryProfilerCPU; class QueryThreadLog; -struct OpenTelemetrySpanHolder; class TasksStatsCounters; struct RUsageCounters; struct PerfEventsCounters; @@ -135,8 +133,6 @@ public: Int64 untracked_memory = 0; /// Each thread could new/delete memory in range of (-untracked_memory_limit, untracked_memory_limit) without access to common counters. Int64 untracked_memory_limit = 4 * 1024 * 1024; - /// Increase limit in case of exception. - Int64 untracked_memory_limit_increase = 0; /// Statistics of read and write rows/bytes Progress progress_in; @@ -145,12 +141,6 @@ public: using Deleter = std::function<void()>; Deleter deleter; - // This is the current most-derived OpenTelemetry span for this thread. It - // can be changed throughout the query execution, whenever we enter a new - // span or exit it. See OpenTelemetrySpanHolder that is normally responsible - // for these changes. - OpenTelemetryTraceContext thread_trace_context; - protected: ThreadGroupStatusPtr thread_group; diff --git a/src/Common/Volnitsky.h b/src/Common/Volnitsky.h index d7ca7d35277..6f5948b6564 100644 --- a/src/Common/Volnitsky.h +++ b/src/Common/Volnitsky.h @@ -497,7 +497,7 @@ private: /// last index of offsets that was not processed size_t last; - /// limit for adding to hashtable. In worst case with case insentive search, the table will be filled at most as half + /// limit for adding to hashtable. In worst case with case insensitive search, the table will be filled at most as half static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8; public: diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index c6aa954688b..a94e367cd70 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -80,7 +80,7 @@ enum class Error : int32_t ZUNIMPLEMENTED = -6, /// Operation is unimplemented ZOPERATIONTIMEOUT = -7, /// Operation timeout ZBADARGUMENTS = -8, /// Invalid arguments - ZINVALIDSTATE = -9, /// Invliad zhandle state + ZINVALIDSTATE = -9, /// Invalid zhandle state /** API errors. * This is never thrown by the server, it shouldn't be used other than @@ -428,6 +428,12 @@ public: Exception(const Error code_, const std::string & path); /// NOLINT Exception(const Exception & exc); + template <typename... Args> + Exception(const Error code_, fmt::format_string<Args...> fmt, Args &&...
args) + : Exception(fmt::format(fmt, std::forward(args)...), code_) + { + } + const char * name() const noexcept override { return "Coordination::Exception"; } const char * className() const noexcept override { return "Coordination::Exception"; } Exception * clone() const override { return new Exception(*this); } @@ -439,7 +445,7 @@ public: /** Usage scenario: * - create an object and issue commands; * - you provide callbacks for your commands; callbacks are invoked in internal thread and must be cheap: - * for example, just signal a condvar / fulfull a promise. + * for example, just signal a condvar / fulfill a promise. * - you also may provide callbacks for watches; they are also invoked in internal thread and must be cheap. * - whenever you receive exception with ZSESSIONEXPIRED code or method isExpired returns true, * the ZooKeeper instance is no longer usable - you may only destroy it and probably create another. diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 3af5dfcc177..098dc522eeb 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -507,15 +507,15 @@ ResponsePtr TestKeeperSyncRequest::createResponse() const { return std::make_sha ResponsePtr TestKeeperMultiRequest::createResponse() const { return std::make_shared(); } -TestKeeper::TestKeeper(const String & root_path_, Poco::Timespan operation_timeout_) - : root_path(root_path_), operation_timeout(operation_timeout_) +TestKeeper::TestKeeper(const zkutil::ZooKeeperArgs & args_) + : args(args_) { container.emplace("/", Node()); - if (!root_path.empty()) + if (!args.chroot.empty()) { - if (root_path.back() == '/') - root_path.pop_back(); + if (args.chroot.back() == '/') + args.chroot.pop_back(); } processing_thread = ThreadFromGlobalPool([this] { processingThread(); }); @@ -547,7 +547,7 @@ void TestKeeper::processingThread() { RequestInfo info; - UInt64 max_wait = static_cast(operation_timeout.totalMilliseconds()); + UInt64 max_wait = static_cast(args.operation_timeout_ms); if (requests_queue.tryPop(info, max_wait)) { if (expired) @@ -556,7 +556,7 @@ void TestKeeper::processingThread() ++zxid; - info.request->addRootPath(root_path); + info.request->addRootPath(args.chroot); auto [response, _] = info.request->process(container, zxid); if (info.watch) @@ -580,7 +580,7 @@ void TestKeeper::processingThread() if (response->error == Error::ZOK) info.request->processWatches(watches, list_watches); - response->removeRootPath(root_path); + response->removeRootPath(args.chroot); if (info.callback) info.callback(*response); } @@ -689,7 +689,7 @@ void TestKeeper::pushRequest(RequestInfo && request) if (expired) throw Exception("Session expired", Error::ZSESSIONEXPIRED); - if (!requests_queue.tryPush(std::move(request), operation_timeout.totalMilliseconds())) + if (!requests_queue.tryPush(std::move(request), args.operation_timeout_ms)) throw Exception("Cannot push request to queue within operation timeout", Error::ZOPERATIONTIMEOUT); } catch (...) 
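The TestKeeper changes above follow the same pattern as the rest of this patch: every connection setting now travels in a single zkutil::ZooKeeperArgs value (declared further below in ZooKeeperArgs.h) instead of a list of positional parameters. A minimal usage sketch, built only from the constructors visible in this diff; the host list and timeout values are illustrative, not defaults taken from the code:

#include <memory>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/ZooKeeper/ZooKeeperArgs.h>

zkutil::ZooKeeper::Ptr makeTestClient()
{
    /// Collect all connection settings in one struct.
    zkutil::ZooKeeperArgs args("host1:2181,host2:2181"); /// comma separated [secure://]host:port list
    args.session_timeout_ms = 30000;
    args.operation_timeout_ms = 5000;
    args.implementation = "testkeeper"; /// in-memory implementation, handy for tests

    /// ZooKeeper::init() picks the zookeeper/testkeeper implementation from the args.
    return std::make_shared<zkutil::ZooKeeper>(args);
}

Because ZooKeeperArgs defaults operator==, the configChanged() check further below in ZooKeeper.cpp reduces to comparing the old and new argument bundles directly.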
diff --git a/src/Common/ZooKeeper/TestKeeper.h b/src/Common/ZooKeeper/TestKeeper.h index 5fcd00b01b0..aad5131fcb5 100644 --- a/src/Common/ZooKeeper/TestKeeper.h +++ b/src/Common/ZooKeeper/TestKeeper.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -33,7 +34,7 @@ using TestKeeperRequestPtr = std::shared_ptr; class TestKeeper final : public IKeeper { public: - TestKeeper(const String & root_path_, Poco::Timespan operation_timeout_); + TestKeeper(const zkutil::ZooKeeperArgs & args_); ~TestKeeper() override; bool isExpired() const override { return expired; } @@ -123,10 +124,7 @@ private: Container container; - String root_path; - ACLs default_acls; - - Poco::Timespan operation_timeout; + zkutil::ZooKeeperArgs args; std::mutex push_request_mutex; std::atomic expired{false}; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 96abf3b543a..6fcd3b52f16 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -6,20 +6,18 @@ #include #include +#include #include #include #include #include "Common/ZooKeeper/IKeeper.h" #include #include -#include #include #include -#define ZOOKEEPER_CONNECTION_TIMEOUT_MS 1000 - namespace fs = std::filesystem; namespace DB @@ -49,25 +47,19 @@ static void check(Coordination::Error code, const std::string & path) } -void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_, - int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_, const GetPriorityForLoadBalancing & get_priority_load_balancing_) -{ - log = &Poco::Logger::get("ZooKeeper"); - hosts = hosts_; - identity = identity_; - session_timeout_ms = session_timeout_ms_; - operation_timeout_ms = operation_timeout_ms_; - chroot = chroot_; - implementation = implementation_; - get_priority_load_balancing = get_priority_load_balancing_; +void ZooKeeper::init(ZooKeeperArgs args_) - if (implementation == "zookeeper") +{ + args = std::move(args_); + log = &Poco::Logger::get("ZooKeeper"); + + if (args.implementation == "zookeeper") { - if (hosts.empty()) + if (args.hosts.empty()) throw KeeperException("No hosts passed to ZooKeeper constructor.", Coordination::Error::ZBADARGUMENTS); Coordination::ZooKeeper::Nodes nodes; - nodes.reserve(hosts.size()); + nodes.reserve(args.hosts.size()); /// Shuffle the hosts to distribute the load among ZooKeeper nodes. std::vector shuffled_hosts = shuffleHosts(); @@ -108,33 +100,23 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_ throw KeeperException("Cannot use any of provided ZooKeeper nodes", Coordination::Error::ZBADARGUMENTS); } - impl = std::make_unique( - nodes, - chroot, - identity_.empty() ? 
"" : "digest", - identity_, - Poco::Timespan(0, session_timeout_ms_ * 1000), - Poco::Timespan(0, ZOOKEEPER_CONNECTION_TIMEOUT_MS * 1000), - Poco::Timespan(0, operation_timeout_ms_ * 1000), - zk_log); + impl = std::make_unique(nodes, args, zk_log); - if (chroot.empty()) - LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(hosts, ",")); + if (args.chroot.empty()) + LOG_TRACE(log, "Initialized, hosts: {}", fmt::join(args.hosts, ",")); else - LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(hosts, ","), chroot); + LOG_TRACE(log, "Initialized, hosts: {}, chroot: {}", fmt::join(args.hosts, ","), args.chroot); } - else if (implementation == "testkeeper") + else if (args.implementation == "testkeeper") { - impl = std::make_unique( - chroot, - Poco::Timespan(0, operation_timeout_ms_ * 1000)); + impl = std::make_unique(args); } else { - throw DB::Exception("Unknown implementation of coordination service: " + implementation, DB::ErrorCodes::NOT_IMPLEMENTED); + throw DB::Exception("Unknown implementation of coordination service: " + args.implementation, DB::ErrorCodes::NOT_IMPLEMENTED); } - if (!chroot.empty()) + if (!args.chroot.empty()) { /// Here we check that zk root exists. /// This check is clumsy. The reason is we do this request under common mutex, and never want to hung here. @@ -144,7 +126,7 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_ /// This should not happen now, when memory tracker is disabled. /// But let's keep it just in case (it is also easy to backport). auto future = asyncExists("/"); - auto res = future.wait_for(std::chrono::milliseconds(operation_timeout_ms)); + auto res = future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)); if (res != std::future_status::ready) throw KeeperException("Cannot check if zookeeper root exists.", Coordination::Error::ZOPERATIONTIMEOUT); @@ -153,18 +135,30 @@ void ZooKeeper::init(const std::string & implementation_, const Strings & hosts_ throw KeeperException(code, "/"); if (code == Coordination::Error::ZNONODE) - throw KeeperException("ZooKeeper root doesn't exist. You should create root node " + chroot + " before start.", Coordination::Error::ZNONODE); + throw KeeperException("ZooKeeper root doesn't exist. 
You should create root node " + args.chroot + " before start.", Coordination::Error::ZNONODE); } } +ZooKeeper::ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr zk_log_) +{ + zk_log = std::move(zk_log_); + init(args_); +} + +ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr zk_log_) + : zk_log(std::move(zk_log_)) +{ + init(ZooKeeperArgs(config, config_name)); +} + std::vector ZooKeeper::shuffleHosts() const { - std::function get_priority = get_priority_load_balancing.getPriorityFunc(get_priority_load_balancing.load_balancing, 0, hosts.size()); + std::function get_priority = args.get_priority_load_balancing.getPriorityFunc(args.get_priority_load_balancing.load_balancing, 0, args.hosts.size()); std::vector shuffle_hosts; - for (size_t i = 0; i < hosts.size(); ++i) + for (size_t i = 0; i < args.hosts.size(); ++i) { ShuffleHost shuffle_host; - shuffle_host.host = hosts[i]; + shuffle_host.host = args.hosts[i]; if (get_priority) shuffle_host.priority = get_priority(i); shuffle_host.randomize(); @@ -181,125 +175,16 @@ std::vector ZooKeeper::shuffleHosts() const return shuffle_hosts; } -ZooKeeper::ZooKeeper(const std::string & hosts_string, const std::string & identity_, int32_t session_timeout_ms_, - int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_, - std::shared_ptr zk_log_, const GetPriorityForLoadBalancing & get_priority_load_balancing_) -{ - zk_log = std::move(zk_log_); - Strings hosts_strings; - splitInto<','>(hosts_strings, hosts_string); - - init(implementation_, hosts_strings, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_, get_priority_load_balancing_); -} - -ZooKeeper::ZooKeeper(const Strings & hosts_, const std::string & identity_, int32_t session_timeout_ms_, - int32_t operation_timeout_ms_, const std::string & chroot_, const std::string & implementation_, - std::shared_ptr zk_log_, const GetPriorityForLoadBalancing & get_priority_load_balancing_) -{ - zk_log = std::move(zk_log_); - init(implementation_, hosts_, identity_, session_timeout_ms_, operation_timeout_ms_, chroot_, get_priority_load_balancing_); -} - -struct ZooKeeperArgs -{ - ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) - { - Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_name, keys); - - session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; - operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; - implementation = "zookeeper"; - for (const auto & key : keys) - { - if (startsWith(key, "node")) - { - hosts.push_back( - (config.getBool(config_name + "." + key + ".secure", false) ? "secure://" : "") + - config.getString(config_name + "." + key + ".host") + ":" - + config.getString(config_name + "." + key + ".port", "2181") - ); - } - else if (key == "session_timeout_ms") - { - session_timeout_ms = config.getInt(config_name + "." + key); - } - else if (key == "operation_timeout_ms") - { - operation_timeout_ms = config.getInt(config_name + "." + key); - } - else if (key == "identity") - { - identity = config.getString(config_name + "." + key); - } - else if (key == "root") - { - chroot = config.getString(config_name + "." + key); - } - else if (key == "implementation") - { - implementation = config.getString(config_name + "." + key); - } - else if (key == "zookeeper_load_balancing") - { - String load_balancing_str = config.getString(config_name + "." 
+ key); - /// Use magic_enum to avoid dependency from dbms (`SettingFieldLoadBalancingTraits::fromString(...)`) - auto load_balancing = magic_enum::enum_cast(Poco::toUpper(load_balancing_str)); - if (!load_balancing) - throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unknown load balancing: {}", load_balancing_str); - get_priority_load_balancing.load_balancing = *load_balancing; - } - else - throw KeeperException(std::string("Unknown key ") + key + " in config file", Coordination::Error::ZBADARGUMENTS); - } - - if (!chroot.empty()) - { - if (chroot.front() != '/') - throw KeeperException(std::string("Root path in config file should start with '/', but got ") + chroot, Coordination::Error::ZBADARGUMENTS); - if (chroot.back() == '/') - chroot.pop_back(); - } - - /// init get_priority_load_balancing - get_priority_load_balancing.hostname_differences.resize(hosts.size()); - const String & local_hostname = getFQDNOrHostName(); - for (size_t i = 0; i < hosts.size(); ++i) - { - const String & node_host = hosts[i].substr(0, hosts[i].find_last_of(':')); - get_priority_load_balancing.hostname_differences[i] = DB::getHostNameDifference(local_hostname, node_host); - } - } - - Strings hosts; - std::string identity; - int session_timeout_ms; - int operation_timeout_ms; - std::string chroot; - std::string implementation; - GetPriorityForLoadBalancing get_priority_load_balancing; -}; - -ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name, std::shared_ptr zk_log_) - : zk_log(std::move(zk_log_)) -{ - ZooKeeperArgs args(config, config_name); - init(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot, args.get_priority_load_balancing); -} bool ZooKeeper::configChanged(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) const { - ZooKeeperArgs args(config, config_name); + ZooKeeperArgs new_args(config, config_name); // skip reload testkeeper cause it's for test and data in memory - if (args.implementation == implementation && implementation == "testkeeper") + if (new_args.implementation == args.implementation && args.implementation == "testkeeper") return false; - if (args.get_priority_load_balancing != get_priority_load_balancing) - return true; - - return std::tie(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot, args.get_priority_load_balancing) - != std::tie(implementation, hosts, identity, session_timeout_ms, operation_timeout_ms, chroot, args.get_priority_load_balancing); + return args != new_args; } @@ -318,7 +203,7 @@ Coordination::Error ZooKeeper::getChildrenImpl(const std::string & path, Strings { auto future_result = asyncTryGetChildrenNoThrow(path, watch_callback, list_request_type); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::List), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -385,7 +270,7 @@ Coordination::Error ZooKeeper::createImpl(const std::string & path, const std::s { auto future_result = asyncTryCreateNoThrow(path, data, mode); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if 
(future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Create), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -455,7 +340,7 @@ Coordination::Error ZooKeeper::removeImpl(const std::string & path, int32_t vers auto future_result = asyncTryRemoveNoThrow(path, version); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Remove), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -487,7 +372,7 @@ Coordination::Error ZooKeeper::existsImpl(const std::string & path, Coordination { auto future_result = asyncTryExistsNoThrow(path, watch_callback); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Exists), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -521,7 +406,7 @@ Coordination::Error ZooKeeper::getImpl(const std::string & path, std::string & r { auto future_result = asyncTryGetNoThrow(path, watch_callback); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Get), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -593,7 +478,7 @@ Coordination::Error ZooKeeper::setImpl(const std::string & path, const std::stri { auto future_result = asyncTrySetNoThrow(path, data, version); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Set), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -645,7 +530,7 @@ Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests auto future_result = asyncTryMultiNoThrow(requests); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Multi), requests[0]->getPath())); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -679,7 +564,7 @@ Coordination::Error ZooKeeper::syncImpl(const std::string & path, std::string & { auto future_result = asyncTrySyncNoThrow(path); - if (future_result.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready) + if (future_result.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { impl->finalize(fmt::format("Operation timeout on {} {}", toString(Coordination::OpNum::Sync), path)); return Coordination::Error::ZOPERATIONTIMEOUT; @@ -884,7 +769,7 @@ void ZooKeeper::waitForEphemeralToDisappearIfAny(const 
std::string & path) if (!tryGet(path, content, nullptr, eph_node_disappeared)) return; - int32_t timeout_ms = 3 * session_timeout_ms; + int32_t timeout_ms = 3 * args.session_timeout_ms; if (!eph_node_disappeared->tryWait(timeout_ms)) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists after {}s, probably it's owned by someone else. " @@ -894,7 +779,7 @@ void ZooKeeper::waitForEphemeralToDisappearIfAny(const std::string & path) ZooKeeperPtr ZooKeeper::startNewSession() const { - return std::make_shared(hosts, identity, session_timeout_ms, operation_timeout_ms, chroot, implementation, zk_log, get_priority_load_balancing); + return std::make_shared(args, zk_log); } diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index c9b5dc69499..12aa4471ad7 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -72,24 +72,11 @@ using GetPriorityForLoadBalancing = DB::GetPriorityForLoadBalancing; class ZooKeeper { public: + using Ptr = std::shared_ptr; - /// hosts_string -- comma separated [secure://]host:port list - explicit ZooKeeper(const std::string & hosts_string, const std::string & identity_ = "", - int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS, - int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS, - const std::string & chroot_ = "", - const std::string & implementation_ = "zookeeper", - std::shared_ptr zk_log_ = nullptr, - const GetPriorityForLoadBalancing & get_priority_load_balancing_ = {}); + ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr zk_log_ = nullptr); - explicit ZooKeeper(const Strings & hosts_, const std::string & identity_ = "", - int32_t session_timeout_ms_ = Coordination::DEFAULT_SESSION_TIMEOUT_MS, - int32_t operation_timeout_ms_ = Coordination::DEFAULT_OPERATION_TIMEOUT_MS, - const std::string & chroot_ = "", - const std::string & implementation_ = "zookeeper", - std::shared_ptr zk_log_ = nullptr, - const GetPriorityForLoadBalancing & get_priority_load_balancing_ = {}); /** Config of the form: @@ -337,8 +324,7 @@ public: private: friend class EphemeralNodeHolder; - void init(const std::string & implementation_, const Strings & hosts_, const std::string & identity_, - int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_, const GetPriorityForLoadBalancing & get_priority_load_balancing_); + void init(ZooKeeperArgs args_); /// The following methods don't any throw exceptions but return error codes. 
Coordination::Error createImpl(const std::string & path, const std::string & data, int32_t mode, std::string & path_created); @@ -358,20 +344,13 @@ private: std::unique_ptr impl; - Strings hosts; - std::string identity; - int32_t session_timeout_ms; - int32_t operation_timeout_ms; - std::string chroot; - std::string implementation; + ZooKeeperArgs args; std::mutex mutex; Poco::Logger * log = nullptr; std::shared_ptr zk_log; - GetPriorityForLoadBalancing get_priority_load_balancing; - AtomicStopwatch session_uptime; }; diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.cpp b/src/Common/ZooKeeper/ZooKeeperArgs.cpp new file mode 100644 index 00000000000..fe2f6957490 --- /dev/null +++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp @@ -0,0 +1,108 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} +} + +namespace zkutil +{ + +ZooKeeperArgs::ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const String & config_name) +{ + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_name, keys); + + for (const auto & key : keys) + { + if (key.starts_with("node")) + { + hosts.push_back( + (config.getBool(config_name + "." + key + ".secure", false) ? "secure://" : "") + + config.getString(config_name + "." + key + ".host") + ":" + config.getString(config_name + "." + key + ".port", "2181")); + } + else if (key == "session_timeout_ms") + { + session_timeout_ms = config.getInt(config_name + "." + key); + } + else if (key == "operation_timeout_ms") + { + operation_timeout_ms = config.getInt(config_name + "." + key); + } + else if (key == "connection_timeout_ms") + { + connection_timeout_ms = config.getInt(config_name + "." + key); + } + else if (key == "send_fault_probability") + { + send_fault_probability = config.getDouble(config_name + "." + key); + } + else if (key == "recv_fault_probability") + { + recv_fault_probability = config.getDouble(config_name + "." + key); + } + else if (key == "identity") + { + identity = config.getString(config_name + "." + key); + if (!identity.empty()) + auth_scheme = "digest"; + } + else if (key == "root") + { + chroot = config.getString(config_name + "." + key); + } + else if (key == "implementation") + { + implementation = config.getString(config_name + "." + key); + } + else if (key == "zookeeper_load_balancing") + { + String load_balancing_str = config.getString(config_name + "." 
+ key); + /// Use magic_enum to avoid dependency from dbms (`SettingFieldLoadBalancingTraits::fromString(...)`) + auto load_balancing = magic_enum::enum_cast(Poco::toUpper(load_balancing_str)); + if (!load_balancing) + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Unknown load balancing: {}", load_balancing_str); + get_priority_load_balancing.load_balancing = *load_balancing; + } + else + throw KeeperException(std::string("Unknown key ") + key + " in config file", Coordination::Error::ZBADARGUMENTS); + } + + if (!chroot.empty()) + { + if (chroot.front() != '/') + throw KeeperException( + Coordination::Error::ZBADARGUMENTS, + "Root path in config file should start with '/', but got {}", chroot); + if (chroot.back() == '/') + chroot.pop_back(); + } + + if (session_timeout_ms < 0 || operation_timeout_ms < 0 || connection_timeout_ms < 0) + throw KeeperException("Timeout cannot be negative", Coordination::Error::ZBADARGUMENTS); + + /// init get_priority_load_balancing + get_priority_load_balancing.hostname_differences.resize(hosts.size()); + const String & local_hostname = getFQDNOrHostName(); + for (size_t i = 0; i < hosts.size(); ++i) + { + const String & node_host = hosts[i].substr(0, hosts[i].find_last_of(':')); + get_priority_load_balancing.hostname_differences[i] = DB::getHostNameDifference(local_hostname, node_host); + } +} + +ZooKeeperArgs::ZooKeeperArgs(const String & hosts_string) +{ + splitInto<','>(hosts, hosts_string); +} + +} diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.h b/src/Common/ZooKeeper/ZooKeeperArgs.h new file mode 100644 index 00000000000..b5c7b293506 --- /dev/null +++ b/src/Common/ZooKeeper/ZooKeeperArgs.h @@ -0,0 +1,37 @@ +#pragma once +#include +#include +#include + +namespace Poco::Util +{ + class AbstractConfiguration; +} + +namespace zkutil +{ + +struct ZooKeeperArgs +{ + ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, const String & config_name); + + /// hosts_string -- comma separated [secure://]host:port list + ZooKeeperArgs(const String & hosts_string); + ZooKeeperArgs() = default; + bool operator == (const ZooKeeperArgs &) const = default; + + String implementation = "zookeeper"; + Strings hosts; + String auth_scheme; + String identity; + String chroot; + int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS; + int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS; + int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS; + float send_fault_probability = 0; + float recv_fault_probability = 0; + + DB::GetPriorityForLoadBalancing get_priority_load_balancing; +}; + +} diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index b15126f5701..4ab93d814df 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -898,4 +898,25 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); } +PathMatchResult matchPath(std::string_view path, std::string_view match_to) +{ + using enum PathMatchResult; + + if (path.ends_with('/')) + path.remove_suffix(1); + + if (match_to.ends_with('/')) + match_to.remove_suffix(1); + + auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); + + if (second_it != match_to.end()) + return NOT_MATCH; + + if (first_it == path.end()) + return EXACT; + + return *first_it == '/' ? 
IS_CHILD : NOT_MATCH; +} + } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 53fabf651fa..9a9700b500b 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -554,4 +554,13 @@ private: ZooKeeperRequestFactory(); }; +enum class PathMatchResult +{ + NOT_MATCH, + EXACT, + IS_CHILD +}; + +PathMatchResult matchPath(std::string_view path, std::string_view match_to); + } diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 44f8437f12c..4066407dc59 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -56,5 +56,6 @@ static constexpr int32_t DEFAULT_SESSION_TIMEOUT_MS = 30000; static constexpr int32_t DEFAULT_MIN_SESSION_TIMEOUT_MS = 10000; static constexpr int32_t DEFAULT_MAX_SESSION_TIMEOUT_MS = 100000; static constexpr int32_t DEFAULT_OPERATION_TIMEOUT_MS = 10000; +static constexpr int32_t DEFAULT_CONNECTION_TIMEOUT_MS = 1000; } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 1d0f1fdb1a2..ece6ce7513a 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -276,15 +276,15 @@ void ZooKeeper::read(T & x) Coordination::read(x, *in); } -static void removeRootPath(String & path, const String & root_path) +static void removeRootPath(String & path, const String & chroot) { - if (root_path.empty()) + if (chroot.empty()) return; - if (path.size() <= root_path.size()) - throw Exception("Received path is not longer than root_path", Error::ZDATAINCONSISTENCY); + if (path.size() <= chroot.size()) + throw Exception(Error::ZDATAINCONSISTENCY, "Received path is not longer than chroot"); - path = path.substr(root_path.size()); + path = path.substr(chroot.size()); } ZooKeeper::~ZooKeeper() @@ -308,27 +308,20 @@ ZooKeeper::~ZooKeeper() ZooKeeper::ZooKeeper( const Nodes & nodes, - const String & root_path_, - const String & auth_scheme, - const String & auth_data, - Poco::Timespan session_timeout_, - Poco::Timespan connection_timeout, - Poco::Timespan operation_timeout_, + const zkutil::ZooKeeperArgs & args_, std::shared_ptr zk_log_) - : root_path(root_path_), - session_timeout(session_timeout_), - operation_timeout(std::min(operation_timeout_, session_timeout_)) + : args(args_) { log = &Poco::Logger::get("ZooKeeperClient"); std::atomic_store(&zk_log, std::move(zk_log_)); - if (!root_path.empty()) + if (!args.chroot.empty()) { - if (root_path.back() == '/') - root_path.pop_back(); + if (args.chroot.back() == '/') + args.chroot.pop_back(); } - if (auth_scheme.empty()) + if (args.auth_scheme.empty()) { ACL acl; acl.permissions = ACL::All; @@ -345,10 +338,22 @@ ZooKeeper::ZooKeeper( default_acls.emplace_back(std::move(acl)); } - connect(nodes, connection_timeout); - if (!auth_scheme.empty()) - sendAuth(auth_scheme, auth_data); + /// It makes sense (especially, for async requests) to inject a fault in two places: + /// pushRequest (before request is sent) and receiveEvent (after request was executed). 
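The constructor code that follows arms optional fault injection for send and receive using the configured probabilities. A stand-alone sketch of the same pattern; the concrete distribution type is elided in this diff, so std::bernoulli_distribution over a thread-local RNG is an assumption that matches the emplace(probability) / dist(rng) call shape:

```cpp
#include <optional>
#include <random>
#include <stdexcept>

thread_local std::mt19937_64 rng{std::random_device{}()};

struct FaultyChannel
{
    // Assumed type; only armed when the probability is meaningful (0 < p <= 1).
    std::optional<std::bernoulli_distribution> send_inject_fault;

    explicit FaultyChannel(double send_fault_probability)
    {
        if (0 < send_fault_probability && send_fault_probability <= 1)
            send_inject_fault.emplace(send_fault_probability);
    }

    void send()
    {
        // Each call independently fails with the configured probability.
        if (send_inject_fault && (*send_inject_fault)(rng))
            throw std::runtime_error("Session expired (fault injected on send)");
        // ... the real send would go here ...
    }
};

int main()
{
    FaultyChannel channel(0.01);  // roughly 1% of sends throw
    for (int i = 0; i < 100; ++i)
    {
        try { channel.send(); } catch (const std::exception &) { /* retry/reconnect in real code */ }
    }
}
```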
+ if (0 < args.send_fault_probability && args.send_fault_probability <= 1) + { + send_inject_fault.emplace(args.send_fault_probability); + } + if (0 < args.recv_fault_probability && args.recv_fault_probability <= 1) + { + recv_inject_fault.emplace(args.recv_fault_probability); + } + + connect(nodes, args.connection_timeout_ms * 1000); + + if (!args.auth_scheme.empty()) + sendAuth(args.auth_scheme, args.identity); send_thread = ThreadFromGlobalPool([this] { sendThread(); }); receive_thread = ThreadFromGlobalPool([this] { receiveThread(); }); @@ -364,7 +369,7 @@ void ZooKeeper::connect( Poco::Timespan connection_timeout) { if (nodes.empty()) - throw Exception("No nodes passed to ZooKeeper constructor", Error::ZBADARGUMENTS); + throw Exception(Error::ZBADARGUMENTS, "No nodes passed to ZooKeeper constructor"); static constexpr size_t num_tries = 3; bool connected = false; @@ -394,8 +399,8 @@ void ZooKeeper::connect( socket.connect(node.address, connection_timeout); socket_address = socket.peerAddress(); - socket.setReceiveTimeout(operation_timeout); - socket.setSendTimeout(operation_timeout); + socket.setReceiveTimeout(args.operation_timeout_ms * 1000); + socket.setSendTimeout(args.operation_timeout_ms * 1000); socket.setNoDelay(true); in.emplace(socket); @@ -453,7 +458,7 @@ void ZooKeeper::connect( } message << fail_reasons.str() << "\n"; - throw Exception(message.str(), Error::ZCONNECTIONLOSS); + throw Exception(Error::ZCONNECTIONLOSS, message.str()); } else { @@ -466,7 +471,7 @@ void ZooKeeper::sendHandshake() { int32_t handshake_length = 44; int64_t last_zxid_seen = 0; - int32_t timeout = session_timeout.totalMilliseconds(); + int32_t timeout = args.session_timeout_ms; int64_t previous_session_id = 0; /// We don't support session restore. So previous session_id is always zero. constexpr int32_t passwd_len = 16; std::array passwd {}; @@ -491,7 +496,7 @@ void ZooKeeper::receiveHandshake() read(handshake_length); if (handshake_length != SERVER_HANDSHAKE_LENGTH) - throw Exception("Unexpected handshake length received: " + DB::toString(handshake_length), Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Unexpected handshake length received: {}", handshake_length); read(protocol_version_read); if (protocol_version_read != ZOOKEEPER_PROTOCOL_VERSION) @@ -500,15 +505,15 @@ void ZooKeeper::receiveHandshake() /// It's better for faster failover than just connection drop. /// Implemented in clickhouse-keeper. if (protocol_version_read == KEEPER_PROTOCOL_VERSION_CONNECTION_REJECT) - throw Exception("Keeper server rejected the connection during the handshake. Possibly it's overloaded, doesn't see leader or stale", Error::ZCONNECTIONLOSS); + throw Exception(Error::ZCONNECTIONLOSS, "Keeper server rejected the connection during the handshake. Possibly it's overloaded, doesn't see leader or stale"); else - throw Exception("Unexpected protocol version: " + DB::toString(protocol_version_read), Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Unexpected protocol version: {}", protocol_version_read); } read(timeout); - if (timeout != session_timeout.totalMilliseconds()) + if (timeout != args.session_timeout_ms) /// Use timeout from server. 
- session_timeout = timeout * Poco::Timespan::MILLISECONDS; + args.session_timeout_ms = timeout; read(session_id); read(passwd); @@ -535,17 +540,15 @@ void ZooKeeper::sendAuth(const String & scheme, const String & data) read(err); if (read_xid != AUTH_XID) - throw Exception("Unexpected event received in reply to auth request: " + DB::toString(read_xid), - Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Unexpected event received in reply to auth request: {}", read_xid); int32_t actual_length = in->count() - count_before_event; if (length != actual_length) - throw Exception("Response length doesn't match. Expected: " + DB::toString(length) + ", actual: " + DB::toString(actual_length), - Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. Expected: {}, actual: {}", length, actual_length); if (err != Error::ZOK) - throw Exception("Error received in reply to auth request. Code: " + DB::toString(static_cast(err)) + ". Message: " + String(errorMessage(err)), - Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Error received in reply to auth request. Code: {}. Message: {}", + static_cast(err), errorMessage(err)); } @@ -562,14 +565,14 @@ void ZooKeeper::sendThread() auto prev_bytes_sent = out->count(); auto now = clock::now(); - auto next_heartbeat_time = prev_heartbeat_time + std::chrono::milliseconds(session_timeout.totalMilliseconds() / 3); + auto next_heartbeat_time = prev_heartbeat_time + std::chrono::milliseconds(args.session_timeout_ms / 3); if (next_heartbeat_time > now) { /// Wait for the next request in queue. No more than operation timeout. No more than until next heartbeat time. UInt64 max_wait = std::min( static_cast(std::chrono::duration_cast(next_heartbeat_time - now).count()), - static_cast(operation_timeout.totalMilliseconds())); + static_cast(args.operation_timeout_ms)); RequestInfo info; if (requests_queue.tryPop(info, max_wait)) @@ -594,7 +597,7 @@ void ZooKeeper::sendThread() break; } - info.request->addRootPath(root_path); + info.request->addRootPath(args.chroot); info.request->probably_sent = true; info.request->write(*out); @@ -633,13 +636,13 @@ void ZooKeeper::receiveThread() try { - Int64 waited = 0; + Int64 waited_us = 0; while (!requests_queue.isFinished()) { auto prev_bytes_received = in->count(); clock::time_point now = clock::now(); - UInt64 max_wait = operation_timeout.totalMicroseconds(); + UInt64 max_wait_us = args.operation_timeout_ms * 1000; std::optional earliest_operation; { @@ -648,30 +651,32 @@ void ZooKeeper::receiveThread() { /// Operations are ordered by xid (and consequently, by time). 
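The receive-loop change in this hunk makes the time units explicit (the _us suffix) while keeping the same arithmetic: each poll gets at most the operation timeout, shrunk to the earliest in-flight request's deadline, and silent idling is bounded by the session timeout. A simplified, stand-alone sketch of that logic; poll() and the plain function arguments are stand-ins, not the real socket/operation structures:

```cpp
#include <chrono>
#include <cstdint>
#include <iostream>
#include <optional>
#include <stdexcept>

using Clock = std::chrono::steady_clock;

static bool poll(uint64_t /*max_wait_us*/) { return false; }  // stand-in: pretend nothing arrived

static void receiveLoop(int32_t operation_timeout_ms, int32_t session_timeout_ms,
                        std::optional<Clock::time_point> earliest_operation_start)
{
    int64_t waited_us = 0;
    while (true)
    {
        // Default budget for one poll() is the operation timeout...
        uint64_t max_wait_us = static_cast<uint64_t>(operation_timeout_ms) * 1000;

        // ...but with a request in flight, shrink it to that request's deadline.
        if (earliest_operation_start)
        {
            auto deadline = *earliest_operation_start + std::chrono::milliseconds(operation_timeout_ms);
            auto now = Clock::now();
            if (now > deadline)
                throw std::runtime_error("Operation timeout (deadline already expired)");
            max_wait_us = std::chrono::duration_cast<std::chrono::microseconds>(deadline - now).count();
        }

        if (poll(max_wait_us))
        {
            waited_us = 0;  // progress was made, reset the idle counter
        }
        else
        {
            if (earliest_operation_start)
                throw std::runtime_error("Operation timeout (no response)");
            waited_us += static_cast<int64_t>(max_wait_us);
            if (waited_us >= static_cast<int64_t>(session_timeout_ms) * 1000)
                throw std::runtime_error("Nothing is received in session timeout");
        }
    }
}

int main()
{
    try { receiveLoop(10000, 30000, std::nullopt); }
    catch (const std::exception & e) { std::cout << e.what() << '\n'; }
}
```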
earliest_operation = operations.begin()->second; - auto earliest_operation_deadline = earliest_operation->time + std::chrono::microseconds(operation_timeout.totalMicroseconds()); + auto earliest_operation_deadline = earliest_operation->time + std::chrono::microseconds(args.operation_timeout_ms * 1000); if (now > earliest_operation_deadline) - throw Exception("Operation timeout (deadline already expired) for path: " + earliest_operation->request->getPath(), Error::ZOPERATIONTIMEOUT); - max_wait = std::chrono::duration_cast(earliest_operation_deadline - now).count(); + throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (deadline already expired) for path: {}", + earliest_operation->request->getPath()); + max_wait_us = std::chrono::duration_cast(earliest_operation_deadline - now).count(); } } - if (in->poll(max_wait)) + if (in->poll(max_wait_us)) { if (requests_queue.isFinished()) break; receiveEvent(); - waited = 0; + waited_us = 0; } else { if (earliest_operation) { - throw Exception("Operation timeout (no response) for request " + toString(earliest_operation->request->getOpNum()) + " for path: " + earliest_operation->request->getPath(), Error::ZOPERATIONTIMEOUT); + throw Exception(Error::ZOPERATIONTIMEOUT, "Operation timeout (no response) for request {} for path: {}", + earliest_operation->request->getOpNum(), earliest_operation->request->getPath()); } - waited += max_wait; - if (waited >= session_timeout.totalMicroseconds()) - throw Exception("Nothing is received in session timeout", Error::ZOPERATIONTIMEOUT); + waited_us += max_wait_us; + if (waited_us >= args.session_timeout_ms * 1000) + throw Exception(Error::ZOPERATIONTIMEOUT, "Nothing is received in session timeout"); } @@ -703,10 +708,13 @@ void ZooKeeper::receiveEvent() ZooKeeperResponsePtr response; UInt64 elapsed_ms = 0; + if (unlikely(recv_inject_fault) && recv_inject_fault.value()(thread_local_rng)) + throw Exception(Error::ZSESSIONEXPIRED, "Session expired (fault injected on recv)"); + if (xid == PING_XID) { if (err != Error::ZOK) - throw Exception("Received error in heartbeat response: " + String(errorMessage(err)), Error::ZRUNTIMEINCONSISTENCY); + throw Exception(Error::ZRUNTIMEINCONSISTENCY, "Received error in heartbeat response: {}", errorMessage(err)); response = std::make_shared(); } @@ -781,7 +789,7 @@ void ZooKeeper::receiveEvent() else { response->readImpl(*in); - response->removeRootPath(root_path); + response->removeRootPath(args.chroot); } /// Instead of setting the watch in sendEvent, set it in receiveEvent because need to check the response. /// The watch shouldn't be set if the node does not exist and it will never exist like sequential ephemeral nodes. @@ -801,9 +809,9 @@ void ZooKeeper::receiveEvent() { CurrentMetrics::add(CurrentMetrics::ZooKeeperWatch); - /// The key of wathces should exclude the root_path + /// The key of wathces should exclude the args.chroot String req_path = request_info.request->getPath(); - removeRootPath(req_path, root_path); + removeRootPath(req_path, args.chroot); std::lock_guard lock(watches_mutex); watches[req_path].emplace_back(std::move(request_info.watch)); } @@ -811,7 +819,7 @@ void ZooKeeper::receiveEvent() int32_t actual_length = in->count() - count_before_event; if (length != actual_length) - throw Exception("Response length doesn't match. Expected: " + DB::toString(length) + ", actual: " + DB::toString(actual_length), Error::ZMARSHALLINGERROR); + throw Exception(Error::ZMARSHALLINGERROR, "Response length doesn't match. 
Expected: {}, actual: {}", length, actual_length); logOperationIfNeeded(request_info.request, response, /* finalize= */ false, elapsed_ms); //-V614 } @@ -1035,9 +1043,9 @@ void ZooKeeper::pushRequest(RequestInfo && info) { info.request->xid = next_xid.fetch_add(1); if (info.request->xid == CLOSE_XID) - throw Exception("xid equal to close_xid", Error::ZSESSIONEXPIRED); + throw Exception(Error::ZSESSIONEXPIRED, "xid equal to close_xid"); if (info.request->xid < 0) - throw Exception("XID overflow", Error::ZSESSIONEXPIRED); + throw Exception(Error::ZSESSIONEXPIRED, "XID overflow"); if (auto * multi_request = dynamic_cast(info.request.get())) { @@ -1046,12 +1054,15 @@ void ZooKeeper::pushRequest(RequestInfo && info) } } - if (!requests_queue.tryPush(std::move(info), operation_timeout.totalMilliseconds())) + if (unlikely(send_inject_fault) && send_inject_fault.value()(thread_local_rng)) + throw Exception(Error::ZSESSIONEXPIRED, "Session expired (fault injected on send)"); + + if (!requests_queue.tryPush(std::move(info), args.operation_timeout_ms)) { if (requests_queue.isFinished()) - throw Exception("Session expired", Error::ZSESSIONEXPIRED); + throw Exception(Error::ZSESSIONEXPIRED, "Session expired"); - throw Exception("Cannot push request to queue within operation timeout", Error::ZOPERATIONTIMEOUT); + throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push request to queue within operation timeout"); } } catch (...) @@ -1079,7 +1090,7 @@ void ZooKeeper::initApiVersion() }; get(keeper_api_version_path, std::move(callback), {}); - if (future.wait_for(std::chrono::milliseconds(operation_timeout.totalMilliseconds())) != std::future_status::ready) + if (future.wait_for(std::chrono::milliseconds(args.operation_timeout_ms)) != std::future_status::ready) { LOG_TRACE(log, "Failed to get API version: timeout"); return; @@ -1220,7 +1231,7 @@ void ZooKeeper::list( if (keeper_api_version < Coordination::KeeperApiVersion::WITH_FILTERED_LIST) { if (list_request_type != ListRequestType::ALL) - throw Exception("Filtered list request type cannot be used because it's not supported by the server", Error::ZBADARGUMENTS); + throw Exception(Error::ZBADARGUMENTS, "Filtered list request type cannot be used because it's not supported by the server"); request = std::make_shared(); } @@ -1299,8 +1310,8 @@ void ZooKeeper::close() RequestInfo request_info; request_info.request = std::make_shared(std::move(request)); - if (!requests_queue.tryPush(std::move(request_info), operation_timeout.totalMilliseconds())) - throw Exception("Cannot push close request to queue within operation timeout", Error::ZOPERATIONTIMEOUT); + if (!requests_queue.tryPush(std::move(request_info), args.operation_timeout_ms)) + throw Exception(Error::ZOPERATIONTIMEOUT, "Cannot push close request to queue within operation timeout"); ProfileEvents::increment(ProfileEvents::ZooKeeperClose); } diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index e00250c1517..6b70f8bc753 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include /** ZooKeeper C++ library, a replacement for libzookeeper. 
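initApiVersion above follows an "async request plus bounded wait" pattern: the callback fulfils a promise and the caller waits at most operation_timeout_ms before falling back to a default. A minimal sketch of that pattern under stated assumptions; asyncGet() is a hypothetical stand-in for the real client call:

```cpp
#include <chrono>
#include <cstdint>
#include <functional>
#include <future>
#include <iostream>
#include <memory>
#include <thread>

// Stand-in for an asynchronous client call that eventually invokes the callback.
static void asyncGet(std::function<void(int)> callback)
{
    std::thread([cb = std::move(callback)]
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(50));  // pretend network latency
        cb(42);
    }).detach();
}

int main()
{
    int32_t operation_timeout_ms = 10000;

    auto promise = std::make_shared<std::promise<int>>();
    auto future = promise->get_future();

    // The callback only fulfils the promise; the waiting policy stays with the caller.
    asyncGet([promise](int value) { promise->set_value(value); });

    if (future.wait_for(std::chrono::milliseconds(operation_timeout_ms)) != std::future_status::ready)
    {
        std::cout << "timed out, keeping the default\n";  // the diff logs and returns here
        return 0;
    }
    std::cout << "got value " << future.get() << '\n';
}
```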
@@ -111,12 +113,7 @@ public: */ ZooKeeper( const Nodes & nodes, - const String & root_path, - const String & auth_scheme, - const String & auth_data, - Poco::Timespan session_timeout_, - Poco::Timespan connection_timeout, - Poco::Timespan operation_timeout_, + const zkutil::ZooKeeperArgs & args_, std::shared_ptr zk_log_); ~ZooKeeper() override; @@ -201,11 +198,12 @@ public: void setZooKeeperLog(std::shared_ptr zk_log_); private: - String root_path; ACLs default_acls; - Poco::Timespan session_timeout; - Poco::Timespan operation_timeout; + zkutil::ZooKeeperArgs args; + + std::optional send_inject_fault; + std::optional recv_inject_fault; Poco::Net::StreamSocket socket; /// To avoid excessive getpeername(2) calls. diff --git a/src/Common/ZooKeeper/examples/zkutil_test_async.cpp b/src/Common/ZooKeeper/examples/zkutil_test_async.cpp index 17258c529ff..eafa0e27691 100644 --- a/src/Common/ZooKeeper/examples/zkutil_test_async.cpp +++ b/src/Common/ZooKeeper/examples/zkutil_test_async.cpp @@ -5,7 +5,7 @@ int main(int argc, char ** argv) try { - zkutil::ZooKeeper zookeeper{"localhost:2181"}; + zkutil::ZooKeeper zookeeper{zkutil::ZooKeeperArgs("localhost:2181")}; auto nodes = zookeeper.getChildren("/tmp"); diff --git a/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp b/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp index 490c834eab9..095a0dde2e7 100644 --- a/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp +++ b/src/Common/ZooKeeper/examples/zkutil_test_commands.cpp @@ -16,7 +16,7 @@ try return 1; } - ZooKeeper zk(argv[1], "", 5000); + ZooKeeper zk{zkutil::ZooKeeperArgs(argv[1])}; std::cout << "create path" << std::endl; zk.create("/test", "old", zkutil::CreateMode::Persistent); diff --git a/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp b/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp index 09b94a34b78..021f444386a 100644 --- a/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp +++ b/src/Common/ZooKeeper/examples/zkutil_test_commands_new_lib.cpp @@ -40,7 +40,8 @@ try } - ZooKeeper zk(nodes, {}, {}, {}, {5, 0}, {0, 50000}, {0, 50000}, nullptr); + zkutil::ZooKeeperArgs args; + ZooKeeper zk(nodes, args, nullptr); Poco::Event event(true); diff --git a/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp new file mode 100644 index 00000000000..5a989e5932f --- /dev/null +++ b/src/Common/ZooKeeper/tests/gtest_zookeeper.cpp @@ -0,0 +1,15 @@ +#include + +#include + +TEST(ZooKeeperTest, TestMatchPath) +{ + using namespace Coordination; + + ASSERT_EQ(matchPath("/path/file", "/path"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/path/file", "/path/"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/path/file", "/"), PathMatchResult::IS_CHILD); + ASSERT_EQ(matchPath("/", "/"), PathMatchResult::EXACT); + ASSERT_EQ(matchPath("/path", "/path/"), PathMatchResult::EXACT); + ASSERT_EQ(matchPath("/path/", "/path"), PathMatchResult::EXACT); +} diff --git a/src/Common/tests/gtest_merge_configs.cpp b/src/Common/tests/gtest_merge_configs.cpp index 293de60f7f6..2cc7f4a99af 100644 --- a/src/Common/tests/gtest_merge_configs.cpp +++ b/src/Common/tests/gtest_merge_configs.cpp @@ -43,11 +43,8 @@ clickhouse: text_log: database: system table: text_log - partition_by: - "@remove": "1" - engine: - - "@replace" : "1" - - "ENGINE MergeTree" + partition_by: {"@remove": "1"} + engine: "ENGINE MergeTree" flush_interval_milliseconds: 7500 level: debug )YAML"; @@ -112,11 +109,8 @@ clickhouse: text_log : database: system 
table: text_log - partition_by: - "@remove": "1" - engine: - - "@replace" : "1" - - "ENGINE MergeTree" + partition_by: {"@remove": "1"} + engine: "ENGINE MergeTree" flush_interval_milliseconds: 7500 level: debug )YAML"; diff --git a/src/Common/tests/gtest_yaml_parser.cpp b/src/Common/tests/gtest_yaml_parser.cpp index 8457e6fd4e1..4ffd66ae3a1 100644 --- a/src/Common/tests/gtest_yaml_parser.cpp +++ b/src/Common/tests/gtest_yaml_parser.cpp @@ -13,40 +13,12 @@ using namespace DB; -TEST(Common, YamlParserInvalidFile) +TEST(YamlParser, InvalidFile) { ASSERT_THROW(YAMLParser::parse("some-non-existing-file.yaml"), Exception); } -TEST(Common, YamlParserProcessKeysList) -{ - auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML( -operator: - access_management: "1" - networks: - - ip: "10.1.6.168" - - ip: "::1" - - ip: "127.0.0.1" -)YAML"); - SCOPE_EXIT({ yaml_file->remove(); }); - - Poco::AutoPtr xml = YAMLParser::parse(yaml_file->path()); - auto *p_node = xml->getNodeByPath("/clickhouse"); - EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG( - -1 - -10.1.6.168 -::1 -127.0.0.1 - - - -)CONFIG"); - -} - -TEST(Common, YamlParserProcessValuesList) +TEST(YamlParser, ProcessValuesList) { auto yaml_file = getFileWithContents("values-list.yaml", R"YAML( rules: @@ -75,4 +47,141 @@ rules: )CONFIG"); } + +TEST(YamlParser, ProcessKeysList) +{ + auto yaml_file = getFileWithContents("keys-list.yaml", R"YAML( +operator: + access_management: 1 + networks: + ip: + - 10.1.6.168 + - ::1 + - 127.0.0.1 +)YAML"); + SCOPE_EXIT({ yaml_file->remove(); }); + + Poco::AutoPtr xml = YAMLParser::parse(yaml_file->path()); + auto *p_node = xml->getNodeByPath("/clickhouse"); + EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG( + +1 + +10.1.6.168 +::1 +127.0.0.1 + + + +)CONFIG"); + +} + +TEST(YamlParser, ProcessListAttributes) +{ + auto yaml_file = getFileWithContents("list_attributes.yaml", R"YAML( +seq: + - "@attr1": x + - k1: val1 + k2: val2 + "@attr2": y + - k3: val3 + "@attr3": z +)YAML"); + SCOPE_EXIT({ yaml_file->remove(); }); + + Poco::AutoPtr xml = YAMLParser::parse(yaml_file->path()); + auto *p_node = xml->getNodeByPath("/clickhouse"); + EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG( + + +val1 +val2 + + +val3 + + +)CONFIG"); + +} + +TEST(YamlParser, ProcessMapAttributes) +{ + auto yaml_file = getFileWithContents("map_attributes.yaml", R"YAML( +map: + "@attr1": x + k1: val1 + k2: val2 + "@attr2": y + k3: val3 + "@attr3": z +)YAML"); + SCOPE_EXIT({ yaml_file->remove(); }); + + Poco::AutoPtr xml = YAMLParser::parse(yaml_file->path()); + auto *p_node = xml->getNodeByPath("/clickhouse"); + EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG( + +val1 +val2 +val3 + + +)CONFIG"); + +} + +TEST(YamlParser, ClusterDef) +{ + auto yaml_file = getFileWithContents("cluster_def.yaml", R"YAML( +test_cluster: + shard: + - internal_replication: false + replica: + - host: 127.0.0.1 + port: 9000 + - host: 127.0.0.2 + port: 9000 + - internal_replication: true + replica: + - host: 127.0.0.3 + port: 9000 + - host: 127.0.0.4 + port: 9000 +)YAML"); + SCOPE_EXIT({ yaml_file->remove(); }); + + Poco::AutoPtr xml = YAMLParser::parse(yaml_file->path()); + auto *p_node = xml->getNodeByPath("/clickhouse"); + EXPECT_EQ(xmlNodeAsString(p_node), R"CONFIG( + + +false + +127.0.0.1 +9000 + + +127.0.0.2 +9000 + + + +true + +127.0.0.3 +9000 + + +127.0.0.4 +9000 + + + + +)CONFIG"); + +} + #endif diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index d87d0f8b4ee..3e215076871 100644 --- 
a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -116,8 +116,8 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( } }; - ISerialization::SubstreamPath path; - column_type->getDefaultSerialization()->enumerateStreams(path, callback, column_type); + auto serialization = column_type->getDefaultSerialization(); + serialization->enumerateStreams(callback, column_type); if (!result_codec) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find any substream with data type for type {}. It's a bug", column_type->getName()); diff --git a/src/Compression/LZ4_decompress_faster.cpp b/src/Compression/LZ4_decompress_faster.cpp index 269c1a681c5..f2084f34274 100644 --- a/src/Compression/LZ4_decompress_faster.cpp +++ b/src/Compression/LZ4_decompress_faster.cpp @@ -478,11 +478,7 @@ template <> void inline copyOverlap<32, true>(UInt8 * op, const UInt8 *& match, /// See also https://stackoverflow.com/a/30669632 template -bool NO_INLINE decompressImpl( - const char * const source, - char * const dest, - size_t source_size, - size_t dest_size) +bool NO_INLINE decompressImpl(const char * const source, char * const dest, size_t source_size, size_t dest_size) { const UInt8 * ip = reinterpret_cast(source); UInt8 * op = reinterpret_cast(dest); @@ -515,6 +511,18 @@ bool NO_INLINE decompressImpl( const unsigned token = *ip++; length = token >> 4; + + UInt8 * copy_end; + size_t real_length; + + /// It might be true fairly often for well-compressed columns. + /// ATST it may hurt performance in other cases because this condition is hard to predict (especially if the number of zeros is ~50%). + /// In such cases this `if` will significantly increase number of mispredicted instructions. But seems like it results in a + /// noticeable slowdown only for implementations with `copy_amount` > 8. Probably because they use havier instructions. + if constexpr (copy_amount == 8) + if (length == 0) + goto decompress_match; + if (length == 0x0F) { if (unlikely(ip + 1 >= input_end)) @@ -524,7 +532,7 @@ bool NO_INLINE decompressImpl( /// Copy literals. - UInt8 * copy_end = op + length; + copy_end = op + length; /// input: Hello, world /// ^-ip @@ -541,7 +549,7 @@ bool NO_INLINE decompressImpl( return false; // Due to implementation specifics the copy length is always a multiple of copy_amount - size_t real_length = 0; + real_length = 0; static_assert(copy_amount == 8 || copy_amount == 16 || copy_amount == 32); if constexpr (copy_amount == 8) @@ -552,9 +560,9 @@ bool NO_INLINE decompressImpl( real_length = (((length >> 5) + 1) * 32); if (unlikely(ip + real_length >= input_end + ADDITIONAL_BYTES_AT_END_OF_BUFFER)) - return false; + return false; - wildCopy(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after buffer. + wildCopy(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after buffer. 
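The LZ4 change above adds a `goto decompress_match` fast path when the token's literal-length nibble is zero, and it has to hoist `copy_end` and `real_length` to uninitialized declarations because a jump may not cross a variable's initialization in C++. A minimal, stand-alone illustration of that constraint (a hypothetical decode() function, not the real decompressor):

```cpp
#include <cstddef>

int decode(const unsigned char * ip, unsigned char * op, bool skip_literals)
{
    unsigned char * copy_end;   // declared early and left uninitialized on purpose,
    size_t real_length;         // so the goto below does not jump over an initializer

    if (skip_literals)
        goto decompress_match;  // fast path: the token's literal-length nibble was 0

    copy_end = op + 8;          // assigned here, not initialized at the declaration
    real_length = 8;
    // ... copy `real_length` literal bytes, advancing up to `copy_end` ...
    op = copy_end;

decompress_match:
    // ... decode the match offset/length part of the sequence ...
    (void)ip;
    return 0;
}
```

Declaring `unsigned char * copy_end = op + 8;` below the `goto` instead would be rejected with a diagnostic along the lines of "jump to label crosses initialization of 'copy_end'", which is why the diff moves the declarations up and keeps the assignments where they were.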
if (copy_end == output_end) return true; @@ -562,6 +570,8 @@ bool NO_INLINE decompressImpl( ip += length; op = copy_end; + decompress_match: + if (unlikely(ip + 1 >= input_end)) return false; diff --git a/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp b/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp index 9e4d4a3241f..3e3d0e164fe 100644 --- a/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp +++ b/src/Compression/fuzzers/encrypted_decompress_fuzzer.cpp @@ -58,7 +58,7 @@ Fuzzing data consists of: else: read_key() if (7): - read_nonce (simillar to read_key) + read_nonce (similar to read_key) if (8): set current_key diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 1e3f37b617f..fe4050eb685 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -13,8 +13,10 @@ #include #include #include -#include "Coordination/KeeperContext.h" +#include #include +#include + namespace DB { @@ -146,33 +148,6 @@ namespace } } -namespace -{ - -enum class PathMatchResult -{ - NOT_MATCH, - EXACT, - IS_CHILD -}; - -PathMatchResult matchPath(const std::string_view path, const std::string_view match_to) -{ - using enum PathMatchResult; - - auto [first_it, second_it] = std::mismatch(path.begin(), path.end(), match_to.begin(), match_to.end()); - - if (second_it != match_to.end()) - return NOT_MATCH; - - if (first_it == path.end()) - return EXACT; - - return *first_it == '/' ? IS_CHILD : NOT_MATCH; -} - -} - void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context) { writeBinary(static_cast(snapshot.version), out); @@ -217,7 +192,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr const auto & path = it->key; // write only the root system path because of digest - if (matchPath(path.toView(), keeper_system_path) == PathMatchResult::IS_CHILD) + if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD) { ++it; continue; @@ -365,8 +340,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial KeeperStorage::Node node{}; readNode(node, in, current_version, storage.acl_map); - using enum PathMatchResult; - auto match_result = matchPath(path, keeper_system_path); + using enum Coordination::PathMatchResult; + auto match_result = Coordination::matchPath(path, keeper_system_path); const std::string error_msg = fmt::format("Cannot read node on path {} from a snapshot because it is used as a system node", path); if (match_result == IS_CHILD) diff --git a/src/Coordination/KeeperSnapshotManager.h b/src/Coordination/KeeperSnapshotManager.h index 4984e54f15f..c00ce9421e7 100644 --- a/src/Coordination/KeeperSnapshotManager.h +++ b/src/Coordination/KeeperSnapshotManager.h @@ -27,7 +27,7 @@ enum SnapshotVersion : uint8_t static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V5; -/// What is stored in binary shapsnot +/// What is stored in binary snapshot struct SnapshotDeserializationResult { /// Storage diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 397cd2c0c71..cfe614e1287 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -879,7 +879,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr path_created += seq_num_str.str(); } - if (path_created.starts_with(keeper_system_path)) + if 
(Coordination::matchPath(path_created, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to create a node inside the internal Keeper path ({}) which is not allowed. Path: {}", keeper_system_path, path_created); @@ -1049,7 +1049,7 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr std::vector new_deltas; - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to delete an internal Keeper path ({}) which is not allowed", request.path); @@ -1203,7 +1203,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce std::vector new_deltas; - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); @@ -1472,7 +1472,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr { Coordination::ZooKeeperSetACLRequest & request = dynamic_cast(*zk_request); - if (request.path.starts_with(keeper_system_path)) + if (Coordination::matchPath(request.path, keeper_system_path) != Coordination::PathMatchResult::NOT_MATCH) { auto error_msg = fmt::format("Trying to update an internal Keeper path ({}) which is not allowed", request.path); @@ -2192,7 +2192,7 @@ void KeeperStorage::rollbackRequest(int64_t rollback_zxid, bool allow_missing) } catch (...) { - LOG_FATAL(&Poco::Logger::get("KeeperStorage"), "Failed to rollback log. Terminating to avoid incosistencies"); + LOG_FATAL(&Poco::Logger::get("KeeperStorage"), "Failed to rollback log. 
Terminating to avoid inconsistencies"); std::terminate(); } } diff --git a/src/Coordination/SessionExpiryQueue.h b/src/Coordination/SessionExpiryQueue.h index 8581800834d..862ec35e2f6 100644 --- a/src/Coordination/SessionExpiryQueue.h +++ b/src/Coordination/SessionExpiryQueue.h @@ -53,7 +53,7 @@ public: /// Session was actually removed bool remove(int64_t session_id); - /// Update session expiry time (must be called on hearbeats) + /// Update session expiry time (must be called on heartbeats) void addNewSessionOrUpdate(int64_t session_id, int64_t timeout_ms); /// Get all expired sessions diff --git a/src/Coordination/tests/gtest_coordination.cpp b/src/Coordination/tests/gtest_coordination.cpp index 493e76ee5fc..fa6bfca7c7a 100644 --- a/src/Coordination/tests/gtest_coordination.cpp +++ b/src/Coordination/tests/gtest_coordination.cpp @@ -1339,7 +1339,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint nuraft::async_result::handler_type when_done = [&snapshot_created] (bool & ret, nuraft::ptr &/*exception*/) { snapshot_created = ret; - std::cerr << "Snapshot finised\n"; + std::cerr << "Snapshot finished\n"; }; state_machine->create_snapshot(s, when_done); @@ -2141,6 +2141,38 @@ TEST_P(CoordinationTest, TestCurrentApiVersion) EXPECT_EQ(keeper_version, static_cast(current_keeper_api_version)); } +TEST_P(CoordinationTest, TestSystemNodeModify) +{ + using namespace Coordination; + int64_t zxid{0}; + + // On INIT we abort when a system path is modified + keeper_context->server_state = KeeperContext::Phase::RUNNING; + KeeperStorage storage{500, "", keeper_context}; + const auto assert_create = [&](const std::string_view path, const auto expected_code) + { + auto request = std::make_shared(); + request->path = path; + storage.preprocessRequest(request, 0, 0, zxid); + auto responses = storage.processRequest(request, 0, zxid); + ASSERT_FALSE(responses.empty()); + + const auto & response = responses[0]; + ASSERT_EQ(response.response->error, expected_code) << "Unexpected error for path " << path; + + ++zxid; + }; + + assert_create("/keeper", Error::ZBADARGUMENTS); + assert_create("/keeper/with_child", Error::ZBADARGUMENTS); + assert_create(DB::keeper_api_version_path, Error::ZBADARGUMENTS); + + assert_create("/keeper_map", Error::ZOK); + assert_create("/keeper1", Error::ZOK); + assert_create("/keepe", Error::ZOK); + assert_create("/keeper1/test", Error::ZOK); +} + INSTANTIATE_TEST_SUITE_P(CoordinationTestSuite, CoordinationTest, ::testing::ValuesIn(std::initializer_list{ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e9c8c65bb4e..c07be4b92da 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -149,7 +149,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(UInt64, parallel_distributed_insert_select, 0, "Process distributed INSERT SELECT query in the same cluster on local tables on every shard; if set to 1 - SELECT is executed on each shard; if set to 2 - SELECT and INSERT are executed on each shard", 0) \ M(UInt64, distributed_group_by_no_merge, 0, "If 1, Do not merge aggregation states from different servers for distributed queries (shards will process query up to the Complete stage, initiator just proxies the data from the shards). If 2 the initiator will apply ORDER BY and LIMIT stages (it is not in case when shard process query up to the Complete stage)", 0) \ - M(UInt64, distributed_push_down_limit, 1, "If 1, LIMIT will be applied on each shard separatelly. 
Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \ + M(UInt64, distributed_push_down_limit, 1, "If 1, LIMIT will be applied on each shard separately. Usually you don't need to use it, since this will be done automatically if it is possible, i.e. for simple query SELECT FROM LIMIT.", 0) \ M(Bool, optimize_distributed_group_by_sharding_key, true, "Optimize GROUP BY sharding_key queries (by avoiding costly aggregation on the initiator server).", 0) \ M(UInt64, optimize_skip_unused_shards_limit, 1000, "Limit for number of sharding key values, turns off optimize_skip_unused_shards if the limit is reached", 0) \ M(Bool, optimize_skip_unused_shards, false, "Assumes that data is distributed by sharding_key. Optimization to skip unused shards if SELECT query filters by sharding_key.", 0) \ @@ -213,7 +213,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, insert_deduplicate, true, "For INSERT queries in the replicated table, specifies that deduplication of insertings blocks should be performed", 0) \ \ - M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled.", 0) \ + M(UInt64Auto, insert_quorum, 0, "For INSERT queries in the replicated table, wait writing for the specified number of replicas and linearize the addition of the data. 0 - disabled, 'auto' - use majority", 0) \ M(Milliseconds, insert_quorum_timeout, 600000, "If the quorum of replicas did not meet in specified time (in milliseconds), exception will be thrown and insertion is aborted.", 0) \ M(Bool, insert_quorum_parallel, true, "For quorum INSERT queries - enable to make parallel inserts without linearizability", 0) \ M(UInt64, select_sequential_consistency, 0, "For SELECT queries from the replicated table, throw an exception if the replica does not have a chunk written with the quorum; do not read the parts that have not yet been written with the quorum.", 0) \ @@ -346,7 +346,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, max_temporary_non_const_columns, 0, "Similar to the 'max_temporary_columns' setting but applies only to non-constant columns. This makes sense, because constant columns are cheap and it is reasonable to allow more of them.", 0) \ \ M(UInt64, max_subquery_depth, 100, "If a query has more than specified number of nested subqueries, throw an exception. This allows you to have a sanity check to protect the users of your cluster from going insane with their queries.", 0) \ - M(UInt64, max_pipeline_depth, 1000, "If a query has more than specified stages in the query pipeline, throw an exception. Pipeline has stages for every relational operator. This allows to limit the complexity of the queries.", 0) \ + M(UInt64, max_analyze_depth, 5000, "Maximum number of analyses performed by interpreter.", 0) \ M(UInt64, max_ast_depth, 1000, "Maximum depth of query syntax tree. Checked after parsing.", 0) \ M(UInt64, max_ast_elements, 50000, "Maximum size of query syntax tree in number of nodes. 
Checked after parsing.", 0) \ M(UInt64, max_expanded_ast_elements, 500000, "Maximum size of query syntax tree in number of nodes after expansion of aliases and the asterisk.", 0) \ @@ -366,6 +366,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, "If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread.", 0) \ M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \ M(UInt64, join_on_disk_max_files_to_merge, 64, "For MergeJoin on disk set how much files it's allowed to sort simultaneously. Then this value bigger then more memory used and then less disk I/O needed. Minimum is 2.", 0) \ + M(UInt64, max_rows_in_set_to_optimize_join, 100'000, "Maximal size of the set to filter joined tables by each other row sets before joining. 0 - disable.", 0) \ + \ M(Bool, compatibility_ignore_collation_in_create_table, true, "Compatibility ignore collation in create table", 0) \ \ M(String, temporary_files_codec, "LZ4", "Set compression codec for temporary files (sort and join on disk). I.e. LZ4, NONE.", 0) \ @@ -554,7 +556,9 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) M(UInt64, external_storage_connect_timeout_sec, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connect timeout in seconds. Now supported only for MySQL", 0) \ M(UInt64, external_storage_rw_timeout_sec, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "Read/write timeout in seconds. Now supported only for MySQL", 0) \ \ - M(UnionMode, union_default_mode, UnionMode::Unspecified, "Set default Union Mode in SelectWithUnion query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without Union Mode will throw exception.", 0) \ + M(SetOperationMode, union_default_mode, SetOperationMode::Unspecified, "Set default mode in UNION query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ + M(SetOperationMode, intersect_default_mode, SetOperationMode::ALL, "Set default mode in INTERSECT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ + M(SetOperationMode, except_default_mode, SetOperationMode::ALL, "Set default mode in EXCEPT query. Possible values: empty string, 'ALL', 'DISTINCT'. If empty, query without mode will throw exception.", 0) \ M(Bool, optimize_aggregators_of_group_by_keys, true, "Eliminates min/max/any/anyLast aggregators of GROUP BY keys in SELECT section", 0) \ M(Bool, optimize_group_by_function_keys, true, "Eliminates functions of other keys in GROUP BY section", 0) \ M(Bool, legacy_column_name_of_tuple_literal, false, "List all names of element of large tuple literals in their column names instead of hash. This settings exists only for compatibility reasons. 
It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher.", 0) \ @@ -674,6 +678,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) MAKE_OBSOLETE(M, UInt64, background_message_broker_schedule_pool_size, 16) \ MAKE_OBSOLETE(M, UInt64, background_distributed_schedule_pool_size, 16) \ MAKE_OBSOLETE(M, DefaultDatabaseEngine, default_database_engine, DefaultDatabaseEngine::Atomic) \ + MAKE_OBSOLETE(M, UInt64, max_pipeline_depth, 0) \ + /** The section above is for obsolete settings. Do not add anything there. */ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 8d0e69f4b29..be2def2c01a 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -89,7 +89,7 @@ static std::map sett {"22.3", {{"cast_ipv4_ipv6_default_on_conversion_error", true, false, "Make functions cast(value, 'IPv4') and cast(value, 'IPv6') behave same as toIPv4 and toIPv6 functions"}}}, {"21.12", {{"stream_like_engine_allow_direct_select", true, false, "Do not allow direct select for Kafka/RabbitMQ/FileLog by default"}}}, {"21.9", {{"output_format_decimal_trailing_zeros", true, false, "Do not output trailing zeros in text representation of Decimal types by default for better looking output"}, - {"use_hedged_requests", false, true, "Enable Hedged Requests feature bu default"}}}, + {"use_hedged_requests", false, true, "Enable Hedged Requests feature by default"}}}, {"21.7", {{"legacy_column_name_of_tuple_literal", true, false, "Add this setting only for compatibility reasons. It makes sense to set to 'true', while doing rolling update of cluster from version lower than 21.7 to higher"}}}, {"21.5", {{"async_socket_for_remote", false, true, "Fix all problems and turn on asynchronous reads from socket for remote queries by default again"}}}, {"21.3", {{"async_socket_for_remote", true, false, "Turn off asynchronous reads from socket for remote queries because of some problems"}, diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 616026520db..daa678c0141 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -114,10 +114,10 @@ IMPLEMENT_SETTING_MULTI_ENUM(MySQLDataTypesSupport, ErrorCodes::UNKNOWN_MYSQL_DA {"date2Date32", MySQLDataTypesSupport::DATE2DATE32}, {"date2String", MySQLDataTypesSupport::DATE2STRING}}) -IMPLEMENT_SETTING_ENUM(UnionMode, ErrorCodes::UNKNOWN_UNION, - {{"", UnionMode::Unspecified}, - {"ALL", UnionMode::ALL}, - {"DISTINCT", UnionMode::DISTINCT}}) +IMPLEMENT_SETTING_ENUM(SetOperationMode, ErrorCodes::UNKNOWN_UNION, + {{"", SetOperationMode::Unspecified}, + {"ALL", SetOperationMode::ALL}, + {"DISTINCT", SetOperationMode::DISTINCT}}) IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS, {{"none", DistributedDDLOutputMode::NONE}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index 308d53ff690..b5e908defc7 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -130,14 +130,14 @@ enum class MySQLDataTypesSupport DECLARE_SETTING_MULTI_ENUM(MySQLDataTypesSupport) -enum class UnionMode +enum class SetOperationMode { - Unspecified = 0, // Query UNION without UnionMode will throw exception - ALL, // Query UNION without UnionMode -> SELECT ... UNION ALL SELECT ... - DISTINCT // Query UNION without UnionMode -> SELECT ... UNION DISTINCT SELECT ... 
+ Unspecified = 0, // Query UNION / EXCEPT / INTERSECT without SetOperationMode will throw exception + ALL, // Query UNION / EXCEPT / INTERSECT without SetOperationMode -> SELECT ... UNION / EXCEPT / INTERSECT ALL SELECT ... + DISTINCT // Query UNION / EXCEPT / INTERSECT without SetOperationMode -> SELECT ... UNION / EXCEPT / INTERSECT DISTINCT SELECT ... }; -DECLARE_SETTING_ENUM(UnionMode) +DECLARE_SETTING_ENUM(SetOperationMode) enum class DistributedDDLOutputMode { @@ -153,7 +153,7 @@ enum class HandleKafkaErrorMode { DEFAULT = 0, // Ignore errors with threshold. STREAM, // Put errors to stream in the virtual column named ``_error. - /*FIXED_SYSTEM_TABLE, Put errors to in a fixed system table likey system.kafka_errors. This is not implemented now. */ + /*FIXED_SYSTEM_TABLE, Put errors to in a fixed system table likely system.kafka_errors. This is not implemented now. */ /*CUSTOM_SYSTEM_TABLE, Put errors to in a custom system table. This is not implemented now. */ }; diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index cb3bab5c653..0c29c263fe7 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -84,18 +84,20 @@ void IDataType::forEachSubcolumn( { for (size_t i = 0; i < subpath.size(); ++i) { - if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, i + 1)) + size_t prefix_len = i + 1; + if (!subpath[i].visited && ISerialization::hasSubcolumnForPath(subpath, prefix_len)) { - auto name = ISerialization::getSubcolumnNameForStream(subpath, i + 1); - auto subdata = ISerialization::createFromPath(subpath, i); + auto name = ISerialization::getSubcolumnNameForStream(subpath, prefix_len); + auto subdata = ISerialization::createFromPath(subpath, prefix_len); callback(subpath, name, subdata); } subpath[i].visited = true; } }; - SubstreamPath path; - data.serialization->enumerateStreams(path, callback_with_data, data); + ISerialization::EnumerateStreamsSettings settings; + settings.position_independent_encoding = false; + data.serialization->enumerateStreams(settings, callback_with_data, data); } template @@ -118,33 +120,38 @@ Ptr IDataType::getForSubcolumn( return res; } +bool IDataType::hasSubcolumn(const String & subcolumn_name) const +{ + return tryGetSubcolumnType(subcolumn_name) != nullptr; +} + DataTypePtr IDataType::tryGetSubcolumnType(const String & subcolumn_name) const { - SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, false); } DataTypePtr IDataType::getSubcolumnType(const String & subcolumn_name) const { - SubstreamData data = { getDefaultSerialization(), getPtr(), nullptr, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withType(getPtr()); return getForSubcolumn(subcolumn_name, data, &SubstreamData::type, true); } ColumnPtr IDataType::tryGetSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withColumn(column); return getForSubcolumn(subcolumn_name, data, &SubstreamData::column, false); } ColumnPtr IDataType::getSubcolumn(const String & subcolumn_name, const ColumnPtr & column) const { - SubstreamData data = { getDefaultSerialization(), nullptr, column, nullptr }; + auto data = SubstreamData(getDefaultSerialization()).withColumn(column); return getForSubcolumn(subcolumn_name, 
data, &SubstreamData::column, true); } SerializationPtr IDataType::getSubcolumnSerialization(const String & subcolumn_name, const SerializationPtr & serialization) const { - SubstreamData data = { serialization, nullptr, nullptr, nullptr }; + auto data = SubstreamData(serialization); return getForSubcolumn(subcolumn_name, data, &SubstreamData::serialization, true); } @@ -154,7 +161,7 @@ Names IDataType::getSubcolumnNames() const forEachSubcolumn([&](const auto &, const auto & name, const auto &) { res.push_back(name); - }, { getDefaultSerialization(), nullptr, nullptr, nullptr }); + }, SubstreamData(getDefaultSerialization())); return res; } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 0174ca426c2..c93128ced95 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -79,6 +79,8 @@ public: /// Data type id. It's used for runtime type checks. virtual TypeIndex getTypeId() const = 0; + bool hasSubcolumn(const String & subcolumn_name) const; + DataTypePtr tryGetSubcolumnType(const String & subcolumn_name) const; DataTypePtr getSubcolumnType(const String & subcolumn_name) const; diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index e6e6fdba5dc..da0142a5d57 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -73,24 +73,24 @@ String ISerialization::SubstreamPath::toString() const } void ISerialization::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - path.push_back(Substream::Regular); - path.back().data = data; - callback(path); - path.pop_back(); + settings.path.push_back(Substream::Regular); + settings.path.back().data = data; + callback(settings.path); + settings.path.pop_back(); } -void ISerialization::enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const +void ISerialization::enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type, + const ColumnPtr & column) const { - enumerateStreams(path, callback, {getPtr(), nullptr, nullptr, nullptr}); -} - -void ISerialization::enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const -{ - enumerateStreams(path, callback, {getPtr(), type, nullptr, nullptr}); + EnumerateStreamsSettings settings; + auto data = SubstreamData(getPtr()).withType(type).withColumn(column); + enumerateStreams(settings, callback, data); } void ISerialization::serializeBinaryBulk(const IColumn & column, WriteBuffer &, size_t, size_t) const @@ -184,7 +184,7 @@ String ISerialization::getFileNameForStream(const NameAndTypePair & column, cons return getFileNameForStream(column.getNameInStorage(), path); } -static size_t isOffsetsOfNested(const ISerialization::SubstreamPath & path) +bool isOffsetsOfNested(const ISerialization::SubstreamPath & path) { if (path.empty()) return false; @@ -287,10 +287,13 @@ bool ISerialization::hasSubcolumnForPath(const SubstreamPath & path, size_t pref ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) { - assert(prefix_len < path.size()); + assert(prefix_len <= path.size()); + if (prefix_len == 0) + return {}; - SubstreamData res = path[prefix_len].data; - for (ssize_t i = static_cast(prefix_len) - 1; i >= 0; --i) + ssize_t last_elem = prefix_len - 1; + auto res = path[last_elem].data; + for (ssize_t i = last_elem - 1; i >= 0; --i) 
{ const auto & creator = path[i].creator; if (creator) diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index b5d2082631e..1193c15b939 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -101,6 +101,30 @@ public: struct SubstreamData { + SubstreamData() = default; + SubstreamData(SerializationPtr serialization_) + : serialization(std::move(serialization_)) + { + } + + SubstreamData & withType(DataTypePtr type_) + { + type = std::move(type_); + return *this; + } + + SubstreamData & withColumn(ColumnPtr column_) + { + column = std::move(column_); + return *this; + } + + SubstreamData & withSerializationInfo(SerializationInfoPtr serialization_info_) + { + serialization_info = std::move(serialization_info_); + return *this; + } + SerializationPtr serialization; DataTypePtr type; ColumnPtr column; @@ -164,16 +188,22 @@ public: using StreamCallback = std::function; + struct EnumerateStreamsSettings + { + SubstreamPath path; + bool position_independent_encoding = true; + }; + virtual void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath & path) const; - void enumerateStreams(const StreamCallback & callback, SubstreamPath && path) const { enumerateStreams(callback, path); } - void enumerateStreams(const StreamCallback & callback) const { enumerateStreams(callback, {}); } - - void enumerateStreams(SubstreamPath & path, const StreamCallback & callback, const DataTypePtr & type) const; + /// Enumerate streams with default settings. + void enumerateStreams( + const StreamCallback & callback, + const DataTypePtr & type = nullptr, + const ColumnPtr & column = nullptr) const; using OutputStreamGetter = std::function; using InputStreamGetter = std::function; @@ -375,4 +405,6 @@ State * ISerialization::checkAndGetState(const StatePtr & state) const return state_concrete; } +bool isOffsetsOfNested(const ISerialization::SubstreamPath & path); + } diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp index 625f2dce0b0..48980febd9d 100644 --- a/src/DataTypes/Serializations/SerializationArray.cpp +++ b/src/DataTypes/Serializations/SerializationArray.cpp @@ -155,30 +155,30 @@ namespace return column_offsets; } -} -ColumnPtr arrayOffsetsToSizes(const IColumn & column) -{ - const auto & column_offsets = assert_cast(column); - MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); - - if (column_offsets.empty()) - return column_sizes; - - const auto & offsets_data = column_offsets.getData(); - auto & sizes_data = assert_cast(*column_sizes).getData(); - - sizes_data.resize(offsets_data.size()); - - IColumn::Offset prev_offset = 0; - for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + ColumnPtr arrayOffsetsToSizes(const IColumn & column) { - auto current_offset = offsets_data[i]; - sizes_data[i] = current_offset - prev_offset; - prev_offset = current_offset; - } + const auto & column_offsets = assert_cast(column); + MutableColumnPtr column_sizes = column_offsets.cloneEmpty(); - return column_sizes; + if (column_offsets.empty()) + return column_sizes; + + const auto & offsets_data = column_offsets.getData(); + auto & sizes_data = assert_cast(*column_sizes).getData(); + + sizes_data.resize(offsets_data.size()); + + IColumn::Offset prev_offset = 0; + 
for (size_t i = 0, size = offsets_data.size(); i < size; ++i) + { + auto current_offset = offsets_data[i]; + sizes_data[i] = current_offset - prev_offset; + prev_offset = current_offset; + } + + return column_sizes; + } } DataTypePtr SerializationArray::SubcolumnCreator::create(const DataTypePtr & prev) const @@ -197,41 +197,42 @@ ColumnPtr SerializationArray::SubcolumnCreator::create(const ColumnPtr & prev) c } void SerializationArray::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * type_array = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_array = data.column ? &assert_cast(*data.column) : nullptr; - auto offsets_column = column_array ? column_array->getOffsetsPtr() : nullptr; + auto offsets = column_array ? column_array->getOffsetsPtr() : nullptr; - path.push_back(Substream::ArraySizes); - path.back().data = - { + auto offsets_serialization = std::make_shared( std::make_shared>(), - "size" + std::to_string(getArrayLevel(path)), false), - data.type ? std::make_shared() : nullptr, - offsets_column ? arrayOffsetsToSizes(*offsets_column) : nullptr, - data.serialization_info, - }; + "size" + std::to_string(getArrayLevel(settings.path)), false); - callback(path); + auto offsets_column = offsets && !settings.position_independent_encoding + ? arrayOffsetsToSizes(*offsets) + : offsets; - path.back() = Substream::ArrayElements; - path.back().data = data; - path.back().creator = std::make_shared(offsets_column); + settings.path.push_back(Substream::ArraySizes); + settings.path.back().data = SubstreamData(offsets_serialization) + .withType(type_array ? std::make_shared() : nullptr) + .withColumn(std::move(offsets_column)) + .withSerializationInfo(data.serialization_info); - SubstreamData next_data = - { - nested, - type_array ? type_array->getNestedType() : nullptr, - column_array ? column_array->getDataPtr() : nullptr, - data.serialization_info, - }; + callback(settings.path); - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + settings.path.back() = Substream::ArrayElements; + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(offsets); + + auto next_data = SubstreamData(nested) + .withType(type_array ? type_array->getNestedType() : nullptr) + .withColumn(column_array ? 
column_array->getDataPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationArray::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationArray.h b/src/DataTypes/Serializations/SerializationArray.h index 3769f8a4513..84e37acbaad 100644 --- a/src/DataTypes/Serializations/SerializationArray.h +++ b/src/DataTypes/Serializations/SerializationArray.h @@ -36,7 +36,7 @@ public: */ void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; @@ -79,6 +79,4 @@ private: }; }; -ColumnPtr arrayOffsetsToSizes(const IColumn & column); - } diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp index 8e19c5a740b..dfe0188c8e7 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp +++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp @@ -41,30 +41,26 @@ SerializationLowCardinality::SerializationLowCardinality(const DataTypePtr & dic } void SerializationLowCardinality::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * column_lc = data.column ? &getColumnLowCardinality(*data.column) : nullptr; - SubstreamData dict_data = - { - dict_inner_serialization, - data.type ? dictionary_type : nullptr, - column_lc ? column_lc->getDictionary().getNestedColumn() : nullptr, - data.serialization_info, - }; + settings.path.push_back(Substream::DictionaryKeys); + auto dict_data = SubstreamData(dict_inner_serialization) + .withType(data.type ? dictionary_type : nullptr) + .withColumn(column_lc ? 
column_lc->getDictionary().getNestedColumn() : nullptr) + .withSerializationInfo(data.serialization_info); - path.push_back(Substream::DictionaryKeys); - path.back().data = dict_data; + settings.path.back().data = dict_data; + dict_inner_serialization->enumerateStreams(settings, callback, dict_data); - dict_inner_serialization->enumerateStreams(path, callback, dict_data); + settings.path.back() = Substream::DictionaryIndexes; + settings.path.back().data = data; - path.back() = Substream::DictionaryIndexes; - path.back().data = data; - - callback(path); - path.pop_back(); + callback(settings.path); + settings.path.pop_back(); } struct KeysSerializationVersion diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.h b/src/DataTypes/Serializations/SerializationLowCardinality.h index 96e3a297d6a..cc090f2044e 100644 --- a/src/DataTypes/Serializations/SerializationLowCardinality.h +++ b/src/DataTypes/Serializations/SerializationLowCardinality.h @@ -18,7 +18,7 @@ public: explicit SerializationLowCardinality(const DataTypePtr & dictionary_type); void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp index ea22070b5b1..e46bb480d14 100644 --- a/src/DataTypes/Serializations/SerializationMap.cpp +++ b/src/DataTypes/Serializations/SerializationMap.cpp @@ -257,19 +257,16 @@ void SerializationMap::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c } void SerializationMap::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - SubstreamData next_data = - { - nested, - data.type ? assert_cast(*data.type).getNestedType() : nullptr, - data.column ? assert_cast(*data.column).getNestedColumnPtr() : nullptr, - data.serialization_info, - }; + auto next_data = SubstreamData(nested) + .withType(data.type ? assert_cast(*data.type).getNestedType() : nullptr) + .withColumn(data.column ? 
assert_cast(*data.column).getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - nested->enumerateStreams(path, callback, next_data); + nested->enumerateStreams(settings, callback, next_data); } void SerializationMap::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationMap.h b/src/DataTypes/Serializations/SerializationMap.h index 93b3e179499..42f99ca7991 100644 --- a/src/DataTypes/Serializations/SerializationMap.h +++ b/src/DataTypes/Serializations/SerializationMap.h @@ -32,7 +32,7 @@ public: void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationNamed.cpp b/src/DataTypes/Serializations/SerializationNamed.cpp index 097e9cedfbe..4dac4b3a922 100644 --- a/src/DataTypes/Serializations/SerializationNamed.cpp +++ b/src/DataTypes/Serializations/SerializationNamed.cpp @@ -4,16 +4,16 @@ namespace DB { void SerializationNamed::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - addToPath(path); - path.back().data = data; - path.back().creator = std::make_shared(name, escape_delimiter); + addToPath(settings.path); + settings.path.back().data = data; + settings.path.back().creator = std::make_shared(name, escape_delimiter); - nested_serialization->enumerateStreams(path, callback, data); - path.pop_back(); + nested_serialization->enumerateStreams(settings, callback, data); + settings.path.pop_back(); } void SerializationNamed::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationNamed.h b/src/DataTypes/Serializations/SerializationNamed.h index 343b96c16e3..2a2c7c0dfc7 100644 --- a/src/DataTypes/Serializations/SerializationNamed.h +++ b/src/DataTypes/Serializations/SerializationNamed.h @@ -26,7 +26,7 @@ public: const String & getElementName() const { return name; } void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationNullable.cpp b/src/DataTypes/Serializations/SerializationNullable.cpp index a6273deaa30..560b73bc827 100644 --- a/src/DataTypes/Serializations/SerializationNullable.cpp +++ b/src/DataTypes/Serializations/SerializationNullable.cpp @@ -38,38 +38,35 @@ ColumnPtr SerializationNullable::SubcolumnCreator::create(const ColumnPtr & prev } void SerializationNullable::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * type_nullable = data.type ? &assert_cast(*data.type) : nullptr; const auto * column_nullable = data.column ? &assert_cast(*data.column) : nullptr; - path.push_back(Substream::NullMap); - path.back().data = - { - std::make_shared(std::make_shared>(), "null", false), - type_nullable ? std::make_shared() : nullptr, - column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr, - data.serialization_info, - }; + auto null_map_serialization = std::make_shared(std::make_shared>(), "null", false); - callback(path); + settings.path.push_back(Substream::NullMap); + auto null_map_data = SubstreamData(null_map_serialization) + .withType(type_nullable ? 
std::make_shared() : nullptr) + .withColumn(column_nullable ? column_nullable->getNullMapColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - path.back() = Substream::NullableElements; - path.back().creator = std::make_shared(path.back().data.column); - path.back().data = data; + settings.path.back().data = null_map_data; + callback(settings.path); - SubstreamData next_data = - { - nested, - type_nullable ? type_nullable->getNestedType() : nullptr, - column_nullable ? column_nullable->getNestedColumnPtr() : nullptr, - data.serialization_info, - }; + settings.path.back() = Substream::NullableElements; + settings.path.back().creator = std::make_shared(null_map_data.column); + settings.path.back().data = data; - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + auto next_data = SubstreamData(nested) + .withType(type_nullable ? type_nullable->getNestedType() : nullptr) + .withColumn(column_nullable ? column_nullable->getNestedColumnPtr() : nullptr) + .withSerializationInfo(data.serialization_info); + + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationNullable::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationNullable.h b/src/DataTypes/Serializations/SerializationNullable.h index e6e0e4f33c2..ea3958065e7 100644 --- a/src/DataTypes/Serializations/SerializationNullable.h +++ b/src/DataTypes/Serializations/SerializationNullable.h @@ -14,7 +14,7 @@ public: explicit SerializationNullable(const SerializationPtr & nested_) : nested(nested_) {} void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationSparse.cpp b/src/DataTypes/Serializations/SerializationSparse.cpp index 6fa40e460c5..855bdfa1b3e 100644 --- a/src/DataTypes/Serializations/SerializationSparse.cpp +++ b/src/DataTypes/Serializations/SerializationSparse.cpp @@ -148,39 +148,33 @@ ColumnPtr SerializationSparse::SubcolumnCreator::create(const ColumnPtr & prev) } void SerializationSparse::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { const auto * column_sparse = data.column ? &assert_cast(*data.column) : nullptr; - size_t column_size = column_sparse ? column_sparse->size() : 0; - path.push_back(Substream::SparseOffsets); - path.back().data = - { - std::make_shared>(), - data.type ? std::make_shared() : nullptr, - column_sparse ? column_sparse->getOffsetsPtr() : nullptr, - data.serialization_info, - }; + settings.path.push_back(Substream::SparseOffsets); + auto offsets_data = SubstreamData(std::make_shared>()) + .withType(data.type ? std::make_shared() : nullptr) + .withColumn(column_sparse ? column_sparse->getOffsetsPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - callback(path); + settings.path.back().data = offsets_data; + callback(settings.path); - path.back() = Substream::SparseElements; - path.back().creator = std::make_shared(path.back().data.column, column_size); - path.back().data = data; + settings.path.back() = Substream::SparseElements; + settings.path.back().creator = std::make_shared(offsets_data.column, column_size); + settings.path.back().data = data; - SubstreamData next_data = - { - nested, - data.type, - column_sparse ? 
column_sparse->getValuesPtr() : nullptr, - data.serialization_info, - }; + auto next_data = SubstreamData(nested) + .withType(data.type) + .withColumn(column_sparse ? column_sparse->getValuesPtr() : nullptr) + .withSerializationInfo(data.serialization_info); - nested->enumerateStreams(path, callback, next_data); - path.pop_back(); + nested->enumerateStreams(settings, callback, next_data); + settings.path.pop_back(); } void SerializationSparse::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationSparse.h b/src/DataTypes/Serializations/SerializationSparse.h index 54ab4853360..dc2f63c5a05 100644 --- a/src/DataTypes/Serializations/SerializationSparse.h +++ b/src/DataTypes/Serializations/SerializationSparse.h @@ -28,7 +28,7 @@ public: Kind getKind() const override { return Kind::SPARSE; } virtual void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 8138b15c9af..5663ff86dd6 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -283,7 +283,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, } void SerializationTuple::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { @@ -293,15 +293,12 @@ void SerializationTuple::enumerateStreams( for (size_t i = 0; i < elems.size(); ++i) { - SubstreamData next_data = - { - elems[i], - type_tuple ? type_tuple->getElement(i) : nullptr, - column_tuple ? column_tuple->getColumnPtr(i) : nullptr, - info_tuple ? info_tuple->getElementInfo(i) : nullptr, - }; + auto next_data = SubstreamData(elems[i]) + .withType(type_tuple ? type_tuple->getElement(i) : nullptr) + .withColumn(column_tuple ? column_tuple->getColumnPtr(i) : nullptr) + .withSerializationInfo(info_tuple ? info_tuple->getElementInfo(i) : nullptr); - elems[i]->enumerateStreams(path, callback, next_data); + elems[i]->enumerateStreams(settings, callback, next_data); } } diff --git a/src/DataTypes/Serializations/SerializationTuple.h b/src/DataTypes/Serializations/SerializationTuple.h index e82d8473645..d1caeb73dad 100644 --- a/src/DataTypes/Serializations/SerializationTuple.h +++ b/src/DataTypes/Serializations/SerializationTuple.h @@ -34,7 +34,7 @@ public: /** Each sub-column in a tuple is serialized in separate stream. 
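Editor's note: throughout these serializations the enumerateStreams signature changes from a bare SubstreamPath to an EnumerateStreamsSettings object that owns the path and carries flags such as position_independent_encoding. The traversal shape stays the same: push a substream onto settings.path, invoke the callback, recurse into the nested serialization, pop. The sketch below shows only that shape; all class names are simplified stand-ins for the real hierarchy.

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Substream { std::string name; };
using SubstreamPath = std::vector<Substream>;

struct EnumerateStreamsSettings
{
    SubstreamPath path;
    bool position_independent_encoding = true;  // mirrors the flag added in the diff
};

using StreamCallback = std::function<void(const SubstreamPath &)>;

struct ISerialization
{
    virtual ~ISerialization() = default;

    // Leaf behaviour: report a single "regular" stream at the current path.
    virtual void enumerateStreams(EnumerateStreamsSettings & settings, const StreamCallback & callback) const
    {
        settings.path.push_back({"Regular"});
        callback(settings.path);
        settings.path.pop_back();
    }
};

// A wrapper (think Array or Nullable): adds its own substream, then recurses into the nested one.
struct SerializationArrayLike : ISerialization
{
    std::shared_ptr<ISerialization> nested = std::make_shared<ISerialization>();

    void enumerateStreams(EnumerateStreamsSettings & settings, const StreamCallback & callback) const override
    {
        settings.path.push_back({"ArraySizes"});
        callback(settings.path);                  // the offsets/sizes stream

        settings.path.back() = {"ArrayElements"}; // reuse the slot for the elements branch
        nested->enumerateStreams(settings, callback);
        settings.path.pop_back();
    }
};

int main()
{
    SerializationArrayLike serialization;
    EnumerateStreamsSettings settings;
    serialization.enumerateStreams(settings, [](const SubstreamPath & path)
    {
        std::string full;
        for (const auto & s : path)
            full += "/" + s.name;
        std::cout << full << '\n';   // prints /ArraySizes, then /ArrayElements/Regular
    });
}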
*/ void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/DataTypes/Serializations/SerializationWrapper.cpp b/src/DataTypes/Serializations/SerializationWrapper.cpp index 271c53dfcf1..7c50c1c6e26 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.cpp +++ b/src/DataTypes/Serializations/SerializationWrapper.cpp @@ -5,11 +5,11 @@ namespace DB { void SerializationWrapper::enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const { - nested_serialization->enumerateStreams(path, callback, data); + nested_serialization->enumerateStreams(settings, callback, data); } void SerializationWrapper::serializeBinaryBulkStatePrefix( diff --git a/src/DataTypes/Serializations/SerializationWrapper.h b/src/DataTypes/Serializations/SerializationWrapper.h index 43fc7e9914a..d010c6b5314 100644 --- a/src/DataTypes/Serializations/SerializationWrapper.h +++ b/src/DataTypes/Serializations/SerializationWrapper.h @@ -21,7 +21,7 @@ public: Kind getKind() const override { return nested_serialization->getKind(); } void enumerateStreams( - SubstreamPath & path, + EnumerateStreamsSettings & settings, const StreamCallback & callback, const SubstreamData & data) const override; diff --git a/src/Dictionaries/HierarchyDictionariesUtils.h b/src/Dictionaries/HierarchyDictionariesUtils.h index 621290f40f9..c7508ddd220 100644 --- a/src/Dictionaries/HierarchyDictionariesUtils.h +++ b/src/Dictionaries/HierarchyDictionariesUtils.h @@ -180,7 +180,7 @@ namespace detail /** Returns array with UInt8 represent if key from in_keys array is in hierarchy of key from keys column. * If value in result array is 1 that means key from in_keys array is in hierarchy of key from - * keys array with same index, 0 therwise. + * keys array with same index, 0 otherwise. * For getting hierarchy implementation uses getKeysHierarchy function. * * Not: keys size must be equal to in_keys_size. diff --git a/src/Dictionaries/MongoDBDictionarySource.cpp b/src/Dictionaries/MongoDBDictionarySource.cpp index 1ede0ec5045..a735f426ec7 100644 --- a/src/Dictionaries/MongoDBDictionarySource.cpp +++ b/src/Dictionaries/MongoDBDictionarySource.cpp @@ -118,7 +118,7 @@ MongoDBDictionarySource::MongoDBDictionarySource( Poco::URI poco_uri(uri); // Parse database from URI. This is required for correctness -- the - // cursor is created using database name and colleciton name, so we have + // cursor is created using database name and collection name, so we have // to specify them properly. db = poco_uri.getPath(); // getPath() may return a leading slash, remove it. diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index edc3c34fe81..e19495a27a3 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -44,15 +44,6 @@ struct AttributeConfiguration using AttributeNameToConfiguration = std::unordered_map; -/// Get value from field and convert it to string. -/// Also remove quotes from strings. 
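Editor's note: the local getFieldAsString helper removed just below is superseded by the shared convertFieldToString, but the behaviour both implement is the same: plain strings are emitted without surrounding quotes, everything else goes through the generic field-to-string visitor. A rough standalone equivalent, with std::variant standing in for ClickHouse's Field and an integer-only fallback instead of FieldVisitorToString:

#include <iostream>
#include <string>
#include <variant>

// Simplified stand-in for DB::Field: either a string or an integer.
using Field = std::variant<std::string, long long>;

// Strings are returned as-is (no quoting), other values are formatted generically.
std::string convertFieldToString(const Field & field)
{
    if (const auto * s = std::get_if<std::string>(&field))
        return *s;                                  // matches the "remove quotes from strings" comment
    return std::to_string(std::get<long long>(field));
}

int main()
{
    std::cout << convertFieldToString(Field{std::string{"hello"}}) << '\n';  // hello
    std::cout << convertFieldToString(Field{42LL}) << '\n';                  // 42
}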
-String getFieldAsString(const Field & field) -{ - if (field.getType() == Field::Types::Which::String) - return field.get(); - return applyVisitor(FieldVisitorToString(), field); -} - String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_attr) { if (!dict_attr->expression) @@ -61,7 +52,7 @@ String getAttributeExpression(const ASTDictionaryAttributeDeclaration * dict_att /// EXPRESSION PROPERTY should be expression or string String expression_str; if (const auto * literal = dict_attr->expression->as(); literal && literal->value.getType() == Field::Types::String) - expression_str = getFieldAsString(literal->value); + expression_str = convertFieldToString(literal->value); else expression_str = queryToString(dict_attr->expression); @@ -244,7 +235,7 @@ void buildAttributeExpressionIfNeeded( root->appendChild(expression_element); } -/** Transofrms single dictionary attribute to configuration +/** Transforms single dictionary attribute to configuration * third_column UInt8 DEFAULT 2 EXPRESSION rand() % 100 * 77 * to * @@ -275,7 +266,7 @@ void buildSingleAttribute( AutoPtr null_value_element(doc->createElement("null_value")); String null_value_str; if (dict_attr->default_value) - null_value_str = getFieldAsString(dict_attr->default_value->as()->value); + null_value_str = convertFieldToString(dict_attr->default_value->as()->value); AutoPtr null_value(doc->createTextNode(null_value_str)); null_value_element->appendChild(null_value); attribute_element->appendChild(null_value_element); @@ -452,7 +443,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments( } else if (const auto * literal = pair->second->as()) { - AutoPtr value(doc->createTextNode(getFieldAsString(literal->value))); + AutoPtr value(doc->createTextNode(convertFieldToString(literal->value))); current_xml_element->appendChild(value); } else if (const auto * list = pair->second->as()) @@ -473,7 +464,7 @@ void buildConfigurationFromFunctionWithKeyValueArguments( Field value; result->get(0, value); - AutoPtr text_value(doc->createTextNode(getFieldAsString(value))); + AutoPtr text_value(doc->createTextNode(convertFieldToString(value))); current_xml_element->appendChild(text_value); } else @@ -519,7 +510,7 @@ void buildSourceConfiguration( { AutoPtr setting_change_element(doc->createElement(name)); settings_element->appendChild(setting_change_element); - AutoPtr setting_value(doc->createTextNode(getFieldAsString(value))); + AutoPtr setting_value(doc->createTextNode(convertFieldToString(value))); setting_change_element->appendChild(setting_value); } } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index bfbdba0e050..81f33b27056 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -239,7 +239,16 @@ public: } /// For one local path there might be multiple remote paths in case of Log family engines. 
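Editor's note: the IDisk.h change just below replaces a std::pair alias with a named struct, so call sites read local_path and objects instead of first and second, and the extra common_prefix_for_objects field can travel with them. A generic illustration of the same idea (member names follow the diff, element types are simplified):

#include <iostream>
#include <string>
#include <vector>

using StoredObjects = std::vector<std::string>;  // simplified: just remote object keys

struct LocalPathWithObjectStoragePaths
{
    std::string local_path;
    std::string common_prefix_for_objects;
    StoredObjects objects;
};

int main()
{
    std::vector<LocalPathWithObjectStoragePaths> paths_map;

    // Brace-init with named fields instead of an opaque pair<first, second>.
    paths_map.push_back({"store/abc/file.bin", "cluster1/data/", {"xyz/rnd1", "xyz/rnd2"}});

    for (const auto & entry : paths_map)
        std::cout << entry.local_path << " -> " << entry.objects.size() << " object(s)\n";
}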
- using LocalPathWithObjectStoragePaths = std::pair; + struct LocalPathWithObjectStoragePaths + { + std::string local_path; + std::string common_prefix_for_objects; + StoredObjects objects; + + LocalPathWithObjectStoragePaths( + const std::string & local_path_, const std::string & common_prefix_for_objects_, StoredObjects && objects_) + : local_path(local_path_), common_prefix_for_objects(common_prefix_for_objects_), objects(std::move(objects_)) {} + }; virtual void getRemotePathsRecursive(const String &, std::vector &) { diff --git a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp index b5347280497..96ae50bbbcf 100644 --- a/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/ReadBufferFromAzureBlobStorage.cpp @@ -29,7 +29,7 @@ ReadBufferFromAzureBlobStorage::ReadBufferFromAzureBlobStorage( size_t max_single_download_retries_, bool use_external_buffer_, size_t read_until_position_) - : ReadBufferFromFileBase(read_settings_.remote_fs_buffer_size, nullptr, 0) + : ReadBufferFromFileBase(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size, nullptr, 0) , blob_container_client(blob_container_client_) , path(path_) , max_single_read_retries(max_single_read_retries_) diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp index 3f7b378dee4..26947af23ec 100644 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp +++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp @@ -1,6 +1,7 @@ #include "ReadIndirectBufferFromRemoteFS.h" #include +#include namespace DB @@ -13,8 +14,8 @@ namespace ErrorCodes ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS( - std::shared_ptr impl_) - : ReadBufferFromFileBase(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0) + std::shared_ptr impl_, const ReadSettings & settings) + : ReadBufferFromFileBase(settings.remote_fs_buffer_size, nullptr, 0) , impl(impl_) { } diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h index fcd463a92c8..996e69296a6 100644 --- a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h +++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h @@ -9,6 +9,7 @@ namespace DB { class ReadBufferFromRemoteFSGather; +struct ReadSettings; /** * Reads data from S3/HDFS/Web using stored paths in metadata. 
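Editor's note: two related buffer-sizing changes appear here. Read buffers that are driven through an external buffer are now created with an internal buffer size of 0, so memory is not allocated twice, and ReadIndirectBufferFromRemoteFS takes its size from ReadSettings instead of the compile-time default. The selection is just a conditional in the constructor's base-class call; a trimmed-down sketch with stub classes:

#include <cstddef>
#include <iostream>
#include <vector>

struct ReadSettings
{
    size_t remote_fs_buffer_size = 1 * 1024 * 1024;  // illustrative default, not the real value
};

class ReadBufferBase
{
public:
    explicit ReadBufferBase(size_t size) : memory(size) {}
    size_t internalBufferSize() const { return memory.size(); }
private:
    std::vector<char> memory;
};

class RemoteReadBuffer : public ReadBufferBase
{
public:
    RemoteReadBuffer(const ReadSettings & settings, bool use_external_buffer)
        // With an external buffer the caller supplies the memory, so allocate nothing here.
        : ReadBufferBase(use_external_buffer ? 0 : settings.remote_fs_buffer_size)
    {}
};

int main()
{
    ReadSettings settings;
    std::cout << RemoteReadBuffer(settings, false).internalBufferSize() << '\n';  // 1048576
    std::cout << RemoteReadBuffer(settings, true).internalBufferSize() << '\n';   // 0
}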
@@ -18,7 +19,7 @@ class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase { public: - explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr impl_); + explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr impl_, const ReadSettings & settings); off_t seek(off_t offset_, int whence) override; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 40f68b86e9d..09e5c3d32dc 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -112,7 +112,7 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL } else { - auto buf = std::make_unique(std::move(reader_impl)); + auto buf = std::make_unique(std::move(reader_impl), disk_read_settings); return std::make_unique(std::move(buf), settings_ptr->min_bytes_for_seek); } } diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 00ef01645cd..db8f90e777d 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -127,7 +127,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std:: { try { - paths_map.emplace_back(local_path, getStorageObjects(local_path)); + paths_map.emplace_back(local_path, metadata_storage->getObjectStorageRootPath(), getStorageObjects(local_path)); } catch (const Exception & e) { @@ -253,6 +253,13 @@ void DiskObjectStorage::removeSharedFile(const String & path, bool delete_metada transaction->commit(); } +void DiskObjectStorage::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) +{ + auto transaction = createObjectStorageTransaction(); + transaction->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only); + transaction->commit(); +} + UInt32 DiskObjectStorage::getRefCount(const String & path) const { return metadata_storage->getHardlinkCount(path); @@ -275,7 +282,10 @@ String DiskObjectStorage::getUniqueId(const String & path) const bool DiskObjectStorage::checkUniqueId(const String & id) const { if (!id.starts_with(object_storage_root_path)) + { + LOG_DEBUG(log, "Blob with id {} doesn't start with blob storage prefix {}", id, object_storage_root_path); return false; + } auto object = StoredObject::create(*object_storage, id, {}, {}, true); return object_storage->exists(object); diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index 34056f17b3c..14fb84d7a15 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -92,6 +92,8 @@ public: void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + MetadataStoragePtr getMetadataStorage() override { return metadata_storage; } UInt32 getRefCount(const String & path) const override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp index ae03915d944..f18debe8a8b 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp @@ -68,6 +68,14 @@ void 
DiskObjectStorageMetadata::deserialize(ReadBuffer & buf) } } +void DiskObjectStorageMetadata::createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool read_only_) +{ + storage_objects.emplace_back(relative_path, bytes_size); + total_size = bytes_size; + ref_count = ref_count_; + read_only = read_only_; +} + void DiskObjectStorageMetadata::deserializeFromString(const std::string & data) { ReadBufferFromString buf(data); diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h index d3ea5795dd3..09e0f4ee85b 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h +++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h @@ -50,6 +50,7 @@ public: void deserialize(ReadBuffer & buf); void deserializeFromString(const std::string & data); + void createFromSingleObject(const std::string & relative_path, size_t bytes_size, size_t ref_count_, bool is_read_only_); void serialize(WriteBuffer & buf, bool sync) const; std::string serializeToString() const; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp index 54d5a2f2368..5f376de34dc 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageTransaction.cpp @@ -3,7 +3,7 @@ #include #include #include - +#include namespace DB { @@ -138,6 +138,87 @@ struct RemoveObjectStorageOperation final : public IDiskObjectStorageOperation } }; +struct RemoveManyObjectStorageOperation final : public IDiskObjectStorageOperation +{ + RemoveBatchRequest remove_paths; + bool keep_all_batch_data; + NameSet file_names_remove_metadata_only; + StoredObjects objects_to_remove; + bool remove_from_cache = false; + + RemoveManyObjectStorageOperation( + IObjectStorage & object_storage_, + IMetadataStorage & metadata_storage_, + const RemoveBatchRequest & remove_paths_, + bool keep_all_batch_data_, + const NameSet & file_names_remove_metadata_only_) + : IDiskObjectStorageOperation(object_storage_, metadata_storage_) + , remove_paths(remove_paths_) + , keep_all_batch_data(keep_all_batch_data_) + , file_names_remove_metadata_only(file_names_remove_metadata_only_) + {} + + std::string getInfoForLog() const override + { + return fmt::format("RemoveManyObjectStorageOperation (paths size: {}, keep all batch {}, files to keep {})", remove_paths.size(), keep_all_batch_data, fmt::join(file_names_remove_metadata_only, ", ")); + } + + void execute(MetadataTransactionPtr tx) override + { + for (const auto & [path, if_exists] : remove_paths) + { + + if (!metadata_storage.exists(path)) + { + if (if_exists) + continue; + + throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "Metadata path '{}' doesn't exist", path); + } + + if (!metadata_storage.isFile(path)) + throw Exception(ErrorCodes::BAD_FILE_TYPE, "Path '{}' is not a regular file", path); + + try + { + uint32_t hardlink_count = metadata_storage.getHardlinkCount(path); + auto objects = metadata_storage.getStorageObjects(path); + + tx->unlinkMetadata(path); + + /// File is really redundant + if (hardlink_count == 0 && !keep_all_batch_data && !file_names_remove_metadata_only.contains(fs::path(path).filename())) + objects_to_remove.insert(objects_to_remove.end(), objects.begin(), objects.end()); + } + catch (const Exception & e) + { + /// If it's impossible to read meta - just remove it from FS. 
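Editor's note: the RemoveManyObjectStorageOperation being added here collects the object-storage keys during execute(), while metadata is unlinked inside the transaction, and only deletes the remote blobs in finalize(), after the metadata transaction has committed. The real operation also handles if_exists flags, hardlink counts and unreadable metadata; the sketch below keeps only that collect-then-delete, two-phase shape, with hypothetical minimal stand-ins for the metadata layer and the blob store.

#include <iostream>
#include <string>
#include <vector>

struct MetadataStore
{
    bool unlink(const std::string & path) { std::cout << "unlink metadata " << path << '\n'; return true; }
};

struct ObjectStore
{
    void removeObjects(const std::vector<std::string> & keys)
    {
        for (const auto & k : keys)
            std::cout << "remove blob " << k << '\n';
    }
};

class RemoveManyOperation
{
public:
    RemoveManyOperation(MetadataStore & metadata_, ObjectStore & objects_, std::vector<std::string> paths_)
        : metadata(metadata_), objects(objects_), paths(std::move(paths_)) {}

    // Phase 1: runs inside the metadata transaction, only records what to delete remotely.
    void execute()
    {
        for (const auto & path : paths)
            if (metadata.unlink(path))
                objects_to_remove.push_back("blob-for/" + path);  // hypothetical path-to-key mapping
    }

    void undo() { objects_to_remove.clear(); }  // nothing was touched remotely yet

    // Phase 2: runs after the transaction commits; only now are remote blobs deleted.
    void finalize()
    {
        if (!objects_to_remove.empty())
            objects.removeObjects(objects_to_remove);
    }

private:
    MetadataStore & metadata;
    ObjectStore & objects;
    std::vector<std::string> paths;
    std::vector<std::string> objects_to_remove;
};

int main()
{
    MetadataStore metadata;
    ObjectStore objects;
    RemoveManyOperation op(metadata, objects, {"a.bin", "b.bin"});
    op.execute();
    op.finalize();
}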
+ if (e.code() == ErrorCodes::UNKNOWN_FORMAT + || e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF + || e.code() == ErrorCodes::CANNOT_READ_ALL_DATA + || e.code() == ErrorCodes::CANNOT_OPEN_FILE) + { + tx->unlinkFile(path); + } + else + throw; + } + } + } + + void undo() override + { + + } + + void finalize() override + { + if (!objects_to_remove.empty()) + object_storage.removeObjects(objects_to_remove); + } +}; + + struct RemoveRecursiveObjectStorageOperation final : public IDiskObjectStorageOperation { std::string path; @@ -479,14 +560,8 @@ void DiskObjectStorageTransaction::removeFileIfExists(const std::string & path) void DiskObjectStorageTransaction::removeSharedFiles( const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { - for (const auto & file : files) - { - bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); - if (file.if_exists) - removeSharedFileIfExists(file.path, keep_file); - else - removeSharedFile(file.path, keep_file); - } + auto operation = std::make_unique(object_storage, metadata_storage, files, keep_all_batch_data, file_names_remove_metadata_only); + operations_to_execute.emplace_back(std::move(operation)); } namespace @@ -633,9 +708,11 @@ void DiskObjectStorageTransaction::commit() { operations_to_execute[i]->execute(metadata_transaction); } - catch (Exception & ex) + catch (...) { - ex.addMessage(fmt::format("While executing operation #{} ({})", i, operations_to_execute[i]->getInfoForLog())); + tryLogCurrentException( + &Poco::Logger::get("DiskObjectStorageTransaction"), + fmt::format("An error occurred while executing transaction's operation #{} ({})", i, operations_to_execute[i]->getInfoForLog())); for (int64_t j = i; j >= 0; --j) { @@ -643,9 +720,12 @@ void DiskObjectStorageTransaction::commit() { operations_to_execute[j]->undo(); } - catch (Exception & rollback_ex) + catch (...) { - rollback_ex.addMessage(fmt::format("While undoing operation #{}", i)); + tryLogCurrentException( + &Poco::Logger::get("DiskObjectStorageTransaction"), + fmt::format("An error occurred while undoing transaction's operation #{}", i)); + throw; } } diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index 2303401466d..2f82458ecd8 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -70,11 +70,12 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI auto hdfs_path = path.substr(begin_of_path); auto hdfs_uri = path.substr(0, begin_of_path); - return std::make_unique(hdfs_uri, hdfs_path, config, disk_read_settings); + return std::make_unique( + hdfs_uri, hdfs_path, config, disk_read_settings, /* read_until_position */0, /* use_external_buffer */true); }; auto hdfs_impl = std::make_unique(std::move(read_buffer_creator), objects, disk_read_settings); - auto buf = std::make_unique(std::move(hdfs_impl)); + auto buf = std::make_unique(std::move(hdfs_impl), read_settings); return std::make_unique(std::move(buf), settings->min_bytes_for_seek); } diff --git a/src/Disks/ObjectStorages/IMetadataStorage.h b/src/Disks/ObjectStorages/IMetadataStorage.h index 300d8ec59b9..3d6c772157d 100644 --- a/src/Disks/ObjectStorages/IMetadataStorage.h +++ b/src/Disks/ObjectStorages/IMetadataStorage.h @@ -124,7 +124,7 @@ public: virtual ~IMetadataStorage() = default; - /// ==== More specefic methods. Previous were almost general purpose. 
==== + /// ==== More specific methods. Previous were almost general purpose. ==== /// Read multiple metadata files into strings and return mapping from file_path -> metadata virtual std::unordered_map getSerializedMetadata(const std::vector & file_paths) const = 0; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index ccde7d20778..45304ac2fac 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -31,6 +31,7 @@ #include #include + namespace DB { @@ -55,7 +56,7 @@ void throwIfError(const Aws::Utils::Outcome & response) if (!response.IsSuccess()) { const auto & err = response.GetError(); - throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())); + throw S3Exception(fmt::format("{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())), err.GetErrorType()); } } @@ -69,7 +70,7 @@ void throwIfUnexpectedError(const Aws::Utils::Outcome & response, if (!response.IsSuccess() && (!if_exists || !isNotFoundError(response.GetError().GetErrorType()))) { const auto & err = response.GetError(); - throw Exception(ErrorCodes::S3_ERROR, "{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())); + throw S3Exception(err.GetErrorType(), "{} (Code: {})", err.GetMessage(), static_cast(err.GetErrorType())); } } @@ -90,7 +91,19 @@ void logIfError(const Aws::Utils::Outcome & response, std::functi std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path */) { - return getRandomASCIIString(32); + /// Path to store the new S3 object. + + /// Total length is 32 a-z characters for enough randomness. + /// First 3 characters are used as a prefix for + /// https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/ + + constexpr size_t key_name_total_size = 32; + constexpr size_t key_name_prefix_size = 3; + + /// Path to store new S3 object. 
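Editor's note: the return statement just below builds the S3 object key as a short random prefix plus the remaining random characters, which spreads keys across partitions per the linked AWS guidance. A self-contained approximation of that key scheme, with a local random-string helper standing in for getRandomASCIIString and plain concatenation instead of fmt::format:

#include <iostream>
#include <random>
#include <string>

// Stand-in for getRandomASCIIString: lowercase a-z of the requested length.
std::string randomLowercase(size_t length)
{
    static thread_local std::mt19937 gen{std::random_device{}()};
    std::uniform_int_distribution<int> dist('a', 'z');
    std::string result(length, ' ');
    for (auto & c : result)
        c = static_cast<char>(dist(gen));
    return result;
}

std::string generateBlobName()
{
    constexpr size_t key_name_total_size = 32;
    constexpr size_t key_name_prefix_size = 3;

    // "abc/xyz..." : the 3-character prefix improves S3 key distribution.
    return randomLowercase(key_name_prefix_size) + "/"
         + randomLowercase(key_name_total_size - key_name_prefix_size);
}

int main()
{
    std::cout << generateBlobName() << '\n';
}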
+ return fmt::format("{}/{}", + getRandomASCIIString(key_name_prefix_size), + getRandomASCIIString(key_name_total_size - key_name_prefix_size)); } Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const @@ -157,7 +170,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT } else { - auto buf = std::make_unique(std::move(s3_impl)); + auto buf = std::make_unique(std::move(s3_impl), disk_read_settings); return std::make_unique(std::move(buf), settings_ptr->min_bytes_for_seek); } } @@ -245,6 +258,8 @@ void S3ObjectStorage::removeObjectImpl(const StoredObject & object, bool if_exis auto outcome = client_ptr->DeleteObject(request); throwIfUnexpectedError(outcome, if_exists); + + LOG_TRACE(log, "Object with path {} was removed from S3", object.absolute_path); } void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_exists) @@ -288,6 +303,8 @@ void S3ObjectStorage::removeObjectsImpl(const StoredObjects & objects, bool if_e auto outcome = client_ptr->DeleteObjects(request); throwIfUnexpectedError(outcome, if_exists); + + LOG_TRACE(log, "Objects with paths [{}] were removed from S3", keys); } } } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 448826bfa71..ecbd8cc9aa1 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB @@ -180,6 +181,7 @@ private: const String version_id; + Poco::Logger * log = &Poco::Logger::get("S3ObjectStorage"); DataSourceDescription data_source_description; }; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index b34eddf63f0..a93d95d91bd 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -116,7 +116,8 @@ std::unique_ptr getClient(const Poco::Util::AbstractConfigura S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( config.getString(config_prefix + ".region", ""), context->getRemoteHostFilter(), context->getGlobalContext()->getSettingsRef().s3_max_redirects, - context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging); + context->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, + /* for_disk_s3 = */ true); S3::URI uri(Poco::URI(config.getString(config_prefix + ".endpoint"))); if (uri.key.back() != '/') diff --git a/src/Disks/ObjectStorages/StoredObject.h b/src/Disks/ObjectStorages/StoredObject.h index acb8a5fd127..d9faa766540 100644 --- a/src/Disks/ObjectStorages/StoredObject.h +++ b/src/Disks/ObjectStorages/StoredObject.h @@ -3,6 +3,7 @@ #include #include + namespace DB { diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 3c7ce47340d..b0fed4e001b 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -188,7 +188,7 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT } else { - auto buf = std::make_unique(std::move(web_impl)); + auto buf = std::make_unique(std::move(web_impl), read_settings); return std::make_unique(std::move(buf), min_bytes_for_seek); } } diff --git a/src/Formats/JSONUtils.cpp b/src/Formats/JSONUtils.cpp index 895833abf4d..9aa0645f580 100644 --- a/src/Formats/JSONUtils.cpp +++ b/src/Formats/JSONUtils.cpp @@ -218,7 
+218,6 @@ namespace JSONUtils { auto object = field.getObject(); DataTypes value_types; - bool have_object_value = false; for (const auto key_value_pair : object) { auto type = getDataTypeFromFieldImpl(key_value_pair.second, settings, numbers_parsed_from_json_strings); @@ -226,10 +225,7 @@ namespace JSONUtils continue; if (isObject(type)) - { - have_object_value = true; - break; - } + return std::make_shared("json", true); value_types.push_back(type); } @@ -242,7 +238,7 @@ namespace JSONUtils for (size_t i = 1; i < value_types.size(); ++i) are_types_equal &= value_types[i]->equals(*value_types[0]); - if (have_object_value || !are_types_equal) + if (!are_types_equal) return std::make_shared("json", true); return std::make_shared(std::make_shared(), value_types[0]); diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index 1bc95b49dbe..0387cc86d48 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -92,6 +92,11 @@ list (APPEND OBJECT_LIBS $) add_subdirectory(array) list (APPEND OBJECT_LIBS $) +if (TARGET ch_contrib::datasketches) + add_subdirectory(UniqTheta) + list (APPEND OBJECT_LIBS $) +endif() + add_subdirectory(JSONPath) list (APPEND PRIVATE_LIBS clickhouse_functions_jsonpath) diff --git a/src/Functions/FunctionHelpers.h b/src/Functions/FunctionHelpers.h index 8d33c820185..18a4e584080 100644 --- a/src/Functions/FunctionHelpers.h +++ b/src/Functions/FunctionHelpers.h @@ -134,7 +134,7 @@ using FunctionArgumentDescriptors = std::vector; * (e.g. depending on result type or other trait). * First, checks that number of arguments is as expected (including optional arguments). * Second, checks that mandatory args present and have valid type. - * Third, checks optional arguents types, skipping ones that are missing. + * Third, checks optional arguments types, skipping ones that are missing. * * Please note that if you have several optional arguments, like f([a, b, c]), * only these calls are considered valid: diff --git a/src/Functions/IFunction.h b/src/Functions/IFunction.h index 95af8a61aae..83b89b85b62 100644 --- a/src/Functions/IFunction.h +++ b/src/Functions/IFunction.h @@ -171,7 +171,7 @@ public: */ virtual bool isSuitableForConstantFolding() const { return true; } - /** If function isSuitableForConstantFolding then, this method will be called during query analyzis + /** If function isSuitableForConstantFolding then, this method will be called during query analysis * if some arguments are constants. For example logical functions (AndFunction, OrFunction) can * return they result based on some constant arguments. * Arguments are passed without modifications, useDefaultImplementationForNulls, useDefaultImplementationForNothing, @@ -394,7 +394,7 @@ private: using FunctionOverloadResolverPtr = std::shared_ptr; /// Old function interface. Check documentation in IFunction.h. -/// If client do not need statefull properties it can implement this interface. +/// If client do not need stateful properties it can implement this interface. 
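Editor's note: the JSONUtils hunk earlier in this block simplifies type inference for JSON objects: as soon as any value turns out to be a nested object, the whole field is typed as Object('json'); otherwise, if all value types match, it becomes a Map of String to that type, with Object('json') as the fallback for mixed types. The sketch below condenses that decision; JSON values are reduced to a small enum, and the handling of an empty object is an assumption, not taken from the diff.

#include <iostream>
#include <map>
#include <string>
#include <vector>

// Very reduced model of the inference result.
enum class Kind { Int64, String, Object };

std::string inferMapType(const std::map<std::string, Kind> & object)
{
    std::vector<Kind> value_types;
    for (const auto & key_value : object)
    {
        if (key_value.second == Kind::Object)
            return "Object('json')";       // early exit: any nested object forces the JSON type
        value_types.push_back(key_value.second);
    }

    // Assumption: an empty object also falls back to the JSON type.
    if (value_types.empty())
        return "Object('json')";

    for (size_t i = 1; i < value_types.size(); ++i)
        if (value_types[i] != value_types[0])
            return "Object('json')";       // mixed value types fall back to JSON as well

    return value_types[0] == Kind::Int64 ? "Map(String, Int64)" : "Map(String, String)";
}

int main()
{
    std::cout << inferMapType({{"a", Kind::Int64}, {"b", Kind::Int64}}) << '\n';   // Map(String, Int64)
    std::cout << inferMapType({{"a", Kind::Int64}, {"b", Kind::Object}}) << '\n';  // Object('json')
}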
 class IFunction
 {
 public:
diff --git a/src/Functions/UniqTheta/CMakeLists.txt b/src/Functions/UniqTheta/CMakeLists.txt
new file mode 100644
index 00000000000..27e23eb3881
--- /dev/null
+++ b/src/Functions/UniqTheta/CMakeLists.txt
@@ -0,0 +1,9 @@
+include("${ClickHouse_SOURCE_DIR}/cmake/dbms_glob_sources.cmake")
+
+add_library(clickhouse_functions_uniqtheta FunctionsUniqTheta.cpp)
+
+target_link_libraries(clickhouse_functions_uniqtheta PRIVATE dbms)
+
+if (TARGET ch_contrib::datasketches)
+    target_link_libraries (clickhouse_functions_uniqtheta PRIVATE ch_contrib::datasketches)
+endif ()
diff --git a/src/Functions/UniqTheta/FunctionsUniqTheta.cpp b/src/Functions/UniqTheta/FunctionsUniqTheta.cpp
new file mode 100644
index 00000000000..aa280c0818e
--- /dev/null
+++ b/src/Functions/UniqTheta/FunctionsUniqTheta.cpp
@@ -0,0 +1,68 @@
+#include <Functions/FunctionFactory.h>
+
+#include "FunctionsUniqTheta.h"
+
+#if USE_DATASKETCHES
+
+namespace DB
+{
+
+REGISTER_FUNCTION(UniqTheta)
+{
+    factory.registerFunction<FunctionUniqThetaIntersect>(
+        {
+            R"(
+Two uniqThetaSketch objects to do intersection calculation (set operation ∩), the result is a new uniqThetaSketch.
+
+A uniqThetaSketch object is to be constructed by aggregation function uniqTheta with -State.
+
+UniqThetaSketch is a data structure that stores an approximate set of values.
+For more information on the Theta sketch, see: [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html).
+
+Typical usage:
+[example:typical]
+)",
+            Documentation::Examples{
+                {"typical", "select finalizeAggregation(uniqThetaIntersect(arrayReduce('uniqThetaState',[1,2]), arrayReduce('uniqThetaState',[2,3,4])));"}},
+            Documentation::Categories{"uniqTheta"}
+        });
+
+    factory.registerFunction<FunctionUniqThetaUnion>(
+        {
+            R"(
+Two uniqThetaSketch objects to do union calculation (set operation ∪), the result is a new uniqThetaSketch.
+
+A uniqThetaSketch object is to be constructed by aggregation function uniqTheta with -State.
+
+UniqThetaSketch is a data structure that stores an approximate set of values.
+For more information on the Theta sketch, see: [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html).
+
+Typical usage:
+[example:typical]
+)",
+            Documentation::Examples{
+                {"typical", "select finalizeAggregation(uniqThetaUnion(arrayReduce('uniqThetaState',[1,2]), arrayReduce('uniqThetaState',[2,3,4])));"}},
+            Documentation::Categories{"uniqTheta"}
+        });
+    factory.registerFunction<FunctionUniqThetaNot>(
+        {
+            R"(
+Two uniqThetaSketch objects to do a_not_b calculation (set difference), the result is a new uniqThetaSketch.
+
+A uniqThetaSketch object is to be constructed by aggregation function uniqTheta with -State.
+
+UniqThetaSketch is a data structure that stores an approximate set of values.
+For more information on the Theta sketch, see: [Theta Sketch Framework](https://datasketches.apache.org/docs/Theta/ThetaSketchFramework.html).
+ +Typical usage: +[example:typical] +)", + Documentation::Examples{ + {"typical", "select finalizeAggregation(uniqThetaNot(arrayReduce('uniqThetaState',[1,2]), arrayReduce('uniqThetaState',[2,3,4])));"}}, + Documentation::Categories{"uniqTheta"} + }); +} + +} + +#endif diff --git a/src/Functions/UniqTheta/FunctionsUniqTheta.h b/src/Functions/UniqTheta/FunctionsUniqTheta.h new file mode 100644 index 00000000000..7cdbf587cf7 --- /dev/null +++ b/src/Functions/UniqTheta/FunctionsUniqTheta.h @@ -0,0 +1,176 @@ +#pragma once + +#include + +#if USE_DATASKETCHES + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ + + + namespace ErrorCodes + { + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + } + + struct UniqThetaIntersectImpl + { + static void apply(AggregateFunctionUniqThetaData & sketch_data_1, const AggregateFunctionUniqThetaData & sketch_data_2) + { + sketch_data_1.set.intersect(sketch_data_2.set); + } + }; + + struct UniqThetaUnionImpl + { + static void apply(AggregateFunctionUniqThetaData & sketch_data_1, const AggregateFunctionUniqThetaData & sketch_data_2) + { + sketch_data_1.set.merge(sketch_data_2.set); + } + }; + + struct UniqThetaNotImpl + { + static void apply(AggregateFunctionUniqThetaData & sketch_data_1, const AggregateFunctionUniqThetaData & sketch_data_2) + { + sketch_data_1.set.aNotB(sketch_data_2.set); + } + }; + + template + class FunctionUniqTheta : public IFunction + { + public: + static constexpr auto name = Name::name; + + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + + bool isVariadic() const override { return false; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + size_t getNumberOfArguments() const override { return 2; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + const auto * sketch_type0 = typeid_cast(arguments[0].get()); + if (!(sketch_type0 && sketch_type0->getFunctionName() == "uniqTheta")) + throw Exception( + "First argument for function " + getName() + " must be a uniqTheta but it has type " + arguments[0]->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + const auto * sketch_type1 = typeid_cast(arguments[1].get()); + if (!(sketch_type1 && sketch_type1->getFunctionName() == "uniqTheta")) + throw Exception( + "Second argument for function " + getName() + " must be a uniqTheta but it has type " + arguments[1]->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + const DataTypes & arg_data_types0 = sketch_type0->getArgumentsDataTypes(); + const DataTypes & arg_data_types1 = sketch_type1->getArgumentsDataTypes(); + + if (arg_data_types0.size() != arg_data_types1.size()) + throw Exception( + "The nested type in uniqThetas must be the same length, but one is " + std::to_string(arg_data_types0.size()) + + ", and the other is " + std::to_string(arg_data_types1.size()), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + size_t types_size = arg_data_types0.size(); + for (size_t i = 0; i < types_size; ++i) + { + if (!arg_data_types0[i]->equals(*arg_data_types1[i])) + throw Exception( + "The " + std::to_string(i) + "th nested type in uniqThetas must be the same, but one is " + arg_data_types0[i]->getName() + + ", and the other is " + arg_data_types1[i]->getName(), + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + } + + + return 
arguments[0]; + } + + bool useDefaultImplementationForConstants() const override { return true; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + const ColumnAggregateFunction * column_ptrs[2]; + bool is_column_const[2]; + for (size_t i = 0; i < 2; ++i) + { + if (const auto * argument_column_const = typeid_cast(arguments[i].column.get())) + { + column_ptrs[i] = typeid_cast(argument_column_const->getDataColumnPtr().get()); + is_column_const[i] = true; + } + else + { + column_ptrs[i] = typeid_cast(arguments[i].column.get()); + is_column_const[i] = false; + } + } + + auto col_to = ColumnAggregateFunction::create(column_ptrs[0]->getAggregateFunction()); + + col_to->reserve(input_rows_count); + + const PaddedPODArray & container0 = column_ptrs[0]->getData(); + const PaddedPODArray & container1 = column_ptrs[1]->getData(); + + for (size_t i = 0; i < input_rows_count; ++i) + { + const AggregateDataPtr data_ptr_0 = is_column_const[0] ? container0[0] : container0[i]; + const AggregateDataPtr data_ptr_1 = is_column_const[1] ? container1[0] : container1[i]; + + col_to->insertFrom(data_ptr_0); + AggregateFunctionUniqThetaData & sketch_data_1 = *reinterpret_cast(col_to->getData()[i]); + const AggregateFunctionUniqThetaData & sketch_data_2 + = *reinterpret_cast(data_ptr_1); + Impl::apply(sketch_data_1, sketch_data_2); + } + return col_to; + } + }; + + struct NameUniqThetaIntersect + { + static constexpr auto name = "uniqThetaIntersect"; + }; + + struct NameUniqThetaUnion + { + static constexpr auto name = "uniqThetaUnion"; + }; + + struct NameUniqThetaNot + { + static constexpr auto name = "uniqThetaNot"; + }; + + using FunctionUniqThetaIntersect = FunctionUniqTheta; + using FunctionUniqThetaUnion = FunctionUniqTheta; + using FunctionUniqThetaNot = FunctionUniqTheta; + +} + + +#endif diff --git a/src/IO/ConcatSeekableReadBuffer.cpp b/src/IO/ConcatSeekableReadBuffer.cpp index c5d48376e2f..0943d1eac45 100644 --- a/src/IO/ConcatSeekableReadBuffer.cpp +++ b/src/IO/ConcatSeekableReadBuffer.cpp @@ -9,6 +9,11 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; } +ConcatSeekableReadBuffer::BufferInfo::BufferInfo(BufferInfo && src) noexcept + : in(std::exchange(src.in, nullptr)), own_in(std::exchange(src.own_in, false)), size(std::exchange(src.size, 0)) +{ +} + ConcatSeekableReadBuffer::BufferInfo::~BufferInfo() { if (own_in) diff --git a/src/IO/ConcatSeekableReadBuffer.h b/src/IO/ConcatSeekableReadBuffer.h index 5d7dca82524..c8c16c5d887 100644 --- a/src/IO/ConcatSeekableReadBuffer.h +++ b/src/IO/ConcatSeekableReadBuffer.h @@ -30,7 +30,7 @@ private: struct BufferInfo { BufferInfo() = default; - BufferInfo(BufferInfo &&) = default; + BufferInfo(BufferInfo && src) noexcept; ~BufferInfo(); SeekableReadBuffer * in = nullptr; bool own_in = false; diff --git a/src/IO/HashingReadBuffer.h b/src/IO/HashingReadBuffer.h index 5d42c64478c..a0a029e6f85 100644 --- a/src/IO/HashingReadBuffer.h +++ b/src/IO/HashingReadBuffer.h @@ -18,29 +18,38 @@ public: { working_buffer = in.buffer(); pos = in.position(); + hashing_begin = pos; + } - /// calculate hash from the data already read - if (!working_buffer.empty()) + uint128 getHash() + { + if (pos > hashing_begin) { - calculateHash(pos, working_buffer.end() - pos); + calculateHash(hashing_begin, pos - hashing_begin); + hashing_begin = pos; } + return IHashingBuffer::getHash(); } private: bool nextImpl() override { + if (pos > hashing_begin) + calculateHash(hashing_begin, pos - 
hashing_begin); + in.position() = pos; bool res = in.next(); working_buffer = in.buffer(); - pos = in.position(); // `pos` may be different from working_buffer.begin() when using sophisticated ReadBuffers. - calculateHash(pos, working_buffer.end() - pos); + pos = in.position(); + hashing_begin = pos; return res; } ReadBuffer & in; + BufferBase::Position hashing_begin; }; } diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 7fb432eab22..380365f9b95 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -34,6 +34,7 @@ namespace ErrorCodes extern const int CANNOT_SEEK_THROUGH_FILE; extern const int SEEK_POSITION_OUT_OF_BOUND; extern const int LOGICAL_ERROR; + extern const int CANNOT_ALLOCATE_MEMORY; } @@ -48,7 +49,7 @@ ReadBufferFromS3::ReadBufferFromS3( size_t offset_, size_t read_until_position_, bool restricted_seek_) - : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0) + : ReadBufferFromFileBase(use_external_buffer_ ? 0 : settings_.remote_fs_buffer_size, nullptr, 0) , client_ptr(std::move(client_ptr_)) , bucket(bucket_) , key(key_) @@ -136,6 +137,23 @@ bool ReadBufferFromS3::nextImpl() ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds()); ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1); + if (const auto * s3_exception = dynamic_cast(&e)) + { + /// It doesn't make sense to retry Access Denied or No Such Key + if (!s3_exception->isRetryableError()) + { + tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket)); + throw; + } + } + + /// It doesn't make sense to retry allocator errors + if (e.code() == ErrorCodes::CANNOT_ALLOCATE_MEMORY) + { + tryLogCurrentException(log); + throw; + } + LOG_DEBUG( log, "Caught exception while reading S3 object. 
Bucket: {}, Key: {}, Version: {}, Offset: {}, Attempt: {}, Message: {}", @@ -306,7 +324,10 @@ std::unique_ptr ReadBufferFromS3::initialize() return std::make_unique(read_result.GetBody(), buffer_size); } else - throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + { + const auto & error = outcome.GetError(); + throw S3Exception(error.GetMessage(), error.GetErrorType()); + } } SeekableReadBufferPtr ReadBufferS3Factory::getReader() diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp index 089d89cd8a7..569bebb1ed1 100644 --- a/src/IO/S3/PocoHTTPClient.cpp +++ b/src/IO/S3/PocoHTTPClient.cpp @@ -42,6 +42,18 @@ namespace ProfileEvents extern const Event S3WriteRequestsErrors; extern const Event S3WriteRequestsThrottling; extern const Event S3WriteRequestsRedirects; + + extern const Event DiskS3ReadMicroseconds; + extern const Event DiskS3ReadRequestsCount; + extern const Event DiskS3ReadRequestsErrors; + extern const Event DiskS3ReadRequestsThrottling; + extern const Event DiskS3ReadRequestsRedirects; + + extern const Event DiskS3WriteMicroseconds; + extern const Event DiskS3WriteRequestsCount; + extern const Event DiskS3WriteRequestsErrors; + extern const Event DiskS3WriteRequestsThrottling; + extern const Event DiskS3WriteRequestsRedirects; } namespace CurrentMetrics @@ -62,11 +74,13 @@ PocoHTTPClientConfiguration::PocoHTTPClientConfiguration( const String & force_region_, const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, - bool enable_s3_requests_logging_) + bool enable_s3_requests_logging_, + bool for_disk_s3_) : force_region(force_region_) , remote_host_filter(remote_host_filter_) , s3_max_redirects(s3_max_redirects_) , enable_s3_requests_logging(enable_s3_requests_logging_) + , for_disk_s3(for_disk_s3_) { } @@ -112,6 +126,7 @@ PocoHTTPClient::PocoHTTPClient(const PocoHTTPClientConfiguration & client_config , remote_host_filter(client_configuration.remote_host_filter) , s3_max_redirects(client_configuration.s3_max_redirects) , enable_s3_requests_logging(client_configuration.enable_s3_requests_logging) + , for_disk_s3(client_configuration.for_disk_s3) , extra_headers(client_configuration.extra_headers) { } @@ -121,9 +136,29 @@ std::shared_ptr PocoHTTPClient::MakeRequest( Aws::Utils::RateLimits::RateLimiterInterface * readLimiter, Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const { - auto response = Aws::MakeShared("PocoHTTPClient", request); - makeRequestInternal(*request, response, readLimiter, writeLimiter); - return response; + try + { + auto response = Aws::MakeShared("PocoHTTPClient", request); + makeRequestInternal(*request, response, readLimiter, writeLimiter); + return response; + } + catch (const Exception &) + { + throw; + } + catch (const Poco::Exception & e) + { + throw Exception(Exception::CreateFromPocoTag{}, e); + } + catch (const std::exception & e) + { + throw Exception(Exception::CreateFromSTDTag{}, e); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + throw; + } } namespace @@ -156,6 +191,46 @@ namespace } } +PocoHTTPClient::S3MetricKind PocoHTTPClient::getMetricKind(const Aws::Http::HttpRequest & request) +{ + switch (request.GetMethod()) + { + case Aws::Http::HttpMethod::HTTP_GET: + case Aws::Http::HttpMethod::HTTP_HEAD: + return S3MetricKind::Read; + case Aws::Http::HttpMethod::HTTP_POST: + case Aws::Http::HttpMethod::HTTP_DELETE: + case Aws::Http::HttpMethod::HTTP_PUT: + case Aws::Http::HttpMethod::HTTP_PATCH: + return S3MetricKind::Write; + } + throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED); +} + +void PocoHTTPClient::addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount) const +{ + const ProfileEvents::Event events_map[static_cast(S3MetricType::EnumSize)][static_cast(S3MetricKind::EnumSize)] = { + {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds}, + {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount}, + {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors}, + {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling}, + {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects}, + }; + + const ProfileEvents::Event disk_s3_events_map[static_cast(S3MetricType::EnumSize)][static_cast(S3MetricKind::EnumSize)] = { + {ProfileEvents::DiskS3ReadMicroseconds, ProfileEvents::DiskS3WriteMicroseconds}, + {ProfileEvents::DiskS3ReadRequestsCount, ProfileEvents::DiskS3WriteRequestsCount}, + {ProfileEvents::DiskS3ReadRequestsErrors, ProfileEvents::DiskS3WriteRequestsErrors}, + {ProfileEvents::DiskS3ReadRequestsThrottling, ProfileEvents::DiskS3WriteRequestsThrottling}, + {ProfileEvents::DiskS3ReadRequestsRedirects, ProfileEvents::DiskS3WriteRequestsRedirects}, + }; + + S3MetricKind kind = getMetricKind(request); + + ProfileEvents::increment(events_map[static_cast(type)][static_cast(kind)], amount); + if (for_disk_s3) + ProfileEvents::increment(disk_s3_events_map[static_cast(type)][static_cast(kind)], amount); +} void PocoHTTPClient::makeRequestInternal( Aws::Http::HttpRequest & request, @@ -169,45 +244,7 @@ void PocoHTTPClient::makeRequestInternal( if (enable_s3_requests_logging) LOG_TEST(log, "Make request to: {}", uri); - enum class S3MetricType - { - Microseconds, - Count, - Errors, - Throttling, - Redirects, - - EnumSize, - }; - - auto select_metric = [&request](S3MetricType type) - { - const ProfileEvents::Event events_map[][2] = { - {ProfileEvents::S3ReadMicroseconds, ProfileEvents::S3WriteMicroseconds}, - {ProfileEvents::S3ReadRequestsCount, ProfileEvents::S3WriteRequestsCount}, - {ProfileEvents::S3ReadRequestsErrors, ProfileEvents::S3WriteRequestsErrors}, - {ProfileEvents::S3ReadRequestsThrottling, ProfileEvents::S3WriteRequestsThrottling}, - {ProfileEvents::S3ReadRequestsRedirects, ProfileEvents::S3WriteRequestsRedirects}, - }; - - static_assert((sizeof(events_map) / sizeof(events_map[0])) == static_cast(S3MetricType::EnumSize)); - - switch (request.GetMethod()) - { - case Aws::Http::HttpMethod::HTTP_GET: - case Aws::Http::HttpMethod::HTTP_HEAD: - return events_map[static_cast(type)][0]; // Read - case Aws::Http::HttpMethod::HTTP_POST: - case Aws::Http::HttpMethod::HTTP_DELETE: - case Aws::Http::HttpMethod::HTTP_PUT: - case Aws::Http::HttpMethod::HTTP_PATCH: - return events_map[static_cast(type)][1]; // Write - } - - throw Exception("Unsupported request method", ErrorCodes::NOT_IMPLEMENTED); - }; - - 
ProfileEvents::increment(select_metric(S3MetricType::Count)); + addMetric(request, S3MetricType::Count); CurrentMetrics::Increment metric_increment{CurrentMetrics::S3Requests}; try @@ -314,7 +351,7 @@ void PocoHTTPClient::makeRequestInternal( auto & response_body_stream = session->receiveResponse(poco_response); watch.stop(); - ProfileEvents::increment(select_metric(S3MetricType::Microseconds), watch.elapsedMicroseconds()); + addMetric(request, S3MetricType::Microseconds, watch.elapsedMicroseconds()); int status_code = static_cast(poco_response.getStatus()); @@ -329,7 +366,7 @@ void PocoHTTPClient::makeRequestInternal( if (enable_s3_requests_logging) LOG_TEST(log, "Redirecting request to new location: {}", location); - ProfileEvents::increment(select_metric(S3MetricType::Redirects)); + addMetric(request, S3MetricType::Redirects); continue; } @@ -367,7 +404,7 @@ void PocoHTTPClient::makeRequestInternal( LOG_WARNING(log, "Response for request contain tag in body, settings internal server error (500 code)"); response->SetResponseCode(Aws::Http::HttpResponseCode::INTERNAL_SERVER_ERROR); - ProfileEvents::increment(select_metric(S3MetricType::Errors)); + addMetric(request, S3MetricType::Errors); if (error_report) error_report(request_configuration); @@ -381,11 +418,11 @@ void PocoHTTPClient::makeRequestInternal( if (status_code == 429 || status_code == 503) { // API throttling - ProfileEvents::increment(select_metric(S3MetricType::Throttling)); + addMetric(request, S3MetricType::Throttling); } else if (status_code >= 300) { - ProfileEvents::increment(select_metric(S3MetricType::Errors)); + addMetric(request, S3MetricType::Errors); if (status_code >= 500 && error_report) error_report(request_configuration); } @@ -403,7 +440,7 @@ void PocoHTTPClient::makeRequestInternal( response->SetClientErrorType(Aws::Client::CoreErrors::NETWORK_CONNECTION); response->SetClientErrorMessage(getCurrentExceptionMessage(false)); - ProfileEvents::increment(select_metric(S3MetricType::Errors)); + addMetric(request, S3MetricType::Errors); } } diff --git a/src/IO/S3/PocoHTTPClient.h b/src/IO/S3/PocoHTTPClient.h index 5fc8c9acc17..9005f132974 100644 --- a/src/IO/S3/PocoHTTPClient.h +++ b/src/IO/S3/PocoHTTPClient.h @@ -44,6 +44,7 @@ struct PocoHTTPClientConfiguration : public Aws::Client::ClientConfiguration const RemoteHostFilter & remote_host_filter; unsigned int s3_max_redirects; bool enable_s3_requests_logging; + bool for_disk_s3; HeaderCollection extra_headers; void updateSchemeAndRegion(); @@ -55,7 +56,8 @@ private: const String & force_region_, const RemoteHostFilter & remote_host_filter_, unsigned int s3_max_redirects_, - bool enable_s3_requests_logging_ + bool enable_s3_requests_logging_, + bool for_disk_s3_ ); /// Constructor of Aws::Client::ClientConfiguration must be called after AWS SDK initialization. 
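The addMetric() change above replaces the per-request lambda with two 2D lookup tables indexed by metric type and request kind, and doubles each increment into DiskS3* counters when for_disk_s3 is set. Below is a minimal standalone sketch of that table-dispatch pattern; the counter type, enum values and names are simplified stand-ins for illustration only, not ClickHouse's ProfileEvents API.

#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

enum class S3MetricType { Microseconds, Count, Errors, Throttling, Redirects, EnumSize };
enum class S3MetricKind { Read, Write, EnumSize };
enum class HttpMethod { Get, Head, Post, Delete, Put, Patch };

// Simplified stand-ins for ProfileEvents counters.
using Counter = std::atomic<uint64_t>;
constexpr size_t n_types = static_cast<size_t>(S3MetricType::EnumSize);
constexpr size_t n_kinds = static_cast<size_t>(S3MetricKind::EnumSize);
static std::array<std::array<Counter, n_kinds>, n_types> s3_counters;       // [type][kind]
static std::array<std::array<Counter, n_kinds>, n_types> disk_s3_counters;  // [type][kind], only when for_disk_s3

static S3MetricKind kindOf(HttpMethod m)
{
    // GET/HEAD are read requests, everything else is a write, mirroring getMetricKind().
    return (m == HttpMethod::Get || m == HttpMethod::Head) ? S3MetricKind::Read : S3MetricKind::Write;
}

static void addMetric(bool for_disk_s3, S3MetricType type, HttpMethod method, uint64_t amount = 1)
{
    const auto t = static_cast<size_t>(type);
    const auto k = static_cast<size_t>(kindOf(method));
    s3_counters[t][k] += amount;           // always count the generic S3 metric
    if (for_disk_s3)
        disk_s3_counters[t][k] += amount;  // additionally attribute it to the disk-backed S3 counters
}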
@@ -113,18 +115,42 @@ public: Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const override; private: + void makeRequestInternal( Aws::Http::HttpRequest & request, std::shared_ptr & response, Aws::Utils::RateLimits::RateLimiterInterface * readLimiter, Aws::Utils::RateLimits::RateLimiterInterface * writeLimiter) const; + enum class S3MetricType + { + Microseconds, + Count, + Errors, + Throttling, + Redirects, + + EnumSize, + }; + + enum class S3MetricKind + { + Read, + Write, + + EnumSize, + }; + + static S3MetricKind getMetricKind(const Aws::Http::HttpRequest & request); + void addMetric(const Aws::Http::HttpRequest & request, S3MetricType type, ProfileEvents::Count amount = 1) const; + std::function per_request_configuration; std::function error_report; ConnectionTimeouts timeouts; const RemoteHostFilter & remote_host_filter; unsigned int s3_max_redirects; bool enable_s3_requests_logging; + bool for_disk_s3; const HeaderCollection extra_headers; }; diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 44bdf436fae..21d421bb4f6 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -87,7 +87,8 @@ TEST(IOTestAwsS3Client, AppendExtraSSECHeaders) region, remote_host_filter, s3_max_redirects, - enable_s3_requests_logging + enable_s3_requests_logging, + /* for_disk_s3 = */ false ); client_configuration.endpointOverride = uri.endpoint; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index fb9cff5d109..e97fa707c13 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -24,7 +24,7 @@ # include # include # include -# include // Y_IGNORE +# include # include # include @@ -35,6 +35,26 @@ # include +namespace DB +{ + +bool S3Exception::isRetryableError() const +{ + /// Looks like these list is quite conservative, add more codes if you wish + static const std::unordered_set unretryable_errors = { + Aws::S3::S3Errors::NO_SUCH_KEY, + Aws::S3::S3Errors::ACCESS_DENIED, + Aws::S3::S3Errors::INVALID_ACCESS_KEY_ID, + Aws::S3::S3Errors::INVALID_SIGNATURE, + Aws::S3::S3Errors::NO_SUCH_UPLOAD, + Aws::S3::S3Errors::NO_SUCH_BUCKET, + }; + + return !unretryable_errors.contains(code); +} + +} + namespace { @@ -543,7 +563,7 @@ public: /// AWS API tries credentials providers one by one. Some of providers (like ProfileConfigFileAWSCredentialsProvider) can be /// quite verbose even if nobody configured them. So we use our provider first and only after it use default providers. 
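S3Exception::isRetryableError() above classifies error codes against a small deny-list, and ReadBufferFromS3::nextImpl rethrows immediately for those codes instead of retrying. A minimal sketch of that "retry unless the failure is known to be permanent" pattern follows; the request and exception types here are hypothetical placeholders for the AWS SDK ones.

#include <cstddef>
#include <stdexcept>
#include <unordered_set>

enum class S3ErrorCode { NoSuchKey, AccessDenied, InvalidAccessKeyId, SlowDown, InternalError };

struct S3Error : std::runtime_error
{
    S3ErrorCode code;
    S3Error(S3ErrorCode code_, const char * what_) : std::runtime_error(what_), code(code_) {}
};

static bool isRetryable(S3ErrorCode code)
{
    // Permanent failures: retrying would only repeat the same answer.
    static const std::unordered_set<S3ErrorCode> non_retryable
        = {S3ErrorCode::NoSuchKey, S3ErrorCode::AccessDenied, S3ErrorCode::InvalidAccessKeyId};
    return !non_retryable.contains(code);
}

template <typename F>
auto withRetries(F && request, size_t max_attempts)
{
    for (size_t attempt = 1;; ++attempt)
    {
        try
        {
            return request();
        }
        catch (const S3Error & e)
        {
            if (!isRetryable(e.code) || attempt == max_attempts)
                throw;  // permanent error or attempts exhausted: propagate to the caller
        }
    }
}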
{ - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); AddProvider(std::make_shared(aws_client_configuration)); } @@ -580,7 +600,7 @@ public: } else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true") { - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.region, configuration.remote_host_filter, configuration.s3_max_redirects, configuration.enable_s3_requests_logging, configuration.for_disk_s3); /// See MakeDefaultHttpResourceClientConfiguration(). /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside @@ -700,9 +720,10 @@ namespace S3 const String & force_region, const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, - bool enable_s3_requests_logging) + bool enable_s3_requests_logging, + bool for_disk_s3) { - return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging); + return PocoHTTPClientConfiguration(force_region, remote_host_filter, s3_max_redirects, enable_s3_requests_logging, for_disk_s3); } URI::URI(const Poco::URI & uri_) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 46a09ee8901..da7ecf95b78 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -7,23 +7,62 @@ #include #include #include +#include #include #include +#include + namespace Aws::S3 { class S3Client; } + namespace DB { - class RemoteHostFilter; - struct HttpHeader; - using HeaderCollection = std::vector; +namespace ErrorCodes +{ + extern const int S3_ERROR; } +class RemoteHostFilter; +struct HttpHeader; +using HeaderCollection = std::vector; + +class S3Exception : public Exception +{ +public: + + // Format message with fmt::format, like the logging functions. + template + S3Exception(Aws::S3::S3Errors code_, fmt::format_string fmt, Args &&... 
args) + : Exception(fmt::format(fmt, std::forward(args)...), ErrorCodes::S3_ERROR) + , code(code_) + { + } + + S3Exception(const std::string & msg, Aws::S3::S3Errors code_) + : Exception(msg, ErrorCodes::S3_ERROR) + , code(code_) + {} + + Aws::S3::S3Errors getS3ErrorCode() const + { + return code; + } + + bool isRetryableError() const; + +private: + const Aws::S3::S3Errors code; +}; +} + + namespace DB::S3 { + class ClientFactory { public: @@ -45,7 +84,8 @@ public: const String & force_region, const RemoteHostFilter & remote_host_filter, unsigned int s3_max_redirects, - bool enable_s3_requests_logging); + bool enable_s3_requests_logging, + bool for_disk_s3); private: ClientFactory(); diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 6207ba53bd8..b09abda85db 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -40,7 +41,7 @@ namespace ErrorCodes struct WriteBufferFromS3::UploadPartTask { Aws::S3::Model::UploadPartRequest req; - bool is_finised = false; + bool is_finished = false; std::string tag; std::exception_ptr exception; }; @@ -48,7 +49,7 @@ struct WriteBufferFromS3::UploadPartTask struct WriteBufferFromS3::PutObjectTask { Aws::S3::Model::PutObjectRequest req; - bool is_finised = false; + bool is_finished = false; std::exception_ptr exception; }; @@ -64,10 +65,10 @@ WriteBufferFromS3::WriteBufferFromS3( : BufferWithOwnMemory(buffer_size_, nullptr, 0) , bucket(bucket_) , key(key_) - , client_ptr(std::move(client_ptr_)) - , upload_part_size(s3_settings_.min_upload_part_size) , s3_settings(s3_settings_) + , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) + , upload_part_size(s3_settings_.min_upload_part_size) , schedule(std::move(schedule_)) , write_settings(write_settings_) { @@ -173,7 +174,9 @@ void WriteBufferFromS3::finalizeImpl() auto response = client_ptr->HeadObject(request); if (!response.IsSuccess()) - throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket); + throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType()); + else + LOG_TRACE(log, "Object {} exists after upload", key); } } @@ -197,7 +200,7 @@ void WriteBufferFromS3::createMultipartUpload() LOG_TRACE(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id); } else - throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); } void WriteBufferFromS3::writePart() @@ -218,7 +221,7 @@ void WriteBufferFromS3::writePart() return; } - if (part_tags.size() == S3_WARN_MAX_PARTS) + if (TSA_SUPPRESS_WARNING_FOR_READ(part_tags).size() == S3_WARN_MAX_PARTS) { // Don't throw exception here by ourselves but leave the decision to take by S3 server. LOG_WARNING(log, "Maximum part number in S3 protocol has reached (too many parts). 
Server may not accept this whole upload."); @@ -231,6 +234,7 @@ void WriteBufferFromS3::writePart() int part_number; { std::lock_guard lock(bg_tasks_mutex); + task = &upload_object_tasks.emplace_back(); ++num_added_bg_tasks; part_number = num_added_bg_tasks; @@ -240,7 +244,7 @@ void WriteBufferFromS3::writePart() auto task_finish_notify = [&, task]() { std::lock_guard lock(bg_tasks_mutex); - task->is_finised = true; + task->is_finished = true; ++num_finished_bg_tasks; /// Notification under mutex is important here. @@ -276,9 +280,11 @@ void WriteBufferFromS3::writePart() else { UploadPartTask task; - fillUploadRequest(task.req, part_tags.size() + 1); + auto & tags = TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags); /// Suppress warning because schedule == false. + + fillUploadRequest(task.req, tags.size() + 1); processUploadRequest(task); - part_tags.push_back(task.tag); + tags.push_back(task.tag); } } @@ -302,19 +308,22 @@ void WriteBufferFromS3::processUploadRequest(UploadPartTask & task) if (outcome.IsSuccess()) { task.tag = outcome.GetResult().GetETag(); + std::lock_guard lock(bg_tasks_mutex); /// Protect part_tags from race LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, task.tag, part_tags.size()); } else - throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + throw S3Exception(outcome.GetError().GetMessage(), outcome.GetError().GetErrorType()); total_parts_uploaded++; } void WriteBufferFromS3::completeMultipartUpload() { - LOG_TRACE(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size()); + const auto & tags = TSA_SUPPRESS_WARNING_FOR_READ(part_tags); - if (part_tags.empty()) + LOG_TRACE(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); + + if (tags.empty()) throw Exception("Failed to complete multipart upload. No parts have uploaded", ErrorCodes::S3_ERROR); Aws::S3::Model::CompleteMultipartUploadRequest req; @@ -323,10 +332,10 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetUploadId(multipart_upload_id); Aws::S3::Model::CompletedMultipartUpload multipart_upload; - for (size_t i = 0; i < part_tags.size(); ++i) + for (size_t i = 0; i < tags.size(); ++i) { Aws::S3::Model::CompletedPart part; - multipart_upload.AddParts(part.WithETag(part_tags[i]).WithPartNumber(i + 1)); + multipart_upload.AddParts(part.WithETag(tags[i]).WithPartNumber(i + 1)); } req.SetMultipartUpload(multipart_upload); @@ -334,12 +343,13 @@ void WriteBufferFromS3::completeMultipartUpload() auto outcome = client_ptr->CompleteMultipartUpload(req); if (outcome.IsSuccess()) - LOG_TRACE(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size()); + LOG_TRACE(log, "Multipart upload has completed. 
Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, tags.size()); else { - throw Exception(ErrorCodes::S3_ERROR, "{} Tags:{}", - outcome.GetError().GetMessage(), - fmt::join(part_tags.begin(), part_tags.end(), " ")); + throw S3Exception( + outcome.GetError().GetErrorType(), + "Message: {}, Key: {}, Bucket: {}, Tags: {}", + outcome.GetError().GetMessage(), key, bucket, fmt::join(tags.begin(), tags.end(), " ")); } } @@ -364,7 +374,7 @@ void WriteBufferFromS3::makeSinglepartUpload() auto task_notify_finish = [&]() { std::lock_guard lock(bg_tasks_mutex); - put_object_task->is_finised = true; + put_object_task->is_finished = true; /// Notification under mutex is important here. /// Othervies, WriteBuffer could be destroyed in between @@ -417,37 +427,42 @@ void WriteBufferFromS3::fillPutRequest(Aws::S3::Model::PutObjectRequest & req) req.SetContentType("binary/octet-stream"); } -void WriteBufferFromS3::processPutRequest(PutObjectTask & task) +void WriteBufferFromS3::processPutRequest(const PutObjectTask & task) { auto outcome = client_ptr->PutObject(task.req); bool with_pool = static_cast(schedule); if (outcome.IsSuccess()) LOG_TRACE(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}, WithPool: {}", bucket, key, task.req.GetContentLength(), with_pool); else - throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR); + throw S3Exception( + outcome.GetError().GetErrorType(), + "Message: {}, Key: {}, Bucket: {}, Object size: {}, WithPool: {}", + outcome.GetError().GetMessage(), key, bucket, task.req.GetContentLength(), with_pool); } void WriteBufferFromS3::waitForReadyBackGroundTasks() { if (schedule) { - std::lock_guard lock(bg_tasks_mutex); + std::unique_lock lock(bg_tasks_mutex); + + /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock + auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); + + while (!tasks.empty() && tasks.front().is_finished) { - while (!upload_object_tasks.empty() && upload_object_tasks.front().is_finised) + auto & task = tasks.front(); + auto exception = task.exception; + auto tag = std::move(task.tag); + tasks.pop_front(); + + if (exception) { - auto & task = upload_object_tasks.front(); - auto exception = task.exception; - auto tag = std::move(task.tag); - upload_object_tasks.pop_front(); - - if (exception) - { - waitForAllBackGroundTasks(); - std::rethrow_exception(exception); - } - - part_tags.push_back(tag); + waitForAllBackGroundTasksUnlocked(lock); + std::rethrow_exception(exception); } + + TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(tag); } } } @@ -457,22 +472,33 @@ void WriteBufferFromS3::waitForAllBackGroundTasks() if (schedule) { std::unique_lock lock(bg_tasks_mutex); - bg_tasks_condvar.wait(lock, [this]() { return num_added_bg_tasks == num_finished_bg_tasks; }); + waitForAllBackGroundTasksUnlocked(lock); + } +} - while (!upload_object_tasks.empty()) +void WriteBufferFromS3::waitForAllBackGroundTasksUnlocked(std::unique_lock & bg_tasks_lock) +{ + if (schedule) + { + bg_tasks_condvar.wait(bg_tasks_lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); + + /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock + auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(upload_object_tasks); + while (!tasks.empty()) { - auto & task = upload_object_tasks.front(); + auto & task = 
tasks.front(); + if (task.exception) std::rethrow_exception(task.exception); - part_tags.push_back(task.tag); + TSA_SUPPRESS_WARNING_FOR_WRITE(part_tags).push_back(task.tag); - upload_object_tasks.pop_front(); + tasks.pop_front(); } if (put_object_task) { - bg_tasks_condvar.wait(lock, [this]() { return put_object_task->is_finised; }); + bg_tasks_condvar.wait(bg_tasks_lock, [this]() { return put_object_task->is_finished; }); if (put_object_task->exception) std::rethrow_exception(put_object_task->exception); } diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 99440654910..ae03299ffbd 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -80,37 +80,39 @@ private: struct PutObjectTask; void fillPutRequest(Aws::S3::Model::PutObjectRequest & req); - void processPutRequest(PutObjectTask & task); + void processPutRequest(const PutObjectTask & task); void waitForReadyBackGroundTasks(); void waitForAllBackGroundTasks(); + void waitForAllBackGroundTasksUnlocked(std::unique_lock & bg_tasks_lock); + + const String bucket; + const String key; + const S3Settings::ReadWriteSettings s3_settings; + const std::shared_ptr client_ptr; + const std::optional> object_metadata; - String bucket; - String key; - std::shared_ptr client_ptr; size_t upload_part_size = 0; - S3Settings::ReadWriteSettings s3_settings; - std::optional> object_metadata; - - /// Buffer to accumulate data. - std::shared_ptr temporary_buffer; + std::shared_ptr temporary_buffer; /// Buffer to accumulate data. size_t last_part_size = 0; std::atomic total_parts_uploaded = 0; /// Upload in S3 is made in parts. /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. String multipart_upload_id; - std::vector part_tags; + std::vector TSA_GUARDED_BY(bg_tasks_mutex) part_tags; bool is_prefinalized = false; /// Following fields are for background uploads in thread pool (if specified). /// We use std::function to avoid dependency of Interpreters - ScheduleFunc schedule; - std::unique_ptr put_object_task; - std::list upload_object_tasks; - size_t num_added_bg_tasks = 0; - size_t num_finished_bg_tasks = 0; + const ScheduleFunc schedule; + + std::unique_ptr put_object_task; /// Does not need protection by mutex because of the logic around is_finished field. + std::list TSA_GUARDED_BY(bg_tasks_mutex) upload_object_tasks; + size_t num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + size_t num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + std::mutex bg_tasks_mutex; std::condition_variable bg_tasks_condvar; diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index f9892ac6194..7acea87c0d7 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -146,14 +146,14 @@ inline size_t writeFloatTextFastPath(T x, char * buffer) /// The library Ryu has low performance on integers. /// This workaround improves performance 6..10 times. 
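The WriteHelpers.h hunk just below only renames is_integer_in_representable_range to isIntegerInRepresentableRange; the surrounding fast path formats floats that hold exact integer values with an integer routine and falls back to a shortest-round-trip float formatter otherwise. A rough standalone sketch of that dispatch, using std::to_chars (C++17) in place of ClickHouse's itoa and dragonbox, with the 2^53 bound as an assumption of this sketch:

#include <charconv>
#include <cmath>
#include <cstddef>
#include <cstdint>

// Returns the number of characters written to `buffer` (assumed large enough), 0 on error.
static size_t writeDoubleFast(double x, char * buffer, size_t buffer_size)
{
    // Integers up to 2^53 are exactly representable in a double; format them on the integer path.
    if (std::isfinite(x) && std::nearbyint(x) == x && std::fabs(x) < 9007199254740992.0 /* 2^53 */)
    {
        auto [ptr, ec] = std::to_chars(buffer, buffer + buffer_size, static_cast<int64_t>(x));
        return ec == std::errc() ? static_cast<size_t>(ptr - buffer) : 0;
    }

    // General case: shortest round-trip representation.
    auto [ptr, ec] = std::to_chars(buffer, buffer + buffer_size, x);
    return ec == std::errc() ? static_cast<size_t>(ptr - buffer) : 0;
}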
- if (DecomposedFloat64(x).is_integer_in_representable_range()) + if (DecomposedFloat64(x).isIntegerInRepresentableRange()) result = itoa(Int64(x), buffer) - buffer; else result = jkj::dragonbox::to_chars_n(x, buffer) - buffer; } else { - if (DecomposedFloat32(x).is_integer_in_representable_range()) + if (DecomposedFloat32(x).isIntegerInRepresentableRange()) result = itoa(Int32(x), buffer) - buffer; else result = jkj::dragonbox::to_chars_n(x, buffer) - buffer; diff --git a/src/Interpreters/AggregationUtils.cpp b/src/Interpreters/AggregationUtils.cpp new file mode 100644 index 00000000000..43062546450 --- /dev/null +++ b/src/Interpreters/AggregationUtils.cpp @@ -0,0 +1,113 @@ +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +OutputBlockColumns prepareOutputBlockColumns( + const Aggregator::Params & params, + const Aggregator::AggregateFunctionsPlainPtrs & aggregate_functions, + const Block & res_header, + Arenas & aggregates_pools, + bool final, + size_t rows) +{ + MutableColumns key_columns(params.keys_size); + MutableColumns aggregate_columns(params.aggregates_size); + MutableColumns final_aggregate_columns(params.aggregates_size); + Aggregator::AggregateColumnsData aggregate_columns_data(params.aggregates_size); + + for (size_t i = 0; i < params.keys_size; ++i) + { + key_columns[i] = res_header.safeGetByPosition(i).type->createColumn(); + key_columns[i]->reserve(rows); + } + + for (size_t i = 0; i < params.aggregates_size; ++i) + { + if (!final) + { + const auto & aggregate_column_name = params.aggregates[i].column_name; + aggregate_columns[i] = res_header.getByName(aggregate_column_name).type->createColumn(); + + /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. + ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); + + for (auto & pool : aggregates_pools) + column_aggregate_func.addArena(pool); + + aggregate_columns_data[i] = &column_aggregate_func.getData(); + aggregate_columns_data[i]->reserve(rows); + } + else + { + final_aggregate_columns[i] = aggregate_functions[i]->getReturnType()->createColumn(); + final_aggregate_columns[i]->reserve(rows); + + if (aggregate_functions[i]->isState()) + { + /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. + if (auto * column_aggregate_func = typeid_cast(final_aggregate_columns[i].get())) + for (auto & pool : aggregates_pools) + column_aggregate_func->addArena(pool); + + /// Aggregate state can be wrapped into array if aggregate function ends with -Resample combinator. + final_aggregate_columns[i]->forEachSubcolumn( + [&aggregates_pools](auto & subcolumn) + { + if (auto * column_aggregate_func = typeid_cast(subcolumn.get())) + for (auto & pool : aggregates_pools) + column_aggregate_func->addArena(pool); + }); + } + } + } + + if (key_columns.size() != params.keys_size) + throw Exception{"Aggregate. 
Unexpected key columns size.", ErrorCodes::LOGICAL_ERROR}; + + std::vector raw_key_columns; + raw_key_columns.reserve(key_columns.size()); + for (auto & column : key_columns) + raw_key_columns.push_back(column.get()); + + return { + .key_columns = std::move(key_columns), + .raw_key_columns = std::move(raw_key_columns), + .aggregate_columns = std::move(aggregate_columns), + .final_aggregate_columns = std::move(final_aggregate_columns), + .aggregate_columns_data = std::move(aggregate_columns_data), + }; +} + +Block finalizeBlock(const Aggregator::Params & params, const Block & res_header, OutputBlockColumns && out_cols, bool final, size_t rows) +{ + auto && [key_columns, raw_key_columns, aggregate_columns, final_aggregate_columns, aggregate_columns_data] = out_cols; + + Block res = res_header.cloneEmpty(); + + for (size_t i = 0; i < params.keys_size; ++i) + res.getByPosition(i).column = std::move(key_columns[i]); + + for (size_t i = 0; i < params.aggregates_size; ++i) + { + const auto & aggregate_column_name = params.aggregates[i].column_name; + if (final) + res.getByName(aggregate_column_name).column = std::move(final_aggregate_columns[i]); + else + res.getByName(aggregate_column_name).column = std::move(aggregate_columns[i]); + } + + /// Change the size of the columns-constants in the block. + size_t columns = res_header.columns(); + for (size_t i = 0; i < columns; ++i) + if (isColumnConst(*res.getByPosition(i).column)) + res.getByPosition(i).column = res.getByPosition(i).column->cut(0, rows); + + return res; +} +} diff --git a/src/Interpreters/AggregationUtils.h b/src/Interpreters/AggregationUtils.h new file mode 100644 index 00000000000..cc37cec0a69 --- /dev/null +++ b/src/Interpreters/AggregationUtils.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace DB +{ + +struct OutputBlockColumns +{ + MutableColumns key_columns; + std::vector raw_key_columns; + MutableColumns aggregate_columns; + MutableColumns final_aggregate_columns; + Aggregator::AggregateColumnsData aggregate_columns_data; +}; + + +OutputBlockColumns prepareOutputBlockColumns( + const Aggregator::Params & params, + const Aggregator::AggregateFunctionsPlainPtrs & aggregate_functions, + const Block & res_header, + Arenas & aggregates_pools, + bool final, + size_t rows); + +Block finalizeBlock(const Aggregator::Params & params, const Block & res_header, OutputBlockColumns && out_cols, bool final, size_t rows); +} diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 3340170f71b..ef55f92f63a 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -34,6 +34,8 @@ #include +#include + namespace ProfileEvents { extern const Event ExternalAggregationWritePart; @@ -1587,16 +1589,10 @@ Block Aggregator::convertOneBucketToBlock( bool final, size_t bucket) const { - Block block = prepareBlockAndFill(data_variants, final, method.data.impls[bucket].size(), - [bucket, &method, arena, this] ( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_) - { - convertToBlockImpl(method, method.data.impls[bucket], - key_columns, aggregate_columns, final_aggregate_columns, arena, final_); - }); + // Used in ConvertingAggregatedToChunksSource -> ConvertingAggregatedToChunksTransform (expects single chunk for each bucket_id). 
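The AggregationUtils helpers introduced above bundle the columns every conversion path needs into one struct that is built once by prepareOutputBlockColumns, moved around, and unpacked with structured bindings before finalizeBlock. A tiny illustration of that "return a named bundle, destructure at the call site" pattern, with placeholder member types instead of MutableColumns:

#include <cstddef>
#include <string>
#include <utility>
#include <vector>

// Placeholder members; in the patch these are MutableColumns, raw column pointers, etc.
struct OutputColumns
{
    std::vector<std::string> key_columns;
    std::vector<std::string> aggregate_columns;
};

static OutputColumns prepareOutputColumns(size_t keys, size_t aggregates)
{
    std::vector<std::string> key_columns(keys);
    std::vector<std::string> aggregate_columns(aggregates);

    // C++20 designated initializers, as in prepareOutputBlockColumns.
    return {
        .key_columns = std::move(key_columns),
        .aggregate_columns = std::move(aggregate_columns),
    };
}

static size_t finalize(OutputColumns && out)
{
    auto && [key_columns, aggregate_columns] = out;  // unpack exactly like finalizeBlock does
    return key_columns.size() + aggregate_columns.size();
}

// Usage: auto out = prepareOutputColumns(2, 3); size_t n = finalize(std::move(out));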
+ constexpr bool return_single_block = true; + Block block = convertToBlockImpl( + method, method.data.impls[bucket], arena, data_variants.aggregates_pools, final, method.data.impls[bucket].size()); block.info.bucket_num = bucket; return block; @@ -1702,26 +1698,17 @@ bool Aggregator::checkLimits(size_t result_size, bool & no_more_keys) const } -template -void Aggregator::convertToBlockImpl( - Method & method, - Table & data, - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - Arena * arena, - bool final) const +template +Aggregator::ConvertToBlockRes +Aggregator::convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows) const { if (data.empty()) - return; + { + auto && out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, rows); + return {finalizeBlock(params, getHeader(final), std::move(out_cols), final, rows)}; + } - if (key_columns.size() != params.keys_size) - throw Exception{"Aggregate. Unexpected key columns size.", ErrorCodes::LOGICAL_ERROR}; - - std::vector raw_key_columns; - raw_key_columns.reserve(key_columns.size()); - for (auto & column : key_columns) - raw_key_columns.push_back(column.get()); + ConvertToBlockRes res; if (final) { @@ -1729,20 +1716,23 @@ void Aggregator::convertToBlockImpl( if (compiled_aggregate_functions_holder) { static constexpr bool use_compiled_functions = !Method::low_cardinality_optimization; - convertToBlockImplFinal(method, data, std::move(raw_key_columns), final_aggregate_columns, arena); + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows); } else #endif { - convertToBlockImplFinal(method, data, std::move(raw_key_columns), final_aggregate_columns, arena); + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows); } } else { - convertToBlockImplNotFinal(method, data, std::move(raw_key_columns), aggregate_columns); + res = convertToBlockImplNotFinal(method, data, aggregates_pools, rows); } + /// In order to release memory early. data.clearAndShrink(); + + return res; } @@ -1811,38 +1801,9 @@ inline void Aggregator::insertAggregatesIntoColumns(Mapped & mapped, MutableColu } -template -void NO_INLINE Aggregator::convertToBlockImplFinal( - Method & method, - Table & data, - std::vector key_columns, - MutableColumns & final_aggregate_columns, - Arena * arena) const +template +Block Aggregator::insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena) const { - if constexpr (Method::low_cardinality_optimization) - { - if (data.hasNullKeyData()) - { - key_columns[0]->insertDefault(); - insertAggregatesIntoColumns(data.getNullKeyData(), final_aggregate_columns, arena); - } - } - - auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes); - const auto & key_sizes_ref = shuffled_key_sizes ? *shuffled_key_sizes : key_sizes; - - PaddedPODArray places; - places.reserve(data.size()); - - data.forEachValue([&](const auto & key, auto & mapped) - { - method.insertKeyIntoColumns(key, key_columns, key_sizes_ref); - places.emplace_back(mapped); - - /// Mark the cell as destroyed so it will not be destroyed in destructor. 
- mapped = nullptr; - }); - std::exception_ptr exception; size_t aggregate_functions_destroy_index = 0; @@ -1863,7 +1824,7 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( if (!is_aggregate_function_compiled[i]) continue; - auto & final_aggregate_column = final_aggregate_columns[i]; + auto & final_aggregate_column = out_cols.final_aggregate_columns[i]; final_aggregate_column = final_aggregate_column->cloneResized(places.size()); columns_data.emplace_back(getColumnData(final_aggregate_column.get())); } @@ -1884,7 +1845,7 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( } } - auto & final_aggregate_column = final_aggregate_columns[aggregate_functions_destroy_index]; + auto & final_aggregate_column = out_cols.final_aggregate_columns[aggregate_functions_destroy_index]; size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index]; /** We increase aggregate_functions_destroy_index because by function contract if insertResultIntoBatch @@ -1898,7 +1859,8 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( bool is_state = aggregate_functions[destroy_index]->isState(); bool destroy_place_after_insert = !is_state; - aggregate_functions[destroy_index]->insertResultIntoBatch(0, places.size(), places.data(), offset, *final_aggregate_column, arena, destroy_place_after_insert); + aggregate_functions[destroy_index]->insertResultIntoBatch( + 0, places.size(), places.data(), offset, *final_aggregate_column, arena, destroy_place_after_insert); } } catch (...) @@ -1923,125 +1885,155 @@ void NO_INLINE Aggregator::convertToBlockImplFinal( if (exception) std::rethrow_exception(exception); + + return finalizeBlock(params, getHeader(/* final */ true), std::move(out_cols), /* final */ true, places.size()); } -template -void NO_INLINE Aggregator::convertToBlockImplNotFinal( - Method & method, - Table & data, - std::vector key_columns, - AggregateColumnsData & aggregate_columns) const +template +Aggregator::ConvertToBlockRes NO_INLINE +Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t) const { - if constexpr (Method::low_cardinality_optimization) + const size_t max_block_size = params.max_block_size; + const bool final = true; + ConvertToBlockRes res; + + std::optional out_cols; + std::optional shuffled_key_sizes; + PaddedPODArray places; + + auto init_out_cols = [&]() { - if (data.hasNullKeyData()) + out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, max_block_size); + + if constexpr (Method::low_cardinality_optimization) { - key_columns[0]->insertDefault(); - - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(data.getNullKeyData() + offsets_of_aggregate_states[i]); - - data.getNullKeyData() = nullptr; - } - } - - auto shuffled_key_sizes = method.shuffleKeyColumns(key_columns, key_sizes); - const auto & key_sizes_ref = shuffled_key_sizes ? 
*shuffled_key_sizes : key_sizes; - - data.forEachValue([&](const auto & key, auto & mapped) - { - method.insertKeyIntoColumns(key, key_columns, key_sizes_ref); - - /// reserved, so push_back does not throw exceptions - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(mapped + offsets_of_aggregate_states[i]); - - mapped = nullptr; - }); -} - - -template -Block Aggregator::prepareBlockAndFill( - AggregatedDataVariants & data_variants, - bool final, - size_t rows, - Filler && filler) const -{ - MutableColumns key_columns(params.keys_size); - MutableColumns aggregate_columns(params.aggregates_size); - MutableColumns final_aggregate_columns(params.aggregates_size); - AggregateColumnsData aggregate_columns_data(params.aggregates_size); - - Block res_header = getHeader(final); - - for (size_t i = 0; i < params.keys_size; ++i) - { - key_columns[i] = res_header.safeGetByPosition(i).type->createColumn(); - key_columns[i]->reserve(rows); - } - - for (size_t i = 0; i < params.aggregates_size; ++i) - { - if (!final) - { - const auto & aggregate_column_name = params.aggregates[i].column_name; - aggregate_columns[i] = res_header.getByName(aggregate_column_name).type->createColumn(); - - /// The ColumnAggregateFunction column captures the shared ownership of the arena with the aggregate function states. - ColumnAggregateFunction & column_aggregate_func = assert_cast(*aggregate_columns[i]); - - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func.addArena(pool); - - aggregate_columns_data[i] = &column_aggregate_func.getData(); - aggregate_columns_data[i]->reserve(rows); - } - else - { - final_aggregate_columns[i] = aggregate_functions[i]->getReturnType()->createColumn(); - final_aggregate_columns[i]->reserve(rows); - - if (aggregate_functions[i]->isState()) + if (data.hasNullKeyData()) { - /// The ColumnAggregateFunction column captures the shared ownership of the arena with aggregate function states. - if (auto * column_aggregate_func = typeid_cast(final_aggregate_columns[i].get())) - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func->addArena(pool); - - /// Aggregate state can be wrapped into array if aggregate function ends with -Resample combinator. - final_aggregate_columns[i]->forEachSubcolumn([&data_variants](auto & subcolumn) - { - if (auto * column_aggregate_func = typeid_cast(subcolumn.get())) - for (auto & pool : data_variants.aggregates_pools) - column_aggregate_func->addArena(pool); - }); + out_cols->key_columns[0]->insertDefault(); + insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena); + data.hasNullKeyData() = false; } } - } - filler(key_columns, aggregate_columns_data, final_aggregate_columns, final); + shuffled_key_sizes = method.shuffleKeyColumns(out_cols->raw_key_columns, key_sizes); - Block res = res_header.cloneEmpty(); + places.reserve(max_block_size); + }; - for (size_t i = 0; i < params.keys_size; ++i) - res.getByPosition(i).column = std::move(key_columns[i]); + // should be invoked at least once, because null data might be the only content of the `data` + init_out_cols(); - for (size_t i = 0; i < params.aggregates_size; ++i) + data.forEachValue( + [&](const auto & key, auto & mapped) + { + if (!out_cols.has_value()) + init_out_cols(); + + const auto & key_sizes_ref = shuffled_key_sizes ? 
*shuffled_key_sizes : key_sizes; + method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref); + places.emplace_back(mapped); + + /// Mark the cell as destroyed so it will not be destroyed in destructor. + mapped = nullptr; + + if constexpr (!return_single_block) + { + if (places.size() >= max_block_size) + { + res.emplace_back(insertResultsIntoColumns(places, std::move(out_cols.value()), arena)); + places.clear(); + out_cols.reset(); + } + } + }); + + if constexpr (return_single_block) { - const auto & aggregate_column_name = params.aggregates[i].column_name; - if (final) - res.getByName(aggregate_column_name).column = std::move(final_aggregate_columns[i]); - else - res.getByName(aggregate_column_name).column = std::move(aggregate_columns[i]); + return insertResultsIntoColumns(places, std::move(out_cols.value()), arena); } + else + { + if (out_cols.has_value()) + res.emplace_back(insertResultsIntoColumns(places, std::move(out_cols.value()), arena)); + return res; + } +} - /// Change the size of the columns-constants in the block. - size_t columns = res_header.columns(); - for (size_t i = 0; i < columns; ++i) - if (isColumnConst(*res.getByPosition(i).column)) - res.getByPosition(i).column = res.getByPosition(i).column->cut(0, rows); +template +Aggregator::ConvertToBlockRes NO_INLINE +Aggregator::convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t) const +{ + const size_t max_block_size = params.max_block_size; + const bool final = false; + ConvertToBlockRes res; + std::optional out_cols; + std::optional shuffled_key_sizes; + + auto init_out_cols = [&]() + { + out_cols = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), aggregates_pools, final, max_block_size); + + if constexpr (Method::low_cardinality_optimization) + { + if (data.hasNullKeyData()) + { + out_cols->raw_key_columns[0]->insertDefault(); + + for (size_t i = 0; i < params.aggregates_size; ++i) + out_cols->aggregate_columns_data[i]->push_back(data.getNullKeyData() + offsets_of_aggregate_states[i]); + + data.getNullKeyData() = nullptr; + data.hasNullKeyData() = false; + } + } + + shuffled_key_sizes = method.shuffleKeyColumns(out_cols->raw_key_columns, key_sizes); + }; + + // should be invoked at least once, because null data might be the only content of the `data` + init_out_cols(); + + size_t rows_in_current_block = 0; + + data.forEachValue( + [&](const auto & key, auto & mapped) + { + if (!out_cols.has_value()) + init_out_cols(); + + const auto & key_sizes_ref = shuffled_key_sizes ? 
*shuffled_key_sizes : key_sizes; + method.insertKeyIntoColumns(key, out_cols->raw_key_columns, key_sizes_ref); + + /// reserved, so push_back does not throw exceptions + for (size_t i = 0; i < params.aggregates_size; ++i) + out_cols->aggregate_columns_data[i]->push_back(mapped + offsets_of_aggregate_states[i]); + + mapped = nullptr; + + ++rows_in_current_block; + + if constexpr (!return_single_block) + { + if (rows_in_current_block >= max_block_size) + { + res.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols.value()), final, rows_in_current_block)); + out_cols.reset(); + rows_in_current_block = 0; + } + } + }); + + if constexpr (return_single_block) + { + return finalizeBlock(params, getHeader(final), std::move(out_cols).value(), final, rows_in_current_block); + } + else + { + if (rows_in_current_block) + res.emplace_back(finalizeBlock(params, getHeader(final), std::move(out_cols).value(), final, rows_in_current_block)); + return res; + } return res; } @@ -2105,39 +2097,35 @@ void Aggregator::createStatesAndFillKeyColumnsWithSingleKey( Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows) const { size_t rows = 1; + auto && out_cols + = prepareOutputBlockColumns(params, aggregate_functions, getHeader(final), data_variants.aggregates_pools, final, rows); + auto && [key_columns, raw_key_columns, aggregate_columns, final_aggregate_columns, aggregate_columns_data] = out_cols; - auto filler = [&data_variants, this]( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_) + if (data_variants.type == AggregatedDataVariants::Type::without_key || params.overflow_row) { - if (data_variants.type == AggregatedDataVariants::Type::without_key || params.overflow_row) + AggregatedDataWithoutKey & data = data_variants.without_key; + + if (!data) + throw Exception("Wrong data variant passed.", ErrorCodes::LOGICAL_ERROR); + + if (!final) { - AggregatedDataWithoutKey & data = data_variants.without_key; - - if (!data) - throw Exception("Wrong data variant passed.", ErrorCodes::LOGICAL_ERROR); - - if (!final_) - { - for (size_t i = 0; i < params.aggregates_size; ++i) - aggregate_columns[i]->push_back(data + offsets_of_aggregate_states[i]); - data = nullptr; - } - else - { - /// Always single-thread. It's safe to pass current arena from 'aggregates_pool'. - insertAggregatesIntoColumns(data, final_aggregate_columns, data_variants.aggregates_pool); - } - - if (params.overflow_row) - for (size_t i = 0; i < params.keys_size; ++i) - key_columns[i]->insertDefault(); + for (size_t i = 0; i < params.aggregates_size; ++i) + aggregate_columns_data[i]->push_back(data + offsets_of_aggregate_states[i]); + data = nullptr; + } + else + { + /// Always single-thread. It's safe to pass current arena from 'aggregates_pool'. 
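The convertToBlockImplFinal/NotFinal rewrites above stop materializing a single huge block: when return_single_block is false they flush a block every time max_block_size rows have been accumulated and then reset the output columns. A stripped-down sketch of that flush-on-threshold loop over generic rows (plain vectors stand in for blocks and columns):

#include <cstddef>
#include <vector>

// Split `rows` into batches of at most `max_block_size`, mirroring the
// "flush when the current block is full, then start a new one" loop.
template <typename Row>
std::vector<std::vector<Row>> splitIntoBlocks(const std::vector<Row> & rows, size_t max_block_size)
{
    std::vector<std::vector<Row>> blocks;
    std::vector<Row> current;
    current.reserve(max_block_size);

    for (const auto & row : rows)
    {
        current.push_back(row);
        if (current.size() >= max_block_size)
        {
            blocks.push_back(std::move(current));
            current.clear();                  // analogous to out_cols.reset()
            current.reserve(max_block_size);
        }
    }

    if (!current.empty())                     // remainder, like the trailing emplace_back
        blocks.push_back(std::move(current));

    return blocks;
}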
+ insertAggregatesIntoColumns(data, final_aggregate_columns, data_variants.aggregates_pool); } - }; - Block block = prepareBlockAndFill(data_variants, final, rows, filler); + if (params.overflow_row) + for (size_t i = 0; i < params.keys_size; ++i) + key_columns[i]->insertDefault(); + } + + Block block = finalizeBlock(params, getHeader(final), std::move(out_cols), final, rows); if (is_overflows) block.info.is_overflows = true; @@ -2148,29 +2136,22 @@ Block Aggregator::prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_va return block; } -Block Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const +template +Aggregator::ConvertToBlockRes +Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const { - size_t rows = data_variants.sizeWithoutOverflowRow(); + const size_t rows = data_variants.sizeWithoutOverflowRow(); +#define M(NAME) \ + else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ + { \ + return convertToBlockImpl( \ + *data_variants.NAME, data_variants.NAME->data, data_variants.aggregates_pool, data_variants.aggregates_pools, final, rows); \ + } - auto filler = [&data_variants, this]( - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - bool final_) - { - #define M(NAME) \ - else if (data_variants.type == AggregatedDataVariants::Type::NAME) \ - convertToBlockImpl(*data_variants.NAME, data_variants.NAME->data, \ - key_columns, aggregate_columns, final_aggregate_columns, data_variants.aggregates_pool, final_); - - if (false) {} // NOLINT - APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) - #undef M - else - throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - }; - - return prepareBlockAndFill(data_variants, final, rows, filler); + if (false) {} // NOLINT + APPLY_FOR_VARIANTS_SINGLE_LEVEL(M) +#undef M + else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); } @@ -2292,7 +2273,7 @@ BlocksList Aggregator::convertToBlocks(AggregatedDataVariants & data_variants, b if (data_variants.type != AggregatedDataVariants::Type::without_key) { if (!data_variants.isTwoLevel()) - blocks.emplace_back(prepareBlockAndFillSingleLevel(data_variants, final)); + blocks.splice(blocks.end(), prepareBlockAndFillSingleLevel(data_variants, final)); else blocks.splice(blocks.end(), prepareBlocksAndFillTwoLevel(data_variants, final, thread_pool.get())); } @@ -3044,9 +3025,15 @@ Block Aggregator::mergeBlocks(BlocksList & blocks, bool final) Block block; if (result.type == AggregatedDataVariants::Type::without_key || is_overflows) + { block = prepareBlockAndFillWithoutKey(result, final, is_overflows); + } else - block = prepareBlockAndFillSingleLevel(result, final); + { + // Used during memory efficient merging (SortingAggregatedTransform expects single chunk for each bucket_id). + constexpr bool return_single_block = true; + block = prepareBlockAndFillSingleLevel(result, final); + } /// NOTE: two-level data is not possible here - chooseAggregationMethod chooses only among single-level methods. 
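prepareBlockAndFillSingleLevel and convertToBlockImpl are now templated on return_single_block, with ConvertToBlockRes choosing between Block and BlocksList via std::conditional_t, and mergeBlocks/convertOneBucketToBlock instantiating the single-block flavour. A minimal sketch of that compile-time return-type switch, with stand-in types instead of DB::Block and BlocksList:

#include <cstddef>
#include <list>
#include <string>
#include <type_traits>

using Block = std::string;         // stand-in for DB::Block
using BlocksList = std::list<Block>;

template <bool return_single_block>
using ConvertResult = std::conditional_t<return_single_block, Block, BlocksList>;

template <bool return_single_block>
ConvertResult<return_single_block> convert(size_t n_blocks)
{
    if constexpr (return_single_block)
    {
        return Block("single block");          // callers that expect exactly one chunk per bucket
    }
    else
    {
        BlocksList res;
        for (size_t i = 0; i < n_blocks; ++i)
            res.emplace_back("block " + std::to_string(i));
        return res;                            // streaming callers get a list of bounded blocks
    }
}

// Usage: Block b = convert<true>(1);  BlocksList l = convert<false>(4);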
if (!final) @@ -3247,4 +3234,6 @@ void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) cons } +template Aggregator::ConvertToBlockRes +Aggregator::prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const; } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index ce63c24969a..1d317e0a93a 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -1,8 +1,9 @@ #pragma once -#include -#include #include +#include +#include +#include #include @@ -872,6 +873,7 @@ using ManyAggregatedDataVariantsPtr = std::shared_ptr - void convertToBlockImpl( - Method & method, - Table & data, - MutableColumns & key_columns, - AggregateColumnsData & aggregate_columns, - MutableColumns & final_aggregate_columns, - Arena * arena, - bool final) const; + template + using ConvertToBlockRes = std::conditional_t; + + template + ConvertToBlockRes + convertToBlockImpl(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool final, size_t rows) const; template void insertAggregatesIntoColumns( @@ -1293,27 +1296,16 @@ private: MutableColumns & final_aggregate_columns, Arena * arena) const; - template - void convertToBlockImplFinal( - Method & method, - Table & data, - std::vector key_columns, - MutableColumns & final_aggregate_columns, - Arena * arena) const; + template + Block insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena) const; - template - void convertToBlockImplNotFinal( - Method & method, - Table & data, - std::vector key_columns, - AggregateColumnsData & aggregate_columns) const; + template + ConvertToBlockRes + convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows) const; - template - Block prepareBlockAndFill( - AggregatedDataVariants & data_variants, - bool final, - size_t rows, - Filler && filler) const; + template + ConvertToBlockRes + convertToBlockImplNotFinal(Method & method, Table & data, Arenas & aggregates_pools, size_t rows) const; template Block convertOneBucketToBlock( @@ -1331,9 +1323,11 @@ private: std::atomic * is_cancelled = nullptr) const; Block prepareBlockAndFillWithoutKey(AggregatedDataVariants & data_variants, bool final, bool is_overflows) const; - Block prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const; BlocksList prepareBlocksAndFillTwoLevel(AggregatedDataVariants & data_variants, bool final, ThreadPool * thread_pool) const; + template + ConvertToBlockRes prepareBlockAndFillSingleLevel(AggregatedDataVariants & data_variants, bool final) const; + template BlocksList prepareBlocksAndFillTwoLevelImpl( AggregatedDataVariants & data_variants, diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 2fa5792fb43..f51df9ae737 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -122,6 +122,7 @@ void FileCache::initialize() fs::create_directories(cache_base_path); } + status_file = make_unique(fs::path(cache_base_path) / "status", StatusFile::write_full_info); is_initialized = true; } } @@ -963,12 +964,19 @@ void FileCache::loadCacheInfoIntoMemory(std::lock_guard & cache_lock fs::directory_iterator key_prefix_it{cache_base_path}; for (; key_prefix_it != fs::directory_iterator(); ++key_prefix_it) { + if (!key_prefix_it->is_directory()) + { + if (key_prefix_it->path().filename() != "status") + LOG_DEBUG(log, "Unexpected file {} (not a directory), will skip 
it", key_prefix_it->path().string()); + continue; + } + fs::directory_iterator key_it{key_prefix_it->path()}; for (; key_it != fs::directory_iterator(); ++key_it) { if (!key_it->is_directory()) { - LOG_WARNING(log, "Unexpected file: {}. Expected a directory", key_it->path().string()); + LOG_DEBUG(log, "Unexpected file {} (not a directory), will skip it", key_it->path().string()); continue; } diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h index e12d1684e9a..3f5a5c9e1c5 100644 --- a/src/Interpreters/Cache/FileCache.h +++ b/src/Interpreters/Cache/FileCache.h @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB { @@ -143,6 +144,7 @@ private: bool is_initialized = false; std::exception_ptr initialization_exception; + std::unique_ptr status_file; mutable std::mutex mutex; diff --git a/src/Interpreters/Cache/FileCacheSettings.cpp b/src/Interpreters/Cache/FileCacheSettings.cpp index b08c80f20db..819eeaf4140 100644 --- a/src/Interpreters/Cache/FileCacheSettings.cpp +++ b/src/Interpreters/Cache/FileCacheSettings.cpp @@ -31,7 +31,7 @@ void FileCacheSettings::loadFromConfig(const Poco::Util::AbstractConfiguration & enable_filesystem_query_cache_limit = config.getUInt64(config_prefix + ".enable_filesystem_query_cache_limit", false); enable_cache_hits_threshold = config.getUInt64(config_prefix + ".enable_cache_hits_threshold", REMOTE_FS_OBJECTS_CACHE_ENABLE_HITS_THRESHOLD); - do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", true); + do_not_evict_index_and_mark_files = config.getUInt64(config_prefix + ".do_not_evict_index_and_mark_files", false); } } diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index 3ea846101f5..a1096b99325 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -62,9 +62,8 @@ public: time_t initial_query_start_time{}; Decimal64 initial_query_start_time_microseconds{}; - // OpenTelemetry trace context we received from client, or which we are going - // to send to server. - OpenTelemetryTraceContext client_trace_context; + /// OpenTelemetry trace context we received from client, or which we are going to send to server. + OpenTelemetry::TracingContext client_trace_context; /// All below are parameters related to initial query. diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 87c2a997afa..18d24e7a7d0 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1344,29 +1344,6 @@ void Context::setCurrentQueryId(const String & query_id) random.words.a = thread_local_rng(); //-V656 random.words.b = thread_local_rng(); //-V656 - if (client_info.client_trace_context.trace_id != UUID()) - { - // Use the OpenTelemetry trace context we received from the client, and - // create a new span for the query. - query_trace_context = client_info.client_trace_context; - query_trace_context.span_id = thread_local_rng(); - } - else if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY) - { - // If this is an initial query without any parent OpenTelemetry trace, we - // might start the trace ourselves, with some configurable probability. - std::bernoulli_distribution should_start_trace{ - settings.opentelemetry_start_trace_probability}; - - if (should_start_trace(thread_local_rng)) - { - // Use the randomly generated default query id as the new trace id. 
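The Context::setCurrentQueryId block removed in this hunk decided whether to start an OpenTelemetry trace for an initial query by drawing from std::bernoulli_distribution with the opentelemetry_start_trace_probability setting; the patch drops it here as part of the move to OpenTelemetry::TracingContext. A small sketch of that probabilistic sampling decision, with the settings object and trace fields in the usage comment being hypothetical:

#include <random>

// Decide once per query whether to start a trace, given a configured probability in [0, 1].
static bool shouldStartTrace(double start_trace_probability)
{
    thread_local std::mt19937_64 rng{std::random_device{}()};
    std::bernoulli_distribution should_start_trace{start_trace_probability};
    return should_start_trace(rng);
}

// Usage (hypothetical settings and trace context):
//   if (shouldStartTrace(settings.opentelemetry_start_trace_probability))
//   {
//       trace_context.trace_id    = query_uuid;  // reuse the generated query id as the trace id
//       trace_context.span_id     = rng();       // new root span
//       trace_context.trace_flags = 1;           // mark the trace as sampled
//   }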
- query_trace_context.trace_id = random.uuid; - query_trace_context.span_id = thread_local_rng(); - // Mark this trace as sampled in the flags. - query_trace_context.trace_flags = 1; - } - } String query_id_to_set = query_id; if (query_id_to_set.empty()) /// If the user did not submit his query_id, then we generate it ourselves. diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index b8a0d9ab600..8bab5707298 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -364,8 +364,26 @@ private: bool apply_deleted_mask = true; public: - // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. - OpenTelemetryTraceContext query_trace_context; + /// Some counters for current query execution. + /// Most of them are workarounds and should be removed in the future. + struct KitchenSink + { + std::atomic analyze_counter = 0; + + KitchenSink() = default; + + KitchenSink(const KitchenSink & rhs) + : analyze_counter(rhs.analyze_counter.load()) + {} + + KitchenSink & operator=(const KitchenSink & rhs) + { + analyze_counter = rhs.analyze_counter.load(); + return *this; + } + }; + + KitchenSink kitchen_sink; private: using SampleBlockCache = std::unordered_map; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 105d46eed1f..be32125edf8 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -725,7 +725,7 @@ void ExpressionAnalyzer::makeWindowDescriptionFromAST(const Context & context_, with_alias->getColumnName(), 1 /* direction */, 1 /* nulls_direction */)); - auto actions_dag = std::make_shared(columns_after_join); + auto actions_dag = std::make_shared(aggregated_columns); getRootActions(column_ast, false, actions_dag); desc.partition_by_actions.push_back(std::move(actions_dag)); } @@ -746,7 +746,7 @@ void ExpressionAnalyzer::makeWindowDescriptionFromAST(const Context & context_, order_by_element.direction, order_by_element.nulls_direction)); - auto actions_dag = std::make_shared(columns_after_join); + auto actions_dag = std::make_shared(aggregated_columns); getRootActions(column_ast, false, actions_dag); desc.order_by_actions.push_back(std::move(actions_dag)); } diff --git a/src/Interpreters/IInterpreterUnionOrSelectQuery.h b/src/Interpreters/IInterpreterUnionOrSelectQuery.h index a1c86f9de85..6f893d4703e 100644 --- a/src/Interpreters/IInterpreterUnionOrSelectQuery.h +++ b/src/Interpreters/IInterpreterUnionOrSelectQuery.h @@ -58,6 +58,8 @@ public: /// Add limits from external query. void addStorageLimits(const StorageLimitsList & limits); + ContextPtr getContext() const { return context; } + protected: ASTPtr query_ptr; ContextMutablePtr context; diff --git a/src/Interpreters/InterpreterDeleteQuery.cpp b/src/Interpreters/InterpreterDeleteQuery.cpp index 5e5bebfdc58..51fb6cfb948 100644 --- a/src/Interpreters/InterpreterDeleteQuery.cpp +++ b/src/Interpreters/InterpreterDeleteQuery.cpp @@ -21,7 +21,6 @@ namespace DB namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int TABLE_IS_READ_ONLY; extern const int SUPPORT_IS_DISABLED; } @@ -34,11 +33,6 @@ InterpreterDeleteQuery::InterpreterDeleteQuery(const ASTPtr & query_ptr_, Contex BlockIO InterpreterDeleteQuery::execute() { - if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete) - { - throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. 
Set `allow_experimental_lightweight_delete` setting to enable it"); - } - FunctionNameNormalizer().visit(query_ptr.get()); const ASTDeleteQuery & delete_query = query_ptr->as(); auto table_id = getContext()->resolveStorageID(delete_query, Context::ResolveOrdinary); @@ -49,10 +43,6 @@ BlockIO InterpreterDeleteQuery::execute() /// First check table storage for validations. StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext()); - auto merge_tree = std::dynamic_pointer_cast(table); - if (!merge_tree) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only MergeTree family tables are supported"); - checkStorageSupportsTransactionsIfNeeded(table, getContext()); if (table->isStaticStorage()) throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only"); @@ -69,6 +59,27 @@ BlockIO InterpreterDeleteQuery::execute() auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + auto merge_tree = std::dynamic_pointer_cast(table); + if (!merge_tree) + { + /// Convert to MutationCommand + MutationCommands mutation_commands; + MutationCommand mut_command; + + mut_command.type = MutationCommand::Type::DELETE; + mut_command.predicate = delete_query.predicate; + + mutation_commands.emplace_back(mut_command); + + table->checkMutationIsPossible(mutation_commands, getContext()->getSettingsRef()); + MutationsInterpreter(table, metadata_snapshot, mutation_commands, getContext(), false).validate(); + table->mutate(mutation_commands, getContext()); + return {}; + } + + if (!getContext()->getSettingsRef().allow_experimental_lightweight_delete) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Lightweight delete mutate is experimental. 
Set `allow_experimental_lightweight_delete` setting to enable it"); + /// Convert to MutationCommand MutationCommands mutation_commands; MutationCommand mut_command; diff --git a/src/Interpreters/InterpreterDescribeQuery.cpp b/src/Interpreters/InterpreterDescribeQuery.cpp index 9919b1272bd..0524feea1f6 100644 --- a/src/Interpreters/InterpreterDescribeQuery.cpp +++ b/src/Interpreters/InterpreterDescribeQuery.cpp @@ -163,7 +163,7 @@ BlockIO InterpreterDescribeQuery::execute() res_columns[6]->insertDefault(); res_columns[7]->insert(1u); - }, { type->getDefaultSerialization(), type, nullptr, nullptr }); + }, ISerialization::SubstreamData(type->getDefaultSerialization()).withType(type)); } } diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 4799970b6a1..746d382198d 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -316,7 +316,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() interpreter.buildQueryPlan(plan); if (settings.optimize) - plan.optimize(QueryPlanOptimizationSettings::fromContext(getContext())); + plan.optimize(QueryPlanOptimizationSettings::fromContext(interpreter.getContext())); if (settings.json) { @@ -326,7 +326,7 @@ QueryPipeline InterpreterExplainQuery::executeImpl() auto plan_array = std::make_unique(); plan_array->add(std::move(plan_map)); - auto format_settings = getFormatSettings(getContext()); + auto format_settings = getFormatSettings(interpreter.getContext()); format_settings.json.quote_64bit_integers = false; JSONBuilder::FormatSettings json_format_settings{.settings = format_settings}; diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index 00183086bf6..170f3c463b4 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -114,8 +114,6 @@ namespace ErrorCodes std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { - OpenTelemetrySpanHolder span("InterpreterFactory::get()"); - ProfileEvents::increment(ProfileEvents::Query); if (query->as()) diff --git a/src/Interpreters/InterpreterInsertQuery.cpp b/src/Interpreters/InterpreterInsertQuery.cpp index 7b6066575ae..1e1b22cb5e0 100644 --- a/src/Interpreters/InterpreterInsertQuery.cpp +++ b/src/Interpreters/InterpreterInsertQuery.cpp @@ -346,7 +346,7 @@ BlockIO InterpreterInsertQuery::execute() const auto & union_modes = select_query.list_of_modes; /// ASTSelectWithUnionQuery is not normalized now, so it may pass some queries which can be Trivial select queries - const auto mode_is_all = [](const auto & mode) { return mode == SelectUnionMode::ALL; }; + const auto mode_is_all = [](const auto & mode) { return mode == SelectUnionMode::UNION_ALL; }; is_trivial_insert_select = std::all_of(union_modes.begin(), union_modes.end(), std::move(mode_is_all)) diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index f4dbbaec16d..c22863ef8e5 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -16,6 +16,7 @@ namespace DB namespace ErrorCodes { extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; } InterpreterRenameQuery::InterpreterRenameQuery(const ASTPtr & query_ptr_, ContextPtr context_) @@ -31,11 +32,11 @@ BlockIO InterpreterRenameQuery::execute() if (!rename.cluster.empty()) { DDLQueryOnClusterParams params; - params.access_to_check = 
getRequiredAccess(); + params.access_to_check = getRequiredAccess(rename.database ? RenameType::RenameDatabase : RenameType::RenameTable); return executeDDLQueryOnCluster(query_ptr, getContext(), params); } - getContext()->checkAccess(getRequiredAccess()); + getContext()->checkAccess(getRequiredAccess(rename.database ? RenameType::RenameDatabase : RenameType::RenameTable)); String path = getContext()->getPath(); String current_database = getContext()->getCurrentDatabase(); @@ -165,18 +166,30 @@ BlockIO InterpreterRenameQuery::executeToDatabase(const ASTRenameQuery &, const return {}; } -AccessRightsElements InterpreterRenameQuery::getRequiredAccess() const +AccessRightsElements InterpreterRenameQuery::getRequiredAccess(InterpreterRenameQuery::RenameType type) const { AccessRightsElements required_access; const auto & rename = query_ptr->as(); for (const auto & elem : rename.elements) { - required_access.emplace_back(AccessType::SELECT | AccessType::DROP_TABLE, elem.from.database, elem.from.table); - required_access.emplace_back(AccessType::CREATE_TABLE | AccessType::INSERT, elem.to.database, elem.to.table); - if (rename.exchange) + if (type == RenameType::RenameTable) { - required_access.emplace_back(AccessType::CREATE_TABLE | AccessType::INSERT, elem.from.database, elem.from.table); - required_access.emplace_back(AccessType::SELECT | AccessType::DROP_TABLE, elem.to.database, elem.to.table); + required_access.emplace_back(AccessType::SELECT | AccessType::DROP_TABLE, elem.from.database, elem.from.table); + required_access.emplace_back(AccessType::CREATE_TABLE | AccessType::INSERT, elem.to.database, elem.to.table); + if (rename.exchange) + { + required_access.emplace_back(AccessType::CREATE_TABLE | AccessType::INSERT , elem.from.database, elem.from.table); + required_access.emplace_back(AccessType::SELECT | AccessType::DROP_TABLE, elem.to.database, elem.to.table); + } + } + else if (type == RenameType::RenameDatabase) + { + required_access.emplace_back(AccessType::SELECT | AccessType::DROP_DATABASE, elem.from.database); + required_access.emplace_back(AccessType::CREATE_DATABASE | AccessType::INSERT, elem.to.database); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown type of rename query"); } } return required_access; diff --git a/src/Interpreters/InterpreterRenameQuery.h b/src/Interpreters/InterpreterRenameQuery.h index 194f6266634..6141e8c1585 100644 --- a/src/Interpreters/InterpreterRenameQuery.h +++ b/src/Interpreters/InterpreterRenameQuery.h @@ -63,7 +63,13 @@ private: BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards); BlockIO executeToDatabase(const ASTRenameQuery & rename, const RenameDescriptions & descriptions); - AccessRightsElements getRequiredAccess() const; + enum class RenameType + { + RenameTable, + RenameDatabase + }; + + AccessRightsElements getRequiredAccess(RenameType type) const; ASTPtr query_ptr; bool renamed_instead_of_exchange{false}; diff --git a/src/Interpreters/InterpreterSelectIntersectExceptQuery.cpp b/src/Interpreters/InterpreterSelectIntersectExceptQuery.cpp index d6add3f77a9..a134f7bb913 100644 --- a/src/Interpreters/InterpreterSelectIntersectExceptQuery.cpp +++ b/src/Interpreters/InterpreterSelectIntersectExceptQuery.cpp @@ -4,11 +4,14 @@ #include #include #include +#include +#include #include #include +#include +#include #include #include -#include #include @@ -134,10 +137,29 @@ void InterpreterSelectIntersectExceptQuery::buildQueryPlan(QueryPlan & query_pla 
data_streams[i] = plans[i]->getCurrentDataStream(); } - auto max_threads = context->getSettingsRef().max_threads; + const Settings & settings = context->getSettingsRef(); + auto max_threads = settings.max_threads; auto step = std::make_unique(std::move(data_streams), final_operator, max_threads); query_plan.unitePlans(std::move(step), std::move(plans)); + const auto & query = query_ptr->as(); + if (query.final_operator == ASTSelectIntersectExceptQuery::Operator::INTERSECT_DISTINCT + || query.final_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT_DISTINCT) + { + /// Add distinct transform + SizeLimits limits(settings.max_rows_in_distinct, settings.max_bytes_in_distinct, settings.distinct_overflow_mode); + + auto distinct_step = std::make_unique( + query_plan.getCurrentDataStream(), + limits, + 0, + result_header.getNames(), + false, + settings.optimize_distinct_in_order); + + query_plan.addStep(std::move(distinct_step)); + } + addAdditionalPostFilter(query_plan); query_plan.addInterpreterContext(context); } diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 08e9da3f003..c73db82a27b 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -639,7 +640,18 @@ InterpreterSelectQuery::InterpreterSelectQuery( analyze(shouldMoveToPrewhere()); bool need_analyze_again = false; - if (analysis_result.prewhere_constant_filter_description.always_false || analysis_result.prewhere_constant_filter_description.always_true) + bool can_analyze_again = false; + if (context->hasQueryContext()) + { + /// Check number of calls of 'analyze' function. + /// If it is too big, we will not analyze the query again not to have exponential blowup. 
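// A minimal sketch of the analyze-depth guard introduced here: a per-query atomic counter is
// incremented on every analyze() pass and compared with max_analyze_depth (0 means "no limit").
// The names are illustrative, not the real Settings or Context members.
#include <atomic>
#include <cstdint>
#include <cassert>

static bool canAnalyzeAgain(std::atomic<uint64_t> & analyze_counter, uint64_t max_analyze_depth)
{
    const uint64_t current = ++analyze_counter;
    return max_analyze_depth == 0 || current < max_analyze_depth;
}

int main()
{
    std::atomic<uint64_t> counter{0};
    assert(canAnalyzeAgain(counter, 2));   // first re-analysis is allowed
    assert(!canAnalyzeAgain(counter, 2));  // the next one hits the depth limit
}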
+ std::atomic & current_query_analyze_count = context->getQueryContext()->kitchen_sink.analyze_counter; + ++current_query_analyze_count; + can_analyze_again = settings.max_analyze_depth == 0 || current_query_analyze_count < settings.max_analyze_depth; + } + + if (can_analyze_again && (analysis_result.prewhere_constant_filter_description.always_false || + analysis_result.prewhere_constant_filter_description.always_true)) { if (analysis_result.prewhere_constant_filter_description.always_true) query.setExpression(ASTSelectQuery::Expression::PREWHERE, {}); @@ -647,7 +659,9 @@ InterpreterSelectQuery::InterpreterSelectQuery( query.setExpression(ASTSelectQuery::Expression::PREWHERE, std::make_shared(0u)); need_analyze_again = true; } - if (analysis_result.where_constant_filter_description.always_false || analysis_result.where_constant_filter_description.always_true) + + if (can_analyze_again && (analysis_result.where_constant_filter_description.always_false || + analysis_result.where_constant_filter_description.always_true)) { if (analysis_result.where_constant_filter_description.always_true) query.setExpression(ASTSelectQuery::Expression::WHERE, {}); @@ -658,7 +672,8 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (need_analyze_again) { - LOG_TRACE(log, "Running 'analyze' second time"); + size_t current_query_analyze_count = context->getQueryContext()->kitchen_sink.analyze_counter.load(); + LOG_TRACE(log, "Running 'analyze' second time (current analyze depth: {})", current_query_analyze_count); /// Reuse already built sets for multiple passes of analysis prepared_sets = query_analyzer->getPreparedSets(); @@ -1422,7 +1437,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional

context->getTemporaryVolume(), settings.min_free_disk_space_for_temporary_data, settings.optimize_sorting_by_input_stream_properties); - sorting_step->setStepDescription(fmt::format("Sort {} before JOIN", is_right ? "right" : "left")); + sorting_step->setStepDescription(fmt::format("Sort {} before JOIN", join_pos)); plan.addStep(std::move(sorting_step)); }; + auto crosswise_connection = CreateSetAndFilterOnTheFlyStep::createCrossConnection(); + auto add_create_set = [&settings, crosswise_connection](QueryPlan & plan, const Names & key_names, JoinTableSide join_pos) + { + auto creating_set_step = std::make_unique( + plan.getCurrentDataStream(), key_names, settings.max_rows_in_set_to_optimize_join, crosswise_connection, join_pos); + creating_set_step->setStepDescription(fmt::format("Create set and filter {} joined stream", join_pos)); + + auto * step_raw_ptr = creating_set_step.get(); + plan.addStep(std::move(creating_set_step)); + return step_raw_ptr; + }; + if (expressions.join->pipelineType() == JoinPipelineType::YShaped) { - const auto & join_clause = expressions.join->getTableJoin().getOnlyClause(); - add_sorting(query_plan, join_clause.key_names_left, false); - add_sorting(*joined_plan, join_clause.key_names_right, true); + const auto & table_join = expressions.join->getTableJoin(); + const auto & join_clause = table_join.getOnlyClause(); + + auto join_kind = table_join.kind(); + bool kind_allows_filtering = isInner(join_kind) || isLeft(join_kind) || isRight(join_kind); + if (settings.max_rows_in_set_to_optimize_join > 0 && kind_allows_filtering) + { + auto * left_set = add_create_set(query_plan, join_clause.key_names_left, JoinTableSide::Left); + auto * right_set = add_create_set(*joined_plan, join_clause.key_names_right, JoinTableSide::Right); + + if (isInnerOrLeft(join_kind)) + right_set->setFiltering(left_set->getSet()); + + if (isInnerOrRight(join_kind)) + left_set->setFiltering(right_set->getSet()); + } + + add_sorting(query_plan, join_clause.key_names_left, JoinTableSide::Left); + add_sorting(*joined_plan, join_clause.key_names_right, JoinTableSide::Right); } QueryPlanStepPtr join_step = std::make_unique( @@ -1720,7 +1763,7 @@ static void executeMergeAggregatedImpl( * but it can work more slowly. 
*/ - Aggregator::Params params(keys, aggregates, overflow_row, settings.max_threads); + Aggregator::Params params(keys, aggregates, overflow_row, settings.max_threads, settings.max_block_size); auto merging_aggregated = std::make_unique( query_plan.getCurrentDataStream(), @@ -2316,6 +2359,7 @@ static Aggregator::Params getAggregatorParams( settings.min_free_disk_space_for_temporary_data, settings.compile_aggregate_expressions, settings.min_count_to_compile_aggregate_expression, + settings.max_block_size, /* only_merge */ false, stats_collecting_params }; diff --git a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp index 87a182e70ae..a679b17a5bd 100644 --- a/src/Interpreters/InterpreterSelectWithUnionQuery.cpp +++ b/src/Interpreters/InterpreterSelectWithUnionQuery.cpp @@ -317,13 +317,13 @@ void InterpreterSelectWithUnionQuery::buildQueryPlan(QueryPlan & query_plan) data_streams[i] = plans[i]->getCurrentDataStream(); } - auto max_threads = context->getSettingsRef().max_threads; + auto max_threads = settings.max_threads; auto union_step = std::make_unique(std::move(data_streams), max_threads); query_plan.unitePlans(std::move(union_step), std::move(plans)); const auto & query = query_ptr->as(); - if (query.union_mode == SelectUnionMode::DISTINCT) + if (query.union_mode == SelectUnionMode::UNION_DISTINCT) { /// Add distinct transform SizeLimits limits(settings.max_rows_in_distinct, settings.max_bytes_in_distinct, settings.distinct_overflow_mode); diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 106b1611f80..a6d88c7d28b 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -753,7 +753,7 @@ bool InterpreterSystemQuery::dropReplicaImpl(ASTSystemQuery & query, const Stora "if you want to clean the data and drop this replica", ErrorCodes::TABLE_WAS_NOT_DROPPED); /// NOTE it's not atomic: replica may become active after this check, but before dropReplica(...) - /// However, the main usecase is to drop dead replica, which cannot become active. + /// However, the main use case is to drop dead replica, which cannot become active. /// This check prevents only from accidental drop of some other replica. if (zookeeper->exists(status.zookeeper_path + "/replicas/" + query.replica + "/is_active")) throw Exception("Can't drop replica: " + query.replica + ", because it's active", diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index d3679aac891..c496995ba65 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -226,7 +226,7 @@ bool isStorageTouchedByMutations( ASTPtr select_query = prepareQueryAffectedAST(commands, storage, context_copy); /// Interpreter must be alive, when we use result of execute() method. - /// For some reason it may copy context and and give it into ExpressionTransform + /// For some reason it may copy context and give it into ExpressionTransform /// after that we will use context from destroyed stack frame in our stream. 
InterpreterSelectQuery interpreter( select_query, context_copy, storage, metadata_snapshot, SelectQueryOptions().ignoreLimits().ignoreProjections()); @@ -288,13 +288,17 @@ MutationsInterpreter::MutationsInterpreter( const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_) + bool can_execute_, + bool return_all_columns_, + bool return_deleted_rows_) : storage(std::move(storage_)) , metadata_snapshot(metadata_snapshot_) , commands(std::move(commands_)) , context(Context::createCopy(context_)) , can_execute(can_execute_) , select_limits(SelectQueryOptions().analyze(!can_execute).ignoreLimits().ignoreProjections()) + , return_all_columns(return_all_columns_) + , return_deleted_rows(return_deleted_rows_) { mutation_ast = prepare(!can_execute); } @@ -472,14 +476,21 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) /// First, break a sequence of commands into stages. for (auto & command : commands) { + // we can return deleted rows only if it's the only present command + assert(command.type == MutationCommand::DELETE || !return_deleted_rows); + if (command.type == MutationCommand::DELETE) { mutation_kind.set(MutationKind::MUTATE_OTHER); if (stages.empty() || !stages.back().column_to_updated.empty()) stages.emplace_back(context); - auto negated_predicate = makeASTFunction("isZeroOrNull", getPartitionAndPredicateExpressionForMutationCommand(command)); - stages.back().filters.push_back(negated_predicate); + auto predicate = getPartitionAndPredicateExpressionForMutationCommand(command); + + if (!return_deleted_rows) + predicate = makeASTFunction("isZeroOrNull", predicate); + + stages.back().filters.push_back(predicate); } else if (command.type == MutationCommand::UPDATE) { @@ -789,7 +800,7 @@ ASTPtr MutationsInterpreter::prepareInterpreterSelectQuery(std::vector & /// Next, for each stage calculate columns changed by this and previous stages. for (size_t i = 0; i < prepared_stages.size(); ++i) { - if (!prepared_stages[i].filters.empty()) + if (return_all_columns || !prepared_stages[i].filters.empty()) { for (const auto & column : all_columns) prepared_stages[i].output_columns.insert(column.name); diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 94525bf6b8c..336c5f11162 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -43,7 +43,9 @@ public: const StorageMetadataPtr & metadata_snapshot_, MutationCommands commands_, ContextPtr context_, - bool can_execute_); + bool can_execute_, + bool return_all_columns_ = false, + bool return_deleted_rows_ = false); void validate(); @@ -156,6 +158,12 @@ private: /// Columns, that we need to read for calculation of skip indices, projections or TTL expressions. 
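// A string-level sketch of the DELETE filter choice above: an ordinary DELETE mutation keeps the
// rows where the predicate is zero or NULL, while return_deleted_rows keeps the matching rows
// instead. Illustrative only; the real code builds an AST via makeASTFunction, not a string.
#include <string>
#include <iostream>

static std::string buildDeleteFilter(const std::string & predicate, bool return_deleted_rows)
{
    return return_deleted_rows ? predicate : "isZeroOrNull(" + predicate + ")";
}

int main()
{
    std::cout << buildDeleteFilter("age < 18", false) << '\n'; // rows that survive the DELETE
    std::cout << buildDeleteFilter("age < 18", true) << '\n';  // rows the DELETE removes
}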
ColumnDependencies dependencies; + + // whether all columns should be returned, not just updated + bool return_all_columns; + + // whether we should return deleted or nondeleted rows on DELETE mutation + bool return_deleted_rows; }; } diff --git a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.cpp b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.cpp index 364828e6126..40c42f7728e 100644 --- a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.cpp +++ b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.cpp @@ -59,22 +59,22 @@ void NormalizeSelectWithUnionQueryMatcher::visit(ASTSelectWithUnionQuery & ast, continue; /// Rewrite UNION Mode - if (union_modes[i] == SelectUnionMode::Unspecified) + if (union_modes[i] == SelectUnionMode::UNION_DEFAULT) { - if (data.union_default_mode == UnionMode::ALL) - union_modes[i] = SelectUnionMode::ALL; - else if (data.union_default_mode == UnionMode::DISTINCT) - union_modes[i] = SelectUnionMode::DISTINCT; + if (data.union_default_mode == SetOperationMode::ALL) + union_modes[i] = SelectUnionMode::UNION_ALL; + else if (data.union_default_mode == SetOperationMode::DISTINCT) + union_modes[i] = SelectUnionMode::UNION_DISTINCT; else throw Exception( "Expected ALL or DISTINCT in SelectWithUnion query, because setting (union_default_mode) is empty", DB::ErrorCodes::EXPECTED_ALL_OR_DISTINCT); } - if (union_modes[i] == SelectUnionMode::ALL) + if (union_modes[i] == SelectUnionMode::UNION_ALL) { if (auto * inner_union = select_list[i + 1]->as(); - inner_union && inner_union->union_mode == SelectUnionMode::ALL) + inner_union && inner_union->union_mode == SelectUnionMode::UNION_ALL) { /// Inner_union is an UNION ALL list, just lift up for (auto child = inner_union->list_of_selects->children.rbegin(); child != inner_union->list_of_selects->children.rend(); @@ -85,7 +85,7 @@ void NormalizeSelectWithUnionQueryMatcher::visit(ASTSelectWithUnionQuery & ast, selects.push_back(select_list[i + 1]); } /// flatten all left nodes and current node to a UNION DISTINCT list - else if (union_modes[i] == SelectUnionMode::DISTINCT) + else if (union_modes[i] == SelectUnionMode::UNION_DISTINCT) { auto distinct_list = std::make_shared(); distinct_list->list_of_selects = std::make_shared(); @@ -96,7 +96,7 @@ void NormalizeSelectWithUnionQueryMatcher::visit(ASTSelectWithUnionQuery & ast, getSelectsFromUnionListNode(select_list[j], distinct_list->list_of_selects->children); } - distinct_list->union_mode = SelectUnionMode::DISTINCT; + distinct_list->union_mode = SelectUnionMode::UNION_DISTINCT; distinct_list->is_normalized = true; selects.push_back(std::move(distinct_list)); distinct_found = true; @@ -113,7 +113,7 @@ void NormalizeSelectWithUnionQueryMatcher::visit(ASTSelectWithUnionQuery & ast, if (!distinct_found) { if (auto * inner_union = select_list[0]->as(); - inner_union && inner_union->union_mode == SelectUnionMode::ALL) + inner_union && inner_union->union_mode == SelectUnionMode::UNION_ALL) { /// Inner_union is an UNION ALL list, just lift it up for (auto child = inner_union->list_of_selects->children.rbegin(); child != inner_union->list_of_selects->children.rend(); @@ -136,7 +136,7 @@ void NormalizeSelectWithUnionQueryMatcher::visit(ASTSelectWithUnionQuery & ast, std::reverse(selects.begin(), selects.end()); ast.is_normalized = true; - ast.union_mode = SelectUnionMode::ALL; + ast.union_mode = SelectUnionMode::UNION_ALL; ast.set_of_modes = std::move(current_set_of_modes); ast.list_of_selects->children = std::move(selects); diff --git 
a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h index d035e90f5a5..e8194f0dfe1 100644 --- a/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h +++ b/src/Interpreters/NormalizeSelectWithUnionQueryVisitor.h @@ -18,7 +18,7 @@ class NormalizeSelectWithUnionQueryMatcher public: struct Data { - const UnionMode & union_default_mode; + const SetOperationMode union_default_mode; }; static void getSelectsFromUnionListNode(ASTPtr ast_select, ASTs & selects); diff --git a/src/Interpreters/OpenTelemetrySpanLog.cpp b/src/Interpreters/OpenTelemetrySpanLog.cpp index 2683a5f7955..57d5c11ad97 100644 --- a/src/Interpreters/OpenTelemetrySpanLog.cpp +++ b/src/Interpreters/OpenTelemetrySpanLog.cpp @@ -68,195 +68,5 @@ void OpenTelemetrySpanLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(attributes); } - -OpenTelemetrySpanHolder::OpenTelemetrySpanHolder(const std::string & _operation_name) -{ - trace_id = 0; - - if (!CurrentThread::isInitialized()) - { - // There may be no thread context if we're running inside the - // clickhouse-client, e.g. reading an external table provided with the - // `--external` option. - return; - } - - auto & thread = CurrentThread::get(); - - trace_id = thread.thread_trace_context.trace_id; - if (trace_id == UUID()) - return; - - parent_span_id = thread.thread_trace_context.span_id; - span_id = thread_local_rng(); - operation_name = _operation_name; - start_time_us = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); - - thread.thread_trace_context.span_id = span_id; } - -OpenTelemetrySpanHolder::~OpenTelemetrySpanHolder() -{ - try - { - if (trace_id == UUID()) - return; - - // First of all, return old value of current span. - auto & thread = CurrentThread::get(); - assert(thread.thread_trace_context.span_id == span_id); - thread.thread_trace_context.span_id = parent_span_id; - - // Not sure what's the best way to access the log from here. - auto * thread_group = CurrentThread::getGroup().get(); - // Not sure whether and when this can be null. - if (!thread_group) - return; - - ContextPtr context; - { - std::lock_guard lock(thread_group->mutex); - context = thread_group->query_context.lock(); - } - - if (!context) - { - // Both global and query contexts can be null when executing a - // background task, and global context can be null for some - // queries. - return; - } - - auto log = context->getOpenTelemetrySpanLog(); - if (!log) - { - // The log might be disabled. - return; - } - - finish_time_us = std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); - - log->add(OpenTelemetrySpanLogElement( - static_cast(*this))); - } - catch (...) 
- { - tryLogCurrentException(__FUNCTION__); - } -} - -void OpenTelemetrySpanHolder::addAttribute(const std::string& name, UInt64 value) -{ - if (trace_id == UUID()) - return; - - this->attributes.push_back(Tuple{name, toString(value)}); -} - -void OpenTelemetrySpanHolder::addAttribute(const std::string& name, const std::string& value) -{ - if (trace_id == UUID()) - return; - - this->attributes.push_back(Tuple{name, value}); -} - -void OpenTelemetrySpanHolder::addAttribute(const Exception & e) -{ - if (trace_id == UUID()) - return; - - this->attributes.push_back(Tuple{"clickhouse.exception", getExceptionMessage(e, false)}); -} - -void OpenTelemetrySpanHolder::addAttribute(std::exception_ptr e) -{ - if (trace_id == UUID() || e == nullptr) - return; - - this->attributes.push_back(Tuple{"clickhouse.exception", getExceptionMessage(e, false)}); -} - -bool OpenTelemetryTraceContext::parseTraceparentHeader(const std::string & traceparent, - std::string & error) -{ - trace_id = 0; - - // Version 00, which is the only one we can parse, is fixed width. Use this - // fact for an additional sanity check. - const int expected_length = strlen("xx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-xxxxxxxxxxxxxxxx-xx"); - if (traceparent.length() != expected_length) - { - error = fmt::format("unexpected length {}, expected {}", - traceparent.length(), expected_length); - return false; - } - - const char * data = traceparent.data(); - - uint8_t version = unhex2(data); - data += 2; - - if (version != 0) - { - error = fmt::format("unexpected version {}, expected 00", version); - return false; - } - - if (*data != '-') - { - error = fmt::format("Malformed traceparant header: {}", traceparent); - return false; - } - - ++data; - UInt64 trace_id_higher_64 = unhexUInt(data); - UInt64 trace_id_lower_64 = unhexUInt(data + 16); - data += 32; - - if (*data != '-') - { - error = fmt::format("Malformed traceparant header: {}", traceparent); - return false; - } - - ++data; - UInt64 span_id_64 = unhexUInt(data); - data += 16; - - if (*data != '-') - { - error = fmt::format("Malformed traceparant header: {}", traceparent); - return false; - } - - ++data; - this->trace_flags = unhex2(data); - - // store the 128-bit trace id in big-endian order - this->trace_id.toUnderType().items[0] = trace_id_higher_64; - this->trace_id.toUnderType().items[1] = trace_id_lower_64; - this->span_id = span_id_64; - return true; -} - - -std::string OpenTelemetryTraceContext::composeTraceparentHeader() const -{ - // This span is a parent for its children, so we specify this span_id as a - // parent id. - return fmt::format("00-{:016x}{:016x}-{:016x}-{:02x}", - // Output the trace id in network byte order - trace_id.toUnderType().items[0], - trace_id.toUnderType().items[1], - span_id, - // This cast is needed because fmt is being weird and complaining that - // "mixing character types is not allowed". - static_cast(trace_flags)); -} - - -} diff --git a/src/Interpreters/OpenTelemetrySpanLog.h b/src/Interpreters/OpenTelemetrySpanLog.h index 34f4765c8c4..e5a5b082284 100644 --- a/src/Interpreters/OpenTelemetrySpanLog.h +++ b/src/Interpreters/OpenTelemetrySpanLog.h @@ -7,24 +7,11 @@ namespace DB { -struct OpenTelemetrySpan -{ - UUID trace_id; - UInt64 span_id; - UInt64 parent_span_id; - std::string operation_name; - UInt64 start_time_us; - UInt64 finish_time_us; - Map attributes; - // I don't understand how Links work, namely, which direction should they - // point to, and how they are related with parent_span_id, so no Links for now. 
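// A minimal sketch of the fixed-width "traceparent" format handled by the code removed above:
// "00-<32 hex trace id>-<16 hex span id>-<2 hex flags>", 55 characters in total.
// Error reporting is reduced to a bool here; the real implementation also returns a message.
#include <string>
#include <cstdint>
#include <iostream>

struct ToyTraceContext
{
    std::string trace_id_hex; // 128-bit trace id kept as hex text for simplicity
    uint64_t span_id = 0;
    uint8_t trace_flags = 0;
};

static bool parseTraceparent(const std::string & header, ToyTraceContext & out)
{
    // version(2)-trace_id(32)-span_id(16)-flags(2), separated by '-': 55 characters.
    if (header.size() != 55 || header[2] != '-' || header[35] != '-' || header[52] != '-')
        return false;
    if (header.substr(0, 2) != "00") // only version 00 is understood
        return false;

    out.trace_id_hex = header.substr(3, 32);
    out.span_id = std::stoull(header.substr(36, 16), nullptr, 16);
    out.trace_flags = static_cast<uint8_t>(std::stoul(header.substr(53, 2), nullptr, 16));
    return true;
}

int main()
{
    ToyTraceContext ctx;
    if (parseTraceparent("00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01", ctx))
        std::cout << "trace " << ctx.trace_id_hex << ", span " << ctx.span_id
                  << ", sampled=" << int(ctx.trace_flags & 1) << '\n';
}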
-}; - -struct OpenTelemetrySpanLogElement : public OpenTelemetrySpan +struct OpenTelemetrySpanLogElement : public OpenTelemetry::Span { OpenTelemetrySpanLogElement() = default; - explicit OpenTelemetrySpanLogElement(const OpenTelemetrySpan & span) - : OpenTelemetrySpan(span) {} + OpenTelemetrySpanLogElement(const OpenTelemetry::Span & span) + : OpenTelemetry::Span(span) {} static std::string name() { return "OpenTelemetrySpanLog"; } static NamesAndTypesList getNamesAndTypes(); @@ -41,15 +28,4 @@ public: using SystemLog::SystemLog; }; -struct OpenTelemetrySpanHolder : public OpenTelemetrySpan -{ - explicit OpenTelemetrySpanHolder(const std::string & _operation_name); - void addAttribute(const std::string& name, UInt64 value); - void addAttribute(const std::string& name, const std::string& value); - void addAttribute(const Exception & e); - void addAttribute(std::exception_ptr e); - - ~OpenTelemetrySpanHolder(); -}; - } diff --git a/src/Interpreters/ReplaceQueryParameterVisitor.cpp b/src/Interpreters/ReplaceQueryParameterVisitor.cpp index 03de8aecc92..664cda74522 100644 --- a/src/Interpreters/ReplaceQueryParameterVisitor.cpp +++ b/src/Interpreters/ReplaceQueryParameterVisitor.cpp @@ -1,16 +1,17 @@ -#include -#include #include -#include #include +#include #include #include -#include -#include -#include #include #include #include +#include +#include +#include +#include +#include +#include namespace DB @@ -30,7 +31,12 @@ void ReplaceQueryParameterVisitor::visit(ASTPtr & ast) else if (ast->as() || ast->as()) visitIdentifier(ast); else - visitChildren(ast); + { + if (auto * describe_query = dynamic_cast(ast.get()); describe_query && describe_query->table_expression) + visitChildren(describe_query->table_expression); + else + visitChildren(ast); + } } diff --git a/src/Interpreters/RewriteCountDistinctVisitor.cpp b/src/Interpreters/RewriteCountDistinctVisitor.cpp index a7a26a63460..cf28d8abb87 100644 --- a/src/Interpreters/RewriteCountDistinctVisitor.cpp +++ b/src/Interpreters/RewriteCountDistinctVisitor.cpp @@ -62,7 +62,7 @@ void RewriteCountDistinctFunctionMatcher::visit(ASTPtr & ast, Data & /*data*/) auto expr = std::make_shared(); expr->children.emplace_back(cloned_select_query); auto select_with_union = std::make_shared(); - select_with_union->union_mode = SelectUnionMode::Unspecified; + select_with_union->union_mode = SelectUnionMode::UNION_DEFAULT; select_with_union->is_normalized = false; select_with_union->list_of_modes.clear(); select_with_union->set_of_modes.clear(); diff --git a/src/Interpreters/SelectIntersectExceptQueryVisitor.cpp b/src/Interpreters/SelectIntersectExceptQueryVisitor.cpp index 4d695263f26..756a8a48e25 100644 --- a/src/Interpreters/SelectIntersectExceptQueryVisitor.cpp +++ b/src/Interpreters/SelectIntersectExceptQueryVisitor.cpp @@ -2,6 +2,8 @@ #include #include #include +#include +#include namespace DB @@ -9,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int EXPECTED_ALL_OR_DISTINCT; } /* @@ -27,9 +30,9 @@ void SelectIntersectExceptQueryMatcher::visit(ASTPtr & ast, Data & data) visit(*select_union, data); } -void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Data &) +void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Data & data) { - const auto & union_modes = ast.list_of_modes; + auto union_modes = std::move(ast.list_of_modes); if (union_modes.empty()) return; @@ -46,14 +49,39 @@ void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Dat 
selects.pop_back(); SelectUnionModes modes; - for (const auto & mode : union_modes) + for (auto & mode : union_modes) { + /// Rewrite intersect / except mode + if (mode == SelectUnionMode::EXCEPT_DEFAULT) + { + if (data.except_default_mode == SetOperationMode::ALL) + mode = SelectUnionMode::EXCEPT_ALL; + else if (data.except_default_mode == SetOperationMode::DISTINCT) + mode = SelectUnionMode::EXCEPT_DISTINCT; + else + throw Exception( + "Expected ALL or DISTINCT in EXCEPT query, because setting (except_default_mode) is empty", + DB::ErrorCodes::EXPECTED_ALL_OR_DISTINCT); + } + else if (mode == SelectUnionMode::INTERSECT_DEFAULT) + { + if (data.intersect_default_mode == SetOperationMode::ALL) + mode = SelectUnionMode::INTERSECT_ALL; + else if (data.intersect_default_mode == SetOperationMode::DISTINCT) + mode = SelectUnionMode::INTERSECT_DISTINCT; + else + throw Exception( + "Expected ALL or DISTINCT in INTERSECT query, because setting (intersect_default_mode) is empty", + DB::ErrorCodes::EXPECTED_ALL_OR_DISTINCT); + } + switch (mode) { - case SelectUnionMode::EXCEPT: + case SelectUnionMode::EXCEPT_ALL: + case SelectUnionMode::EXCEPT_DISTINCT: { auto left = std::make_shared(); - left->union_mode = SelectUnionMode::ALL; + left->union_mode = mode == SelectUnionMode::EXCEPT_ALL ? SelectUnionMode::UNION_ALL : SelectUnionMode::UNION_DISTINCT; left->list_of_selects = std::make_shared(); left->children.push_back(left->list_of_selects); @@ -66,17 +94,22 @@ void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Dat selects.pop_back(); auto except_node = std::make_shared(); - except_node->final_operator = ASTSelectIntersectExceptQuery::Operator::EXCEPT; + except_node->final_operator = mode == SelectUnionMode::EXCEPT_ALL + ? ASTSelectIntersectExceptQuery::Operator::EXCEPT_ALL + : ASTSelectIntersectExceptQuery::Operator::EXCEPT_DISTINCT; except_node->children = {left, right}; children = {except_node}; break; } - case SelectUnionMode::INTERSECT: + case SelectUnionMode::INTERSECT_ALL: + case SelectUnionMode::INTERSECT_DISTINCT: { bool from_except = false; const auto * except_ast = typeid_cast(children.back().get()); - if (except_ast && (except_ast->final_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT)) + if (except_ast + && (except_ast->final_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT_ALL + || except_ast->final_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT_DISTINCT)) from_except = true; ASTPtr left; @@ -94,7 +127,9 @@ void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Dat selects.pop_back(); auto intersect_node = std::make_shared(); - intersect_node->final_operator = ASTSelectIntersectExceptQuery::Operator::INTERSECT; + intersect_node->final_operator = mode == SelectUnionMode::INTERSECT_ALL + ? 
ASTSelectIntersectExceptQuery::Operator::INTERSECT_ALL + : ASTSelectIntersectExceptQuery::Operator::INTERSECT_DISTINCT; intersect_node->children = {left, right}; if (from_except) @@ -122,7 +157,6 @@ void SelectIntersectExceptQueryMatcher::visit(ASTSelectWithUnionQuery & ast, Dat children.emplace_back(std::move(right)); } - ast.union_mode = SelectUnionMode::Unspecified; ast.list_of_selects->children = std::move(children); ast.list_of_modes = std::move(modes); } diff --git a/src/Interpreters/SelectIntersectExceptQueryVisitor.h b/src/Interpreters/SelectIntersectExceptQueryVisitor.h index 5aeb15f70bb..daf6d2ea0df 100644 --- a/src/Interpreters/SelectIntersectExceptQueryVisitor.h +++ b/src/Interpreters/SelectIntersectExceptQueryVisitor.h @@ -6,6 +6,7 @@ #include #include +#include namespace DB @@ -17,7 +18,11 @@ class ASTSelectWithUnionQuery; class SelectIntersectExceptQueryMatcher { public: - struct Data {}; + struct Data + { + const SetOperationMode intersect_default_mode; + const SetOperationMode except_default_mode; + }; static bool needChildVisit(const ASTPtr &, const ASTPtr &) { return true; } diff --git a/src/Interpreters/Session.h b/src/Interpreters/Session.h index 8de76349b7d..ed4f7809dee 100644 --- a/src/Interpreters/Session.h +++ b/src/Interpreters/Session.h @@ -65,6 +65,8 @@ public: ContextMutablePtr sessionContext() { return session_context; } ContextPtr sessionContext() const { return session_context; } + ContextPtr sessionOrGlobalContext() const { return session_context ? session_context : global_context; } + /// Makes a query context, can be used multiple times, with or without makeSession() called earlier. /// The query context will be created from a copy of a session context if it exists, or from a copy of /// a global context otherwise. In the latter case the function also assigns an user to this context. diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index af05b33c1f6..ded8b04a589 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -22,6 +22,8 @@ #include #include +#include + #include #include @@ -162,8 +164,16 @@ void Set::setHeader(const ColumnsWithTypeAndName & header) data.init(data.chooseMethod(key_columns, key_sizes)); } - bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns) +{ + Columns cols; + cols.reserve(columns.size()); + for (const auto & column : columns) + cols.emplace_back(column.column); + return insertFromBlock(cols); +} + +bool Set::insertFromBlock(const Columns & columns) { std::lock_guard lock(rwlock); @@ -179,11 +189,11 @@ bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns) /// Remember the columns we will work with for (size_t i = 0; i < keys_size; ++i) { - materialized_columns.emplace_back(columns.at(i).column->convertToFullIfNeeded()); + materialized_columns.emplace_back(columns.at(i)->convertToFullIfNeeded()); key_columns.emplace_back(materialized_columns.back().get()); } - size_t rows = columns.at(0).column->size(); + size_t rows = columns.at(0)->size(); /// We will insert to the Set only keys, where all components are not NULL. ConstNullMapPtr null_map{}; @@ -393,7 +403,13 @@ void Set::checkColumnsNumber(size_t num_key_columns) const bool Set::areTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) const { - return removeNullable(recursiveRemoveLowCardinality(data_types[set_type_idx]))->equals(*removeNullable(recursiveRemoveLowCardinality(other_type))); + /// Out-of-bound access can happen when same set expression built with different columns. 
+ /// Caller may call this method to make sure that the set is indeed the one they want + /// without awaring data_types.size(). + if (set_type_idx >= data_types.size()) + return false; + return removeNullable(recursiveRemoveLowCardinality(data_types[set_type_idx])) + ->equals(*removeNullable(recursiveRemoveLowCardinality(other_type))); } void Set::checkTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) const diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 6a3b28407ee..44f543ce222 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -20,6 +20,7 @@ class Context; class IFunctionBase; using FunctionBasePtr = std::shared_ptr; +class Chunk; /** Data structure for implementation of IN expression. */ @@ -45,11 +46,14 @@ public: void setHeader(const ColumnsWithTypeAndName & header); /// Returns false, if some limit was exceeded and no need to insert more data. + bool insertFromBlock(const Columns & columns); bool insertFromBlock(const ColumnsWithTypeAndName & columns); + /// Call after all blocks were inserted. To get the information that set is already created. void finishInsert() { is_created = true; } - bool isCreated() const { return is_created; } + /// finishInsert and isCreated are thread-safe + bool isCreated() const { return is_created.load(); } /** For columns of 'block', check belonging of corresponding rows to the set. * Return UInt8 column with the result. @@ -111,7 +115,7 @@ private: bool transform_null_in; /// Check if set contains all the data. - bool is_created = false; + std::atomic is_created = false; /// If in the left part columns contains the same types as the elements of the set. void executeOrdinary( diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index d0bf64fdebe..3835ef77deb 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -73,16 +73,32 @@ public: return key_names_right.size(); } - String formatDebug() const + String formatDebug(bool short_format = false) const { - return fmt::format("Left keys: [{}] Right keys [{}] Condition columns: '{}', '{}'", - fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "), - condColumnNames().first, condColumnNames().second); + const auto & [left_cond, right_cond] = condColumnNames(); + + if (short_format) + { + return fmt::format("({}) = ({}){}{}", fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "), + !left_cond.empty() ? " AND " + left_cond : "", !right_cond.empty() ? " AND " + right_cond : ""); + } + + return fmt::format( + "Left keys: [{}] Right keys [{}] Condition columns: '{}', '{}'", + fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "), left_cond, right_cond); } }; using Clauses = std::vector; + static std::string formatClauses(const Clauses & clauses, bool short_format = false) + { + std::vector res; + for (const auto & clause : clauses) + res.push_back("[" + clause.formatDebug(short_format) + "]"); + return fmt::format("{}", fmt::join(res, "; ")); + } + private: /** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k` * The join is made by column k. 
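// A minimal sketch of the short join-clause formatting added above, producing strings such as
// "[(k1, k2) = (k3, k4)]; [(a) = (b) AND cond]". It assumes the fmt library (fmt/ranges.h for
// fmt::join); the key and condition names are made up, and only one extra condition column is
// modelled here.
#include <string>
#include <vector>
#include <iostream>
#include <fmt/format.h>
#include <fmt/ranges.h>

struct ToyClause
{
    std::vector<std::string> left_keys;
    std::vector<std::string> right_keys;
    std::string left_condition; // empty if there is no extra condition column
};

static std::string formatClauseShort(const ToyClause & clause)
{
    return fmt::format("({}) = ({}){}",
        fmt::join(clause.left_keys, ", "),
        fmt::join(clause.right_keys, ", "),
        clause.left_condition.empty() ? "" : " AND " + clause.left_condition);
}

int main()
{
    std::vector<ToyClause> clauses{{{"k1", "k2"}, {"k3", "k4"}, ""}, {{"a"}, {"b"}, "cond"}};

    std::vector<std::string> parts;
    for (const auto & clause : clauses)
        parts.push_back("[" + formatClauseShort(clause) + "]");

    std::cout << fmt::format("{}", fmt::join(parts, "; ")) << '\n';
}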
diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 643b2fdf497..b1f5749da25 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -84,15 +84,6 @@ void ThreadStatus::attachQueryContext(ContextPtr query_context_) thread_group->global_context = global_context; } - // Generate new span for thread manually here, because we can't depend - // on OpenTelemetrySpanHolder due to link order issues. - // FIXME why and how is this different from setupState()? - thread_trace_context = query_context_->query_trace_context; - if (thread_trace_context.trace_id != UUID()) - { - thread_trace_context.span_id = thread_local_rng(); - } - applyQuerySettings(); } @@ -132,18 +123,6 @@ void ThreadStatus::setupState(const ThreadGroupStatusPtr & thread_group_) if (auto query_context_ptr = query_context.lock()) { applyQuerySettings(); - - // Generate new span for thread manually here, because we can't depend - // on OpenTelemetrySpanHolder due to link order issues. - thread_trace_context = query_context_ptr->query_trace_context; - if (thread_trace_context.trace_id != UUID()) - { - thread_trace_context.span_id = thread_local_rng(); - } - } - else - { - thread_trace_context.trace_id = 0; } initPerformanceCounters(); @@ -353,42 +332,6 @@ void ThreadStatus::detachQuery(bool exit_if_already_detached, bool thread_exits) assertState({ThreadState::AttachedToQuery}, __PRETTY_FUNCTION__); - std::shared_ptr opentelemetry_span_log; - auto query_context_ptr = query_context.lock(); - if (thread_trace_context.trace_id != UUID() && query_context_ptr) - { - opentelemetry_span_log = query_context_ptr->getOpenTelemetrySpanLog(); - } - - if (opentelemetry_span_log) - { - // Log the current thread span. - // We do this manually, because we can't use OpenTelemetrySpanHolder as a - // ThreadStatus member, because of linking issues. This file is linked - // separately, so we can reference OpenTelemetrySpanLog here, but if we had - // the span holder as a field, we would have to reference it in the - // destructor, which is in another library. - OpenTelemetrySpanLogElement span; - - span.trace_id = thread_trace_context.trace_id; - // All child span holders should be finished by the time we detach this - // thread, so the current span id should be the thread span id. If not, - // an assertion for a proper parent span in ~OpenTelemetrySpanHolder() - // is going to fail, because we're going to reset it to zero later in - // this function. 
- span.span_id = thread_trace_context.span_id; - assert(query_context_ptr); - span.parent_span_id = query_context_ptr->query_trace_context.span_id; - span.operation_name = getThreadName(); - span.start_time_us = query_start_time_microseconds; - span.finish_time_us = - std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()).count(); - span.attributes.push_back(Tuple{"clickhouse.thread_id", toString(thread_id)}); - - opentelemetry_span_log->add(span); - } - finalizeQueryProfiler(); finalizePerformanceCounters(); @@ -404,8 +347,6 @@ void ThreadStatus::detachQuery(bool exit_if_already_detached, bool thread_exits) query_id.clear(); query_context.reset(); - thread_trace_context.trace_id = 0; - thread_trace_context.span_id = 0; /// Avoid leaking of ThreadGroupStatus::finished_threads_counters_memory /// (this is in case someone uses system thread but did not call getProfileEventsCountersAndMemoryForThreads()) diff --git a/src/Interpreters/TreeCNFConverter.cpp b/src/Interpreters/TreeCNFConverter.cpp index 1f61c88ddd0..8812e90a5f0 100644 --- a/src/Interpreters/TreeCNFConverter.cpp +++ b/src/Interpreters/TreeCNFConverter.cpp @@ -349,7 +349,7 @@ CNFQuery & CNFQuery::pullNotOutFunctions() return *this; } -CNFQuery & CNFQuery::pushNotInFuntions() +CNFQuery & CNFQuery::pushNotInFunctions() { transformAtoms([](const AtomicFormula & atom) -> AtomicFormula { diff --git a/src/Interpreters/TreeCNFConverter.h b/src/Interpreters/TreeCNFConverter.h index a5d42e6b989..70c8990f74a 100644 --- a/src/Interpreters/TreeCNFConverter.h +++ b/src/Interpreters/TreeCNFConverter.h @@ -133,7 +133,7 @@ public: /// Converts != -> NOT =; <,>= -> (NOT) <; >,<= -> (NOT) <= for simpler matching CNFQuery & pullNotOutFunctions(); /// Revert pullNotOutFunctions actions - CNFQuery & pushNotInFuntions(); + CNFQuery & pushNotInFunctions(); /// (a OR b OR ...) AND (NOT a OR b OR ...) -> (b OR ...) CNFQuery & reduce(); diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 07c7cd85d1b..3f7e141db3e 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -154,7 +154,7 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context) continue; } } - /// don't optimise functions that shadow any of it's arguments, e.g.: + /// don't optimize functions that shadow any of it's arguments, e.g.: /// SELECT toString(dummy) as dummy FROM system.one GROUP BY dummy; if (!function->alias.empty()) { @@ -453,7 +453,7 @@ void optimizeMonotonousFunctionsInOrderBy(ASTSelectQuery * select_query, Context return; /// Do not apply optimization for Distributed and Merge storages, - /// because we can't get the sorting key of their undelying tables + /// because we can't get the sorting key of their underlying tables /// and we can break the matching of the sorting key for `read_in_order` /// optimization by removing monotonous functions from the prefix of key. 
if (result.is_remote_storage || (result.storage && result.storage->getName() == "Merge")) @@ -632,7 +632,7 @@ bool convertQueryToCNF(ASTSelectQuery * select_query) if (!cnf_form) return false; - cnf_form->pushNotInFuntions(); + cnf_form->pushNotInFunctions(); select_query->refWhere() = TreeCNFConverter::fromCNF(*cnf_form); return true; } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 9248e8eecb6..73410a39ffd 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -521,10 +521,15 @@ void removeUnneededColumnsFromSelectClause(ASTSelectQuery * select_query, const ++new_elements_size; } /// removing aggregation can change number of rows, so `count()` result in outer sub-query would be wrong - if (func && AggregateUtils::isAggregateFunction(*func) && !select_query->groupBy()) + if (func && !select_query->groupBy()) { - new_elements[result_index] = elem; - ++new_elements_size; + GetAggregatesVisitor::Data data = {}; + GetAggregatesVisitor(data).visit(elem); + if (!data.aggregates.empty()) + { + new_elements[result_index] = elem; + ++new_elements_size; + } } } } diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 16ff7f8b6c3..7954547c070 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -99,7 +99,7 @@ using TreeRewriterResultPtr = std::shared_ptr; /// Tree Rewriter in terms of CMU slides @sa https://15721.courses.cs.cmu.edu/spring2020/slides/19-optimizer1.pdf /// -/// Optimises AST tree and collect information for further expression analysis in ExpressionAnalyzer. +/// Optimizes AST tree and collect information for further expression analysis in ExpressionAnalyzer. /// Result AST has the following invariants: /// * all aliases are substituted /// * qualified names are translated diff --git a/src/Interpreters/WhereConstraintsOptimizer.cpp b/src/Interpreters/WhereConstraintsOptimizer.cpp index 83bdcfeb2e1..234b99167bb 100644 --- a/src/Interpreters/WhereConstraintsOptimizer.cpp +++ b/src/Interpreters/WhereConstraintsOptimizer.cpp @@ -170,7 +170,7 @@ void WhereConstraintsOptimizer::perform() return replaceTermsToConstants(atom, compare_graph); }) .reduce() - .pushNotInFuntions(); + .pushNotInFunctions(); if (optimize_append_index) AddIndexConstraintsOptimizer(metadata_snapshot).perform(cnf); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index cdddd28adeb..c501c1722ba 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -233,7 +233,7 @@ inline UInt64 time_in_seconds(std::chrono::time_point return std::chrono::duration_cast(timepoint.time_since_epoch()).count(); } -static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr context, UInt64 current_time_us, ASTPtr ast) +static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr context, UInt64 current_time_us, ASTPtr ast, const std::shared_ptr & query_span) { /// Exception before the query execution. 
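// A toy sketch of the query_span parameter added just above: the query-level span object is
// created once and shared with the error path, which only attaches exception attributes and
// finishes the span instead of assembling a log record by hand. ToySpan stands in for the real
// OpenTelemetry span holder; it is not the actual API.
#include <map>
#include <memory>
#include <string>
#include <iostream>

struct ToySpan
{
    std::map<std::string, std::string> attributes;

    void addAttribute(const std::string & name, const std::string & value) { attributes[name] = value; }

    void finish() const
    {
        for (const auto & [name, value] : attributes)
            std::cout << name << " = " << value << '\n';
    }
};

static void onExceptionBeforeStartSketch(const std::shared_ptr<ToySpan> & query_span, int code, const std::string & message)
{
    if (!query_span)
        return; // internal queries run without a span
    query_span->addAttribute("clickhouse.exception_code", std::to_string(code));
    query_span->addAttribute("clickhouse.exception", message);
    query_span->finish();
}

int main()
{
    auto query_span = std::make_shared<ToySpan>();
    onExceptionBeforeStartSketch(query_span, 62, "Syntax error");
}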
if (auto quota = context->getQuota()) @@ -291,29 +291,13 @@ static void onExceptionBeforeStart(const String & query_for_logging, ContextPtr if (auto query_log = context->getQueryLog()) query_log->add(elem); - if (auto opentelemetry_span_log = context->getOpenTelemetrySpanLog(); - context->query_trace_context.trace_id != UUID() - && opentelemetry_span_log) + if (query_span) { - OpenTelemetrySpanLogElement span; - span.trace_id = context->query_trace_context.trace_id; - span.span_id = context->query_trace_context.span_id; - span.parent_span_id = context->getClientInfo().client_trace_context.span_id; - span.operation_name = "query"; - span.start_time_us = current_time_us; - span.finish_time_us = time_in_microseconds(std::chrono::system_clock::now()); - span.attributes.reserve(6); - span.attributes.push_back(Tuple{"clickhouse.query_status", "ExceptionBeforeStart"}); - span.attributes.push_back(Tuple{"db.statement", elem.query}); - span.attributes.push_back(Tuple{"clickhouse.query_id", elem.client_info.current_query_id}); - span.attributes.push_back(Tuple{"clickhouse.exception", elem.exception}); - span.attributes.push_back(Tuple{"clickhouse.exception_code", toString(elem.exception_code)}); - if (!context->query_trace_context.tracestate.empty()) - { - span.attributes.push_back(Tuple{"clickhouse.tracestate", context->query_trace_context.tracestate}); - } - - opentelemetry_span_log->add(span); + query_span->addAttribute("clickhouse.exception_code", elem.exception_code); + query_span->addAttribute("clickhouse.exception", elem.exception); + query_span->addAttribute("db.statement", elem.query); + query_span->addAttribute("clickhouse.query_id", elem.client_info.current_query_id); + query_span->finish(); } ProfileEvents::increment(ProfileEvents::FailedQuery); @@ -364,6 +348,14 @@ static std::tuple executeQueryImpl( QueryProcessingStage::Enum stage, ReadBuffer * istr) { + /// query_span is a special span, when this function exits, it's lifetime is not ended, but ends when the query finishes. + /// Some internal queries might call this function recursively by setting 'internal' parameter to 'true', + /// to make sure SpanHolders in current stack ends in correct order, we disable this span for these internal queries + /// + /// This does not have impact on the final span logs, because these internal queries are issued by external queries, + /// we still have enough span logs for the execution of external queries. + std::shared_ptr query_span = internal ? 
nullptr : std::make_shared("query"); + const auto current_time = std::chrono::system_clock::now(); auto & client_info = context->getClientInfo(); @@ -465,7 +457,7 @@ static std::tuple executeQueryImpl( if (!internal) { - onExceptionBeforeStart(query_for_logging, context, time_in_microseconds(current_time), ast); + onExceptionBeforeStart(query_for_logging, context, time_in_microseconds(current_time), ast, query_span); } throw; @@ -521,13 +513,13 @@ static std::tuple executeQueryImpl( } { - SelectIntersectExceptQueryVisitor::Data data; + SelectIntersectExceptQueryVisitor::Data data{settings.intersect_default_mode, settings.except_default_mode}; SelectIntersectExceptQueryVisitor{data}.visit(ast); } { /// Normalize SelectWithUnionQuery - NormalizeSelectWithUnionQueryVisitor::Data data{context->getSettingsRef().union_default_mode}; + NormalizeSelectWithUnionQueryVisitor::Data data{settings.union_default_mode}; NormalizeSelectWithUnionQueryVisitor{data}.visit(ast); } @@ -684,12 +676,12 @@ static std::tuple executeQueryImpl( } { - std::unique_ptr span; - if (context->query_trace_context.trace_id != UUID()) + std::unique_ptr span; + if (OpenTelemetry::CurrentContext().isTraceEnabled()) { auto * raw_interpreter_ptr = interpreter.get(); std::string class_name(demangle(typeid(*raw_interpreter_ptr).name())); - span = std::make_unique(class_name + "::execute()"); + span = std::make_unique(class_name + "::execute()"); } res = interpreter->execute(); } @@ -841,7 +833,8 @@ static std::tuple executeQueryImpl( log_processors_profiles = settings.log_processors_profiles, status_info_to_query_log, implicit_txn_control, - pulling_pipeline = pipeline.pulling()](QueryPipeline & query_pipeline) mutable + pulling_pipeline = pipeline.pulling(), + query_span](QueryPipeline & query_pipeline) mutable { QueryStatus * process_list_elem = context->getProcessListElement(); @@ -944,28 +937,18 @@ static std::tuple executeQueryImpl( } } - if (auto opentelemetry_span_log = context->getOpenTelemetrySpanLog(); - context->query_trace_context.trace_id != UUID() - && opentelemetry_span_log) + if (query_span) { - OpenTelemetrySpanLogElement span; - span.trace_id = context->query_trace_context.trace_id; - span.span_id = context->query_trace_context.span_id; - span.parent_span_id = context->getClientInfo().client_trace_context.span_id; - span.operation_name = "query"; - span.start_time_us = elem.query_start_time_microseconds; - span.finish_time_us = time_in_microseconds(finish_time); - - span.attributes.reserve(4); - span.attributes.push_back(Tuple{"clickhouse.query_status", "QueryFinish"}); - span.attributes.push_back(Tuple{"db.statement", elem.query}); - span.attributes.push_back(Tuple{"clickhouse.query_id", elem.client_info.current_query_id}); - if (!context->query_trace_context.tracestate.empty()) - { - span.attributes.push_back(Tuple{"clickhouse.tracestate", context->query_trace_context.tracestate}); - } - - opentelemetry_span_log->add(span); + query_span->addAttribute("db.statement", elem.query); + query_span->addAttribute("clickhouse.query_id", elem.client_info.current_query_id); + query_span->addAttribute("clickhouse.query_status", "QueryFinish"); + query_span->addAttributeIfNotEmpty("clickhouse.tracestate", OpenTelemetry::CurrentContext().tracestate); + query_span->addAttributeIfNotZero("clickhouse.read_rows", elem.read_rows); + query_span->addAttributeIfNotZero("clickhouse.read_bytes", elem.read_bytes); + query_span->addAttributeIfNotZero("clickhouse.written_rows", info.written_rows); + 
query_span->addAttributeIfNotZero("clickhouse.written_bytes", elem.written_bytes); + query_span->addAttributeIfNotZero("clickhouse.memory_usage", elem.memory_usage); + query_span->finish(); } if (implicit_txn_control) @@ -993,7 +976,8 @@ static std::tuple executeQueryImpl( log_queries_min_query_duration_ms = settings.log_queries_min_query_duration_ms.totalMilliseconds(), quota(quota), status_info_to_query_log, - implicit_txn_control]() mutable + implicit_txn_control, + query_span]() mutable { if (implicit_txn_control) { @@ -1050,6 +1034,15 @@ static std::tuple executeQueryImpl( { ProfileEvents::increment(ProfileEvents::FailedInsertQuery); } + + if (query_span) + { + query_span->addAttribute("db.statement", elem.query); + query_span->addAttribute("clickhouse.query_id", elem.client_info.current_query_id); + query_span->addAttribute("clickhouse.exception", elem.exception); + query_span->addAttribute("clickhouse.exception_code", elem.exception_code); + query_span->finish(); + } }; res.finish_callback = std::move(finish_callback); @@ -1073,7 +1066,7 @@ static std::tuple executeQueryImpl( if (query_for_logging.empty()) query_for_logging = prepareQueryForLogging(query, context); - onExceptionBeforeStart(query_for_logging, context, time_in_microseconds(current_time), ast); + onExceptionBeforeStart(query_for_logging, context, time_in_microseconds(current_time), ast, query_span); } throw; diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index 1bde6fe5a8c..a4791690f4e 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -187,29 +188,56 @@ ActionsDAGPtr evaluateMissingDefaults( return createExpressions(header, expr_list, save_unneeded_columns, context); } -static bool arrayHasNoElementsRead(const IColumn & column) +static std::unordered_map collectOffsetsColumns( + const NamesAndTypesList & available_columns, const Columns & res_columns) { - const auto * column_array = typeid_cast(&column); + std::unordered_map offsets_columns; - if (!column_array) - return false; + auto available_column = available_columns.begin(); + for (size_t i = 0; i < available_columns.size(); ++i, ++available_column) + { + if (res_columns[i] == nullptr || isColumnConst(*res_columns[i])) + continue; - size_t size = column_array->size(); - if (!size) - return false; + auto serialization = IDataType::getSerialization(*available_column); + serialization->enumerateStreams([&](const auto & subpath) + { + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; - size_t data_size = column_array->getData().size(); - if (data_size) - return false; + auto stream_name = ISerialization::getFileNameForStream(*available_column, subpath); + const auto & current_offsets_column = subpath.back().data.column; - size_t last_offset = column_array->getOffsets()[size - 1]; - return last_offset != 0; + /// If for some reason multiple offsets columns are present + /// for the same nested data structure, choose the one that is not empty. 
+ if (current_offsets_column && !current_offsets_column->empty()) + { + auto & offsets_column = offsets_columns[stream_name]; + if (!offsets_column) + offsets_column = current_offsets_column; + + #ifndef NDEBUG + const auto & offsets_data = assert_cast(*offsets_column).getData(); + const auto & current_offsets_data = assert_cast(*current_offsets_column).getData(); + + if (offsets_data != current_offsets_data) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Found non-equal columns with offsets (sizes: {} and {}) for stream {}", + offsets_data.size(), current_offsets_data.size(), stream_name); + #endif + } + }, available_column->type, res_columns[i]); + } + + return offsets_columns; } void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, + const NameSet & partially_read_columns, StorageMetadataPtr metadata_snapshot) { size_t num_columns = requested_columns.size(); @@ -218,65 +246,79 @@ void fillMissingColumns( "Invalid number of columns passed to fillMissingColumns. Expected {}, got {}", num_columns, res_columns.size()); - /// For a missing column of a nested data structure we must create not a column of empty - /// arrays, but a column of arrays of correct length. + /// For a missing column of a nested data structure + /// we must create not a column of empty arrays, + /// but a column of arrays of correct length. /// First, collect offset columns for all arrays in the block. + auto offsets_columns = collectOffsetsColumns(available_columns, res_columns); - std::unordered_map offset_columns; + /// Insert default values only for columns without default expressions. auto requested_column = requested_columns.begin(); for (size_t i = 0; i < num_columns; ++i, ++requested_column) - { - if (res_columns[i] == nullptr) - continue; - - if (const auto * array = typeid_cast(res_columns[i].get())) - { - String offsets_name = Nested::extractTableName(requested_column->name); - auto & offsets_column = offset_columns[offsets_name]; - - /// If for some reason multiple offsets columns are present for the same nested data structure, - /// choose the one that is not empty. 
- if (!offsets_column || offsets_column->empty()) - offsets_column = array->getOffsetsPtr(); - } - } - - /// insert default values only for columns without default expressions - requested_column = requested_columns.begin(); - for (size_t i = 0; i < num_columns; ++i, ++requested_column) { const auto & [name, type] = *requested_column; - if (res_columns[i] && arrayHasNoElementsRead(*res_columns[i])) + if (res_columns[i] && partially_read_columns.contains(name)) res_columns[i] = nullptr; - if (res_columns[i] == nullptr) + if (res_columns[i]) + continue; + + if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) + continue; + + std::vector current_offsets; + size_t num_dimensions = 0; + + const auto * array_type = typeid_cast(type.get()); + if (array_type && !offsets_columns.empty()) { - if (metadata_snapshot && metadata_snapshot->getColumns().hasDefault(name)) - continue; + num_dimensions = getNumberOfDimensions(*array_type); + current_offsets.resize(num_dimensions); - String offsets_name = Nested::extractTableName(name); - auto offset_it = offset_columns.find(offsets_name); - const auto * array_type = typeid_cast(type.get()); - if (offset_it != offset_columns.end() && array_type) + auto serialization = IDataType::getSerialization(*requested_column); + serialization->enumerateStreams([&](const auto & subpath) { - const auto & nested_type = array_type->getNestedType(); - ColumnPtr offsets_column = offset_it->second; - size_t nested_rows = typeid_cast(*offsets_column).getData().back(); + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; - ColumnPtr nested_column = - nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst(); + size_t level = ISerialization::getArrayLevel(subpath); + assert(level < num_dimensions); - res_columns[i] = ColumnArray::create(nested_column, offsets_column); - } - else + auto stream_name = ISerialization::getFileNameForStream(*requested_column, subpath); + auto it = offsets_columns.find(stream_name); + if (it != offsets_columns.end()) + current_offsets[level] = it->second; + }); + + for (size_t j = 0; j < num_dimensions; ++j) { - /// We must turn a constant column into a full column because the interpreter could infer - /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. - res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); + if (!current_offsets[j]) + { + current_offsets.resize(j); + break; + } } } + + if (!current_offsets.empty()) + { + size_t num_empty_dimensions = num_dimensions - current_offsets.size(); + auto scalar_type = createArrayOfType(getBaseTypeOfArray(type), num_empty_dimensions); + + size_t data_size = assert_cast(*current_offsets.back()).getData().back(); + res_columns[i] = scalar_type->createColumnConstWithDefaultValue(data_size)->convertToFullColumnIfConst(); + + for (auto it = current_offsets.rbegin(); it != current_offsets.rend(); ++it) + res_columns[i] = ColumnArray::create(res_columns[i], *it); + } + else + { + /// We must turn a constant column into a full column because the interpreter could infer + /// that it is constant everywhere but in some blocks (from other parts) it can be a full column. 
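
To make the wrapping step concrete: once a shared sizes stream has been found, the missing Array column is built as defaults at the nested level and then wrapped with those offsets. A minimal single-level sketch follows (the code above additionally handles multidimensional arrays via getBaseTypeOfArray / createArrayOfType); buildMissingArrayColumn is a hypothetical helper, not part of the patch, and offsets_column is assumed to be the ColumnUInt64 sizes column collected by collectOffsetsColumns.

#include <Columns/ColumnArray.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeArray.h>

using namespace DB;

/// Hypothetical helper: build a column for a missing Array(T) whose sizes are known
/// from the offsets stream of a sibling subcolumn (e.g. another column of the same Nested).
ColumnPtr buildMissingArrayColumn(const DataTypePtr & array_type, const ColumnPtr & offsets_column)
{
    const auto & nested_type = assert_cast<const DataTypeArray &>(*array_type).getNestedType();

    /// The total number of nested values equals the last offset.
    size_t nested_rows = assert_cast<const ColumnUInt64 &>(*offsets_column).getData().back();

    /// Default values for the nested elements, materialized into a full column.
    ColumnPtr nested = nested_type->createColumnConstWithDefaultValue(nested_rows)->convertToFullColumnIfConst();

    /// Reuse the offsets that were actually read, so the array sizes stay consistent.
    return ColumnArray::create(nested, offsets_column);
}
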
+ res_columns[i] = type->createColumnConstWithDefaultValue(num_rows)->convertToFullColumnIfConst(); + } } } diff --git a/src/Interpreters/inplaceBlockConversions.h b/src/Interpreters/inplaceBlockConversions.h index b3113ddfa5c..bea44bf6db9 100644 --- a/src/Interpreters/inplaceBlockConversions.h +++ b/src/Interpreters/inplaceBlockConversions.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -43,6 +44,8 @@ void fillMissingColumns( Columns & res_columns, size_t num_rows, const NamesAndTypesList & requested_columns, + const NamesAndTypesList & available_columns, + const NameSet & partially_read_columns, StorageMetadataPtr metadata_snapshot); } diff --git a/src/Interpreters/tests/gtest_lru_file_cache.cpp b/src/Interpreters/tests/gtest_lru_file_cache.cpp index a3fe3bbcba5..6460eeef8c5 100644 --- a/src/Interpreters/tests/gtest_lru_file_cache.cpp +++ b/src/Interpreters/tests/gtest_lru_file_cache.cpp @@ -89,7 +89,6 @@ TEST(FileCache, get) { if (fs::exists(cache_base_path)) fs::remove_all(cache_base_path); - fs::create_directories(cache_base_path); DB::ThreadStatus thread_status; @@ -103,373 +102,376 @@ TEST(FileCache, get) DB::FileCacheSettings settings; settings.max_size = 30; settings.max_elements = 5; - auto cache = DB::FileCache(cache_base_path, settings); - cache.initialize(); - auto key = cache.hash("key1"); { - auto holder = cache.getOrSet(key, 0, 10, false); /// Add range [0, 9] - auto segments = fromHolder(holder); - /// Range was not present in cache. It should be added in cache as one while file segment. - ASSERT_EQ(segments.size(), 1); + auto cache = DB::FileCache(cache_base_path, settings); + cache.initialize(); + auto key = cache.hash("key1"); - assertRange(1, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::EMPTY); + { + auto holder = cache.getOrSet(key, 0, 10, false); /// Add range [0, 9] + auto segments = fromHolder(holder); + /// Range was not present in cache. It should be added in cache as one while file segment. + ASSERT_EQ(segments.size(), 1); - /// Exception because space not reserved. - /// EXPECT_THROW(download(segments[0]), DB::Exception); - /// Exception because space can be reserved only by downloader - /// EXPECT_THROW(segments[0]->reserve(segments[0]->range().size()), DB::Exception); + assertRange(1, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::EMPTY); - ASSERT_TRUE(segments[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(segments[0]->reserve(segments[0]->range().size())); - assertRange(2, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADING); + /// Exception because space not reserved. 
+ /// EXPECT_THROW(download(segments[0]), DB::Exception); + /// Exception because space can be reserved only by downloader + /// EXPECT_THROW(segments[0]->reserve(segments[0]->range().size()), DB::Exception); - download(segments[0]); - segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); - assertRange(3, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); - } + ASSERT_TRUE(segments[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(segments[0]->reserve(segments[0]->range().size())); + assertRange(2, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADING); - /// Current cache: [__________] - /// ^ ^ - /// 0 9 - ASSERT_EQ(cache.getFileSegmentsNum(), 1); - ASSERT_EQ(cache.getUsedCacheSize(), 10); + download(segments[0]); + segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); + assertRange(3, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); + } - { - /// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache. - auto holder = cache.getOrSet(key, 5, 10, false); - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 2); + /// Current cache: [__________] + /// ^ ^ + /// 0 9 + ASSERT_EQ(cache.getFileSegmentsNum(), 1); + ASSERT_EQ(cache.getUsedCacheSize(), 10); - assertRange(4, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); - assertRange(5, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::EMPTY); + { + /// Want range [5, 14], but [0, 9] already in cache, so only [10, 14] will be put in cache. + auto holder = cache.getOrSet(key, 5, 10, false); + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 2); - ASSERT_TRUE(segments[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(segments[1]); - segments[1]->completeWithState(DB::FileSegment::State::DOWNLOADED); - assertRange(6, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - } + assertRange(4, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); + assertRange(5, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::EMPTY); - /// Current cache: [__________][_____] - /// ^ ^^ ^ - /// 0 910 14 - ASSERT_EQ(cache.getFileSegmentsNum(), 2); - ASSERT_EQ(cache.getUsedCacheSize(), 15); + ASSERT_TRUE(segments[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(segments[1]); + segments[1]->completeWithState(DB::FileSegment::State::DOWNLOADED); + assertRange(6, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); + } - { - auto holder = cache.getOrSet(key, 9, 1, false); /// Get [9, 9] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assertRange(7, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); - } + /// Current cache: [__________][_____] + /// ^ ^^ ^ + /// 0 910 14 + ASSERT_EQ(cache.getFileSegmentsNum(), 2); + ASSERT_EQ(cache.getUsedCacheSize(), 15); - { - auto holder = cache.getOrSet(key, 9, 2, false); /// Get [9, 10] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 2); - assertRange(8, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); - assertRange(9, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - } + { + auto holder = cache.getOrSet(key, 9, 1, false); /// Get [9, 9] + auto segments = fromHolder(holder); + 
ASSERT_EQ(segments.size(), 1); + assertRange(7, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); + } - { - auto holder = cache.getOrSet(key, 10, 1, false); /// Get [10, 10] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assertRange(10, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - } + { + auto holder = cache.getOrSet(key, 9, 2, false); /// Get [9, 10] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 2); + assertRange(8, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); + assertRange(9, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); + } - complete(cache.getOrSet(key, 17, 4, false)); /// Get [17, 20] - complete(cache.getOrSet(key, 24, 3, false)); /// Get [24, 26] - /// complete(cache.getOrSet(key, 27, 1, false)); /// Get [27, 27] + { + auto holder = cache.getOrSet(key, 10, 1, false); /// Get [10, 10] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 1); + assertRange(10, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); + } - /// Current cache: [__________][_____] [____] [___][] - /// ^ ^^ ^ ^ ^ ^ ^^^ - /// 0 910 14 17 20 24 2627 - /// - ASSERT_EQ(cache.getFileSegmentsNum(), 4); - ASSERT_EQ(cache.getUsedCacheSize(), 22); + complete(cache.getOrSet(key, 17, 4, false)); /// Get [17, 20] + complete(cache.getOrSet(key, 24, 3, false)); /// Get [24, 26] + /// complete(cache.getOrSet(key, 27, 1, false)); /// Get [27, 27] - { - auto holder = cache.getOrSet(key, 0, 26, false); /// Get [0, 25] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 6); - - assertRange(11, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); - assertRange(12, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - - /// Missing [15, 16] should be added in cache. - assertRange(13, segments[2], DB::FileSegment::Range(15, 16), DB::FileSegment::State::EMPTY); - - ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(segments[2]); - - segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); - - assertRange(14, segments[3], DB::FileSegment::Range(17, 20), DB::FileSegment::State::DOWNLOADED); - - /// New [21, 23], but will not be added in cache because of elements limit (5) - assertRange(15, segments[4], DB::FileSegment::Range(21, 23), DB::FileSegment::State::EMPTY); - ASSERT_TRUE(segments[4]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_FALSE(segments[4]->reserve(1)); - - assertRange(16, segments[5], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - - /// Current cache: [__________][_____][ ][____] [___] - /// ^ ^ ^ - /// 0 20 24 + /// Current cache: [__________][_____] [____] [___][] + /// ^ ^^ ^ ^ ^ ^ ^^^ + /// 0 910 14 17 20 24 2627 /// + ASSERT_EQ(cache.getFileSegmentsNum(), 4); + ASSERT_EQ(cache.getUsedCacheSize(), 22); - /// Range [27, 27] must be evicted in previous getOrSet [0, 25]. - /// Let's not invalidate pointers to returned segments from range [0, 25] and - /// as max elements size is reached, next attempt to put something in cache should fail. - /// This will also check that [27, 27] was indeed evicted. 
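
The assertions in this test repeatedly exercise the same per-segment protocol: become the downloader, reserve space, write the data, then mark the segment DOWNLOADED. A condensed sketch of that sequence is shown below; it reuses the fromHolder and download helpers already defined in this test file, and downloadRange itself is illustrative only.

/// Illustrative only: the per-segment download protocol this test keeps repeating.
static void downloadRange(DB::FileCache & cache, const auto & key, size_t offset, size_t size)
{
    auto holder = cache.getOrSet(key, offset, size, false);
    auto segments = fromHolder(holder);

    for (auto & segment : segments)
    {
        /// Only EMPTY segments need downloading; the rest are cached or owned by another caller.
        if (segment->state() != DB::FileSegment::State::EMPTY)
            continue;

        /// Exactly one caller becomes the downloader of a segment.
        if (segment->getOrSetDownloader() != DB::FileSegment::getCallerId())
            continue;

        /// Space must be reserved before writing; this fails when the cache is full.
        if (!segment->reserve(segment->range().size()))
            continue;

        download(segment);
        segment->completeWithState(DB::FileSegment::State::DOWNLOADED);
    }
}
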
+ { + auto holder = cache.getOrSet(key, 0, 26, false); /// Get [0, 25] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 6); - auto holder1 = cache.getOrSet(key, 27, 1, false); - auto segments_1 = fromHolder(holder1); /// Get [27, 27] - ASSERT_EQ(segments_1.size(), 1); - assertRange(17, segments_1[0], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); - } + assertRange(11, segments[0], DB::FileSegment::Range(0, 9), DB::FileSegment::State::DOWNLOADED); + assertRange(12, segments[1], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - { - auto holder = cache.getOrSet(key, 12, 10, false); /// Get [12, 21] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 4); + /// Missing [15, 16] should be added in cache. + assertRange(13, segments[2], DB::FileSegment::Range(15, 16), DB::FileSegment::State::EMPTY); - assertRange(18, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); - assertRange(19, segments[1], DB::FileSegment::Range(15, 16), DB::FileSegment::State::DOWNLOADED); - assertRange(20, segments[2], DB::FileSegment::Range(17, 20), DB::FileSegment::State::DOWNLOADED); + ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(segments[2]); - assertRange(21, segments[3], DB::FileSegment::Range(21, 21), DB::FileSegment::State::EMPTY); + segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); - ASSERT_TRUE(segments[3]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(segments[3]); + assertRange(14, segments[3], DB::FileSegment::Range(17, 20), DB::FileSegment::State::DOWNLOADED); - segments[3]->completeWithState(DB::FileSegment::State::DOWNLOADED); - ASSERT_TRUE(segments[3]->state() == DB::FileSegment::State::DOWNLOADED); - } + /// New [21, 23], but will not be added in cache because of elements limit (5) + assertRange(15, segments[4], DB::FileSegment::Range(21, 23), DB::FileSegment::State::EMPTY); + ASSERT_TRUE(segments[4]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_FALSE(segments[4]->reserve(1)); - /// Current cache: [_____][__][____][_] [___] - /// ^ ^ ^ ^ ^ - /// 10 17 21 24 26 + assertRange(16, segments[5], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - ASSERT_EQ(cache.getFileSegmentsNum(), 5); + /// Current cache: [__________][_____][ ][____] [___] + /// ^ ^ ^ + /// 0 20 24 + /// - { - auto holder = cache.getOrSet(key, 23, 5, false); /// Get [23, 28] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 3); + /// Range [27, 27] must be evicted in previous getOrSet [0, 25]. + /// Let's not invalidate pointers to returned segments from range [0, 25] and + /// as max elements size is reached, next attempt to put something in cache should fail. + /// This will also check that [27, 27] was indeed evicted. 
- assertRange(22, segments[0], DB::FileSegment::Range(23, 23), DB::FileSegment::State::EMPTY); - assertRange(23, segments[1], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - assertRange(24, segments[2], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); + auto holder1 = cache.getOrSet(key, 27, 1, false); + auto segments_1 = fromHolder(holder1); /// Get [27, 27] + ASSERT_EQ(segments_1.size(), 1); + assertRange(17, segments_1[0], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); + } - ASSERT_TRUE(segments[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(segments[0]); - prepareAndDownload(segments[2]); - segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); - segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); - } + { + auto holder = cache.getOrSet(key, 12, 10, false); /// Get [12, 21] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 4); - /// Current cache: [____][_] [][___][__] - /// ^ ^ ^^^ ^^ ^ - /// 17 21 2324 26 28 + assertRange(18, segments[0], DB::FileSegment::Range(10, 14), DB::FileSegment::State::DOWNLOADED); + assertRange(19, segments[1], DB::FileSegment::Range(15, 16), DB::FileSegment::State::DOWNLOADED); + assertRange(20, segments[2], DB::FileSegment::Range(17, 20), DB::FileSegment::State::DOWNLOADED); - { - auto holder5 = cache.getOrSet(key, 2, 3,false); /// Get [2, 4] - auto s5 = fromHolder(holder5); - ASSERT_EQ(s5.size(), 1); - assertRange(25, s5[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::EMPTY); + assertRange(21, segments[3], DB::FileSegment::Range(21, 21), DB::FileSegment::State::EMPTY); - auto holder1 = cache.getOrSet(key, 30, 2, false); /// Get [30, 31] - auto s1 = fromHolder(holder1); - ASSERT_EQ(s1.size(), 1); - assertRange(26, s1[0], DB::FileSegment::Range(30, 31), DB::FileSegment::State::EMPTY); + ASSERT_TRUE(segments[3]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(segments[3]); - ASSERT_TRUE(s5[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(s1[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(s5[0]); - prepareAndDownload(s1[0]); - s5[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); - s1[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); + segments[3]->completeWithState(DB::FileSegment::State::DOWNLOADED); + ASSERT_TRUE(segments[3]->state() == DB::FileSegment::State::DOWNLOADED); + } + + /// Current cache: [_____][__][____][_] [___] + /// ^ ^ ^ ^ ^ + /// 10 17 21 24 26 + + ASSERT_EQ(cache.getFileSegmentsNum(), 5); + + { + auto holder = cache.getOrSet(key, 23, 5, false); /// Get [23, 28] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 3); + + assertRange(22, segments[0], DB::FileSegment::Range(23, 23), DB::FileSegment::State::EMPTY); + assertRange(23, segments[1], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); + assertRange(24, segments[2], DB::FileSegment::Range(27, 27), DB::FileSegment::State::EMPTY); + + ASSERT_TRUE(segments[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(segments[0]); + prepareAndDownload(segments[2]); + segments[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); + segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); + } + + /// 
Current cache: [____][_] [][___][__] + /// ^ ^ ^^^ ^^ ^ + /// 17 21 2324 26 28 + + { + auto holder5 = cache.getOrSet(key, 2, 3,false); /// Get [2, 4] + auto s5 = fromHolder(holder5); + ASSERT_EQ(s5.size(), 1); + assertRange(25, s5[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::EMPTY); + + auto holder1 = cache.getOrSet(key, 30, 2, false); /// Get [30, 31] + auto s1 = fromHolder(holder1); + ASSERT_EQ(s1.size(), 1); + assertRange(26, s1[0], DB::FileSegment::Range(30, 31), DB::FileSegment::State::EMPTY); + + ASSERT_TRUE(s5[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(s1[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(s5[0]); + prepareAndDownload(s1[0]); + s5[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); + s1[0]->completeWithState(DB::FileSegment::State::DOWNLOADED); + + /// Current cache: [___] [_][___][_] [__] + /// ^ ^ ^ ^ ^ ^ ^ ^ + /// 2 4 23 24 26 27 30 31 + + auto holder2 = cache.getOrSet(key, 23, 1, false); /// Get [23, 23] + auto s2 = fromHolder(holder2); + ASSERT_EQ(s2.size(), 1); + + auto holder3 = cache.getOrSet(key, 24, 3, false); /// Get [24, 26] + auto s3 = fromHolder(holder3); + ASSERT_EQ(s3.size(), 1); + + auto holder4 = cache.getOrSet(key, 27, 1, false); /// Get [27, 27] + auto s4 = fromHolder(holder4); + ASSERT_EQ(s4.size(), 1); + + /// All cache is now unreleasable because pointers are still hold + auto holder6 = cache.getOrSet(key, 0, 40, false); + auto f = fromHolder(holder6); + ASSERT_EQ(f.size(), 9); + + assertRange(27, f[0], DB::FileSegment::Range(0, 1), DB::FileSegment::State::EMPTY); + assertRange(28, f[2], DB::FileSegment::Range(5, 22), DB::FileSegment::State::EMPTY); + assertRange(29, f[6], DB::FileSegment::Range(28, 29), DB::FileSegment::State::EMPTY); + assertRange(30, f[8], DB::FileSegment::Range(32, 39), DB::FileSegment::State::EMPTY); + + ASSERT_TRUE(f[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(f[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(f[6]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(f[8]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + + ASSERT_FALSE(f[0]->reserve(1)); + ASSERT_FALSE(f[2]->reserve(1)); + ASSERT_FALSE(f[6]->reserve(1)); + ASSERT_FALSE(f[8]->reserve(1)); + } + + { + auto holder = cache.getOrSet(key, 2, 3, false); /// Get [2, 4] + auto segments = fromHolder(holder); + ASSERT_EQ(segments.size(), 1); + assertRange(31, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); + } /// Current cache: [___] [_][___][_] [__] /// ^ ^ ^ ^ ^ ^ ^ ^ /// 2 4 23 24 26 27 30 31 - auto holder2 = cache.getOrSet(key, 23, 1, false); /// Get [23, 23] - auto s2 = fromHolder(holder2); - ASSERT_EQ(s2.size(), 1); - - auto holder3 = cache.getOrSet(key, 24, 3, false); /// Get [24, 26] - auto s3 = fromHolder(holder3); - ASSERT_EQ(s3.size(), 1); - - auto holder4 = cache.getOrSet(key, 27, 1, false); /// Get [27, 27] - auto s4 = fromHolder(holder4); - ASSERT_EQ(s4.size(), 1); - - /// All cache is now unreleasable because pointers are still hold - auto holder6 = cache.getOrSet(key, 0, 40, false); - auto f = fromHolder(holder6); - ASSERT_EQ(f.size(), 9); - - assertRange(27, f[0], DB::FileSegment::Range(0, 1), DB::FileSegment::State::EMPTY); - assertRange(28, f[2], DB::FileSegment::Range(5, 22), DB::FileSegment::State::EMPTY); - assertRange(29, f[6], DB::FileSegment::Range(28, 29), DB::FileSegment::State::EMPTY); - assertRange(30, f[8], DB::FileSegment::Range(32, 39), 
DB::FileSegment::State::EMPTY); - - ASSERT_TRUE(f[0]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(f[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(f[6]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(f[8]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - - ASSERT_FALSE(f[0]->reserve(1)); - ASSERT_FALSE(f[2]->reserve(1)); - ASSERT_FALSE(f[6]->reserve(1)); - ASSERT_FALSE(f[8]->reserve(1)); - } - - { - auto holder = cache.getOrSet(key, 2, 3, false); /// Get [2, 4] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 1); - assertRange(31, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); - } - - /// Current cache: [___] [_][___][_] [__] - /// ^ ^ ^ ^ ^ ^ ^ ^ - /// 2 4 23 24 26 27 30 31 - - { - auto holder = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] - auto segments = fromHolder(holder); - ASSERT_EQ(segments.size(), 3); - - assertRange(32, segments[0], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - assertRange(33, segments[1], DB::FileSegment::Range(27, 27), DB::FileSegment::State::DOWNLOADED); - - assertRange(34, segments[2], DB::FileSegment::Range(28, 29), DB::FileSegment::State::EMPTY); - ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADING); - - bool lets_start_download = false; - std::mutex mutex; - std::condition_variable cv; - - std::thread other_1([&] { - DB::ThreadStatus thread_status_1; - auto query_context_1 = DB::Context::createCopy(getContext().context); - query_context_1->makeQueryContext(); - query_context_1->setCurrentQueryId("query_id_1"); - DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); - thread_status_1.attachQueryContext(query_context_1); - - auto holder_2 = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] once again. 
- auto segments_2 = fromHolder(holder_2); + auto holder = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] + auto segments = fromHolder(holder); ASSERT_EQ(segments.size(), 3); - assertRange(35, segments_2[0], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - assertRange(36, segments_2[1], DB::FileSegment::Range(27, 27), DB::FileSegment::State::DOWNLOADED); - assertRange(37, segments_2[2], DB::FileSegment::Range(28, 29), DB::FileSegment::State::DOWNLOADING); + assertRange(32, segments[0], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); + assertRange(33, segments[1], DB::FileSegment::Range(27, 27), DB::FileSegment::State::DOWNLOADED); - ASSERT_TRUE(segments[2]->getOrSetDownloader() != DB::FileSegment::getCallerId()); + assertRange(34, segments[2], DB::FileSegment::Range(28, 29), DB::FileSegment::State::EMPTY); + ASSERT_TRUE(segments[2]->getOrSetDownloader() == DB::FileSegment::getCallerId()); ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADING); + bool lets_start_download = false; + std::mutex mutex; + std::condition_variable cv; + + std::thread other_1([&] { - std::lock_guard lock(mutex); - lets_start_download = true; - } - cv.notify_one(); + DB::ThreadStatus thread_status_1; + auto query_context_1 = DB::Context::createCopy(getContext().context); + query_context_1->makeQueryContext(); + query_context_1->setCurrentQueryId("query_id_1"); + DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); + thread_status_1.attachQueryContext(query_context_1); - segments_2[2]->wait(); - ASSERT_TRUE(segments_2[2]->state() == DB::FileSegment::State::DOWNLOADED); - }); + auto holder_2 = cache.getOrSet(key, 25, 5, false); /// Get [25, 29] once again. + auto segments_2 = fromHolder(holder_2); + ASSERT_EQ(segments.size(), 3); - { - std::unique_lock lock(mutex); - cv.wait(lock, [&]{ return lets_start_download; }); - } + assertRange(35, segments_2[0], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); + assertRange(36, segments_2[1], DB::FileSegment::Range(27, 27), DB::FileSegment::State::DOWNLOADED); + assertRange(37, segments_2[2], DB::FileSegment::Range(28, 29), DB::FileSegment::State::DOWNLOADING); - prepareAndDownload(segments[2]); - segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); - ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADED); + ASSERT_TRUE(segments[2]->getOrSetDownloader() != DB::FileSegment::getCallerId()); + ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADING); - other_1.join(); - } + { + std::lock_guard lock(mutex); + lets_start_download = true; + } + cv.notify_one(); - /// Current cache: [___] [___][_][__][__] - /// ^ ^ ^ ^ ^^ ^^ ^ - /// 2 4 24 26 27 2930 31 - - { - /// Now let's check the similar case but getting ERROR state after segment->wait(), when - /// state is changed not manually via segment->complete(state) but from destructor of holder - /// and notify_all() is also called from destructor of holder. 
- - std::optional holder; - holder.emplace(cache.getOrSet(key, 3, 23, false)); /// Get [3, 25] - - auto segments = fromHolder(*holder); - ASSERT_EQ(segments.size(), 3); - - assertRange(38, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); - - assertRange(39, segments[1], DB::FileSegment::Range(5, 23), DB::FileSegment::State::EMPTY); - ASSERT_TRUE(segments[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - ASSERT_TRUE(segments[1]->state() == DB::FileSegment::State::DOWNLOADING); - - assertRange(40, segments[2], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - - bool lets_start_download = false; - std::mutex mutex; - std::condition_variable cv; - - std::thread other_1([&] - { - DB::ThreadStatus thread_status_1; - auto query_context_1 = DB::Context::createCopy(getContext().context); - query_context_1->makeQueryContext(); - query_context_1->setCurrentQueryId("query_id_1"); - DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); - thread_status_1.attachQueryContext(query_context_1); - - auto holder_2 = cache.getOrSet(key, 3, 23, false); /// Get [3, 25] once again - auto segments_2 = fromHolder(*holder); - ASSERT_EQ(segments_2.size(), 3); - - assertRange(41, segments_2[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); - assertRange(42, segments_2[1], DB::FileSegment::Range(5, 23), DB::FileSegment::State::DOWNLOADING); - assertRange(43, segments_2[2], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); - - ASSERT_TRUE(segments_2[1]->getDownloader() != DB::FileSegment::getCallerId()); - ASSERT_TRUE(segments_2[1]->state() == DB::FileSegment::State::DOWNLOADING); + segments_2[2]->wait(); + ASSERT_TRUE(segments_2[2]->state() == DB::FileSegment::State::DOWNLOADED); + }); { - std::lock_guard lock(mutex); - lets_start_download = true; + std::unique_lock lock(mutex); + cv.wait(lock, [&]{ return lets_start_download; }); } - cv.notify_one(); - segments_2[1]->wait(); - printRanges(segments_2); - ASSERT_TRUE(segments_2[1]->state() == DB::FileSegment::State::PARTIALLY_DOWNLOADED); + prepareAndDownload(segments[2]); + segments[2]->completeWithState(DB::FileSegment::State::DOWNLOADED); + ASSERT_TRUE(segments[2]->state() == DB::FileSegment::State::DOWNLOADED); - ASSERT_TRUE(segments_2[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); - prepareAndDownload(segments_2[1]); - segments_2[1]->completeWithState(DB::FileSegment::State::DOWNLOADED); - }); - - { - std::unique_lock lock(mutex); - cv.wait(lock, [&]{ return lets_start_download; }); + other_1.join(); } - holder.reset(); - other_1.join(); - printRanges(segments); - ASSERT_TRUE(segments[1]->state() == DB::FileSegment::State::DOWNLOADED); + /// Current cache: [___] [___][_][__][__] + /// ^ ^ ^ ^ ^^ ^^ ^ + /// 2 4 24 26 27 2930 31 + + { + /// Now let's check the similar case but getting ERROR state after segment->wait(), when + /// state is changed not manually via segment->complete(state) but from destructor of holder + /// and notify_all() is also called from destructor of holder. 
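
A compact sketch of the hand-off these two threaded scenarios exercise: a second thread waits on the segment, and if the original downloader gives up (its holder is destroyed, leaving the segment PARTIALLY_DOWNLOADED), the waiter takes over as the new downloader. waitOrTakeOver is illustrative only; prepareAndDownload is the helper already defined in this file.

/// Illustrative only: wait for another downloader and take over if it gave up.
void waitOrTakeOver(const auto & segment)
{
    if (segment->getOrSetDownloader() == DB::FileSegment::getCallerId())
    {
        /// We are the downloader ourselves.
        prepareAndDownload(segment);
        segment->completeWithState(DB::FileSegment::State::DOWNLOADED);
        return;
    }

    /// Someone else is downloading: block until its state changes.
    segment->wait();

    if (segment->state() == DB::FileSegment::State::PARTIALLY_DOWNLOADED
        && segment->getOrSetDownloader() == DB::FileSegment::getCallerId())
    {
        /// The previous downloader abandoned the segment; finish the job here.
        prepareAndDownload(segment);
        segment->completeWithState(DB::FileSegment::State::DOWNLOADED);
    }
}
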
+ + std::optional holder; + holder.emplace(cache.getOrSet(key, 3, 23, false)); /// Get [3, 25] + + auto segments = fromHolder(*holder); + ASSERT_EQ(segments.size(), 3); + + assertRange(38, segments[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); + + assertRange(39, segments[1], DB::FileSegment::Range(5, 23), DB::FileSegment::State::EMPTY); + ASSERT_TRUE(segments[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + ASSERT_TRUE(segments[1]->state() == DB::FileSegment::State::DOWNLOADING); + + assertRange(40, segments[2], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); + + bool lets_start_download = false; + std::mutex mutex; + std::condition_variable cv; + + std::thread other_1([&] + { + DB::ThreadStatus thread_status_1; + auto query_context_1 = DB::Context::createCopy(getContext().context); + query_context_1->makeQueryContext(); + query_context_1->setCurrentQueryId("query_id_1"); + DB::CurrentThread::QueryScope query_scope_holder_1(query_context_1); + thread_status_1.attachQueryContext(query_context_1); + + auto holder_2 = cache.getOrSet(key, 3, 23, false); /// Get [3, 25] once again + auto segments_2 = fromHolder(*holder); + ASSERT_EQ(segments_2.size(), 3); + + assertRange(41, segments_2[0], DB::FileSegment::Range(2, 4), DB::FileSegment::State::DOWNLOADED); + assertRange(42, segments_2[1], DB::FileSegment::Range(5, 23), DB::FileSegment::State::DOWNLOADING); + assertRange(43, segments_2[2], DB::FileSegment::Range(24, 26), DB::FileSegment::State::DOWNLOADED); + + ASSERT_TRUE(segments_2[1]->getDownloader() != DB::FileSegment::getCallerId()); + ASSERT_TRUE(segments_2[1]->state() == DB::FileSegment::State::DOWNLOADING); + + { + std::lock_guard lock(mutex); + lets_start_download = true; + } + cv.notify_one(); + + segments_2[1]->wait(); + printRanges(segments_2); + ASSERT_TRUE(segments_2[1]->state() == DB::FileSegment::State::PARTIALLY_DOWNLOADED); + + ASSERT_TRUE(segments_2[1]->getOrSetDownloader() == DB::FileSegment::getCallerId()); + prepareAndDownload(segments_2[1]); + segments_2[1]->completeWithState(DB::FileSegment::State::DOWNLOADED); + }); + + { + std::unique_lock lock(mutex); + cv.wait(lock, [&]{ return lets_start_download; }); + } + + holder.reset(); + other_1.join(); + printRanges(segments); + ASSERT_TRUE(segments[1]->state() == DB::FileSegment::State::DOWNLOADED); + } } /// Current cache: [___][ ][___][_][__] @@ -481,6 +483,7 @@ TEST(FileCache, get) auto cache2 = DB::FileCache(cache_base_path, settings); cache2.initialize(); + auto key = cache2.hash("key1"); auto holder1 = cache2.getOrSet(key, 2, 28, false); /// Get [2, 29] @@ -501,6 +504,7 @@ TEST(FileCache, get) settings2.max_file_segment_size = 10; auto cache2 = DB::FileCache(caches_dir / "cache2", settings2); cache2.initialize(); + auto key = cache2.hash("key1"); auto holder1 = cache2.getOrSet(key, 0, 25, false); /// Get [0, 24] auto segments1 = fromHolder(holder1); diff --git a/src/Loggers/OwnSplitChannel.cpp b/src/Loggers/OwnSplitChannel.cpp index 933fc09d3e4..b1502cc4558 100644 --- a/src/Loggers/OwnSplitChannel.cpp +++ b/src/Loggers/OwnSplitChannel.cpp @@ -46,6 +46,8 @@ void OwnSplitChannel::log(const Poco::Message & msg) void OwnSplitChannel::tryLogSplit(const Poco::Message & msg) { + LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); + try { logSplit(msg); @@ -62,8 +64,6 @@ void OwnSplitChannel::tryLogSplit(const Poco::Message & msg) /// but let's log it into the stderr at least. catch (...) 
{ - LockMemoryExceptionInThread lock_memory_tracker(VariableContext::Global); - const std::string & exception_message = getCurrentExceptionMessage(true); const std::string & message = msg.getText(); diff --git a/src/Parsers/ASTSelectIntersectExceptQuery.cpp b/src/Parsers/ASTSelectIntersectExceptQuery.cpp index 62eeefba385..75fbe2b5280 100644 --- a/src/Parsers/ASTSelectIntersectExceptQuery.cpp +++ b/src/Parsers/ASTSelectIntersectExceptQuery.cpp @@ -27,7 +27,7 @@ void ASTSelectIntersectExceptQuery::formatImpl(const FormatSettings & settings, if (it != children.begin()) { settings.ostr << settings.nl_or_ws << indent_str << (settings.hilite ? hilite_keyword : "") - << (final_operator == Operator::INTERSECT ? "INTERSECT" : "EXCEPT") + << fromOperator(final_operator) << (settings.hilite ? hilite_none : "") << settings.nl_or_ws; } @@ -53,4 +53,20 @@ ASTs ASTSelectIntersectExceptQuery::getListOfSelects() const return selects; } +const char * ASTSelectIntersectExceptQuery::fromOperator(Operator op) +{ + switch (op) + { + case Operator::EXCEPT_ALL: + return "EXCEPT ALL"; + case Operator::EXCEPT_DISTINCT: + return "EXCEPT DISTINCT"; + case Operator::INTERSECT_ALL: + return "INTERSECT ALL"; + case Operator::INTERSECT_DISTINCT: + return "INTERSECT DISTINCT"; + default: + return ""; + } +} } diff --git a/src/Parsers/ASTSelectIntersectExceptQuery.h b/src/Parsers/ASTSelectIntersectExceptQuery.h index c95944a0c35..ad962fe25e2 100644 --- a/src/Parsers/ASTSelectIntersectExceptQuery.h +++ b/src/Parsers/ASTSelectIntersectExceptQuery.h @@ -1,6 +1,7 @@ #pragma once #include +#include "Parsers/ExpressionListParsers.h" namespace DB @@ -16,8 +17,10 @@ public: enum class Operator { UNKNOWN, - INTERSECT, - EXCEPT + EXCEPT_ALL, + EXCEPT_DISTINCT, + INTERSECT_ALL, + INTERSECT_DISTINCT, }; void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; @@ -26,6 +29,8 @@ public: ASTs getListOfSelects() const; + static const char * fromOperator(Operator op); + /// Final operator after applying visitor. 
Operator final_operator = Operator::UNKNOWN; }; diff --git a/src/Parsers/ASTSelectWithUnionQuery.cpp b/src/Parsers/ASTSelectWithUnionQuery.cpp index 7718b2dc2de..bc413fbe16d 100644 --- a/src/Parsers/ASTSelectWithUnionQuery.cpp +++ b/src/Parsers/ASTSelectWithUnionQuery.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -32,14 +33,18 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F auto mode_to_str = [&](auto mode) { - if (mode == SelectUnionMode::ALL) + if (mode == SelectUnionMode::UNION_ALL) return "UNION ALL"; - else if (mode == SelectUnionMode::DISTINCT) + else if (mode == SelectUnionMode::UNION_DISTINCT) return "UNION DISTINCT"; - else if (mode == SelectUnionMode::INTERSECT) - return "INTERSECT"; - else if (mode == SelectUnionMode::EXCEPT) - return "EXCEPT"; + else if (mode == SelectUnionMode::EXCEPT_ALL) + return "EXCEPT ALL"; + else if (mode == SelectUnionMode::EXCEPT_DISTINCT) + return "EXCEPT DISTINCT"; + else if (mode == SelectUnionMode::INTERSECT_ALL) + return "INTERSECT ALL"; + else if (mode == SelectUnionMode::INTERSECT_DISTINCT) + return "INTERSECT DISTINCT"; return ""; }; @@ -77,8 +82,8 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F bool ASTSelectWithUnionQuery::hasNonDefaultUnionMode() const { - return set_of_modes.contains(SelectUnionMode::DISTINCT) || set_of_modes.contains(SelectUnionMode::INTERSECT) - || set_of_modes.contains(SelectUnionMode::EXCEPT); + return set_of_modes.contains(SelectUnionMode::UNION_DISTINCT) || set_of_modes.contains(SelectUnionMode::INTERSECT_DISTINCT) + || set_of_modes.contains(SelectUnionMode::EXCEPT_DISTINCT); } } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index ea51367ee5d..4e88e5c68e6 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -139,36 +139,38 @@ bool ParserUnionList::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; }; - /// Parse UNION type + /// Parse UNION / INTERSECT / EXCEPT mode + /// The mode can be DEFAULT (unspecified) / DISTINCT / ALL auto parse_separator = [&] { if (s_union_parser.ignore(pos, expected)) { - // SELECT ... UNION ALL SELECT ... if (s_all_parser.check(pos, expected)) - { - union_modes.push_back(SelectUnionMode::ALL); - } - // SELECT ... UNION DISTINCT SELECT ... + union_modes.push_back(SelectUnionMode::UNION_ALL); else if (s_distinct_parser.check(pos, expected)) - { - union_modes.push_back(SelectUnionMode::DISTINCT); - } - // SELECT ... UNION SELECT ... 
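
As a quick reference for the expanded enum, the mapping below spells out which SQL text each new SelectUnionMode corresponds to; it simply mirrors mode_to_str and fromOperator elsewhere in this patch, and toKeyword is a hypothetical helper, not part of the change.

#include <Parsers/SelectUnionMode.h>

/// SELECT ... UNION ALL SELECT ...       -> UNION_ALL
/// SELECT ... UNION DISTINCT SELECT ...  -> UNION_DISTINCT
/// SELECT ... UNION SELECT ...           -> UNION_DEFAULT (resolved later via union_default_mode)
/// and likewise for EXCEPT / INTERSECT with ALL, DISTINCT or no keyword.
const char * toKeyword(DB::SelectUnionMode mode)
{
    using enum DB::SelectUnionMode;
    switch (mode)
    {
        case UNION_ALL:          return "UNION ALL";
        case UNION_DISTINCT:     return "UNION DISTINCT";
        case EXCEPT_ALL:         return "EXCEPT ALL";
        case EXCEPT_DISTINCT:    return "EXCEPT DISTINCT";
        case INTERSECT_ALL:      return "INTERSECT ALL";
        case INTERSECT_DISTINCT: return "INTERSECT DISTINCT";
        /// The *_DEFAULT values carry no keyword of their own; they are rewritten
        /// later according to the corresponding *_default_mode setting.
        case UNION_DEFAULT:
        case EXCEPT_DEFAULT:
        case INTERSECT_DEFAULT:  return "";
    }
    return "";
}
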
+ union_modes.push_back(SelectUnionMode::UNION_DISTINCT); else - { - union_modes.push_back(SelectUnionMode::Unspecified); - } + union_modes.push_back(SelectUnionMode::UNION_DEFAULT); return true; } else if (s_except_parser.check(pos, expected)) { - union_modes.push_back(SelectUnionMode::EXCEPT); + if (s_all_parser.check(pos, expected)) + union_modes.push_back(SelectUnionMode::EXCEPT_ALL); + else if (s_distinct_parser.check(pos, expected)) + union_modes.push_back(SelectUnionMode::EXCEPT_DISTINCT); + else + union_modes.push_back(SelectUnionMode::EXCEPT_DEFAULT); return true; } else if (s_intersect_parser.check(pos, expected)) { - union_modes.push_back(SelectUnionMode::INTERSECT); + if (s_all_parser.check(pos, expected)) + union_modes.push_back(SelectUnionMode::INTERSECT_ALL); + else if (s_distinct_parser.check(pos, expected)) + union_modes.push_back(SelectUnionMode::INTERSECT_DISTINCT); + else + union_modes.push_back(SelectUnionMode::INTERSECT_DEFAULT); return true; } return false; diff --git a/src/Parsers/InsertQuerySettingsPushDownVisitor.cpp b/src/Parsers/InsertQuerySettingsPushDownVisitor.cpp index a3bca76816f..1cebdfde957 100644 --- a/src/Parsers/InsertQuerySettingsPushDownVisitor.cpp +++ b/src/Parsers/InsertQuerySettingsPushDownVisitor.cpp @@ -55,7 +55,7 @@ void InsertQuerySettingsPushDownMatcher::visit(ASTSelectQuery & select_query, AS insert_settings.push_back(setting); else { - /// Do not ovewrite setting that was passed for INSERT + /// Do not overwrite setting that was passed for INSERT /// by settings that was passed for SELECT } } diff --git a/src/Parsers/ParserSelectQuery.cpp b/src/Parsers/ParserSelectQuery.cpp index 66428b144bf..cf335270734 100644 --- a/src/Parsers/ParserSelectQuery.cpp +++ b/src/Parsers/ParserSelectQuery.cpp @@ -224,8 +224,6 @@ bool ParserSelectQuery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) select_query->group_by_with_rollup = true; else if (s_cube.ignore(pos, expected)) select_query->group_by_with_cube = true; - else if (s_grouping_sets.ignore(pos, expected)) - select_query->group_by_with_grouping_sets = true; else if (s_totals.ignore(pos, expected)) select_query->group_by_with_totals = true; else diff --git a/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h b/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h index 2a7ed0125fa..fde8a07b555 100644 --- a/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h +++ b/src/Parsers/QueryWithOutputSettingsPushDownVisitor.h @@ -11,7 +11,7 @@ struct SettingChange; class SettingsChanges; /// Pushdown SETTINGS clause that goes after FORMAT to the SELECT query: -/// (since settings after FORMAT parsed separatelly not in the ParserSelectQuery but in ParserQueryWithOutput) +/// (since settings after FORMAT parsed separately not in the ParserSelectQuery but in ParserQueryWithOutput) /// /// SELECT 1 FORMAT Null SETTINGS max_block_size = 1 -> /// SELECT 1 SETTINGS max_block_size = 1 FORMAT Null SETTINGS max_block_size = 1 diff --git a/src/Parsers/SelectUnionMode.h b/src/Parsers/SelectUnionMode.h index f4ca858d043..ca3637612aa 100644 --- a/src/Parsers/SelectUnionMode.h +++ b/src/Parsers/SelectUnionMode.h @@ -7,11 +7,15 @@ namespace DB { enum class SelectUnionMode { - Unspecified, - ALL, - DISTINCT, - EXCEPT, - INTERSECT + UNION_DEFAULT, + UNION_ALL, + UNION_DISTINCT, + EXCEPT_DEFAULT, + EXCEPT_ALL, + EXCEPT_DISTINCT, + INTERSECT_DEFAULT, + INTERSECT_ALL, + INTERSECT_DISTINCT }; using SelectUnionModes = std::vector; diff --git a/src/Processors/Executors/ExecutionThreadContext.cpp 
b/src/Processors/Executors/ExecutionThreadContext.cpp index 7631cb09f61..eddc1b76d8a 100644 --- a/src/Processors/Executors/ExecutionThreadContext.cpp +++ b/src/Processors/Executors/ExecutionThreadContext.cpp @@ -71,11 +71,11 @@ static void executeJob(ExecutingGraph::Node * node, ReadProgressCallback * read_ bool ExecutionThreadContext::executeTask() { - std::unique_ptr span; + std::unique_ptr span; if (trace_processors) { - span = std::make_unique("ExecutionThreadContext::executeTask() " + node->processor->getName()); + span = std::make_unique("ExecutionThreadContext::executeTask() " + node->processor->getName()); span->addAttribute("thread_number", thread_number); } std::optional execution_time_watch; diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 12fa9710c42..427c159314b 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -55,7 +55,16 @@ Chunk ParquetBlockInputFormat::generate() return res; std::shared_ptr table; - arrow::Status read_status = file_reader->ReadRowGroup(row_group_current, column_indices, &table); + + std::unique_ptr<::arrow::RecordBatchReader> rbr; + std::vector row_group_indices { row_group_current }; + arrow::Status get_batch_reader_status = file_reader->GetRecordBatchReader(row_group_indices, column_indices, &rbr); + + if (!get_batch_reader_status.ok()) + throw ParsingException{"Error while reading Parquet data: " + get_batch_reader_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA}; + + arrow::Status read_status = rbr->ReadAll(&table); + if (!read_status.ok()) throw ParsingException{"Error while reading Parquet data: " + read_status.ToString(), ErrorCodes::CANNOT_READ_ALL_DATA}; diff --git a/src/Processors/Merges/Algorithms/Graphite.cpp b/src/Processors/Merges/Algorithms/Graphite.cpp index 2448a1e2a94..c5c611366ff 100644 --- a/src/Processors/Merges/Algorithms/Graphite.cpp +++ b/src/Processors/Merges/Algorithms/Graphite.cpp @@ -103,17 +103,17 @@ Graphite::RollupRule selectPatternForPath( if (first_match->type == first_match->TypeUndef && pattern.type == pattern.TypeAll) { /// There is only default pattern for both retention and aggregation - return std::pair(&pattern, &pattern); + return {&pattern, &pattern}; } if (pattern.type != first_match->type) { if (first_match->type == first_match->TypeRetention) { - return std::pair(first_match, &pattern); + return {first_match, &pattern}; } if (first_match->type == first_match->TypeAggregation) { - return std::pair(&pattern, first_match); + return {&pattern, first_match}; } } } @@ -125,7 +125,7 @@ Graphite::RollupRule selectPatternForPath( if (pattern.type == pattern.TypeAll) { /// Only for not default patterns with both function and retention parameters - return std::pair(&pattern, &pattern); + return {&pattern, &pattern}; } if (first_match->type == first_match->TypeUndef) { @@ -136,11 +136,11 @@ Graphite::RollupRule selectPatternForPath( { if (first_match->type == first_match->TypeRetention) { - return std::pair(first_match, &pattern); + return {first_match, &pattern}; } if (first_match->type == first_match->TypeAggregation) { - return std::pair(&pattern, first_match); + return {&pattern, first_match}; } } } diff --git a/src/Processors/PingPongProcessor.cpp b/src/Processors/PingPongProcessor.cpp new file mode 100644 index 00000000000..e9d61386314 --- /dev/null +++ b/src/Processors/PingPongProcessor.cpp @@ -0,0 +1,198 @@ +#include + +namespace DB +{ + +/// 
Create list with `num_ports` of regular ports and 1 auxiliary port with empty header. +template requires std::is_same_v || std::is_same_v +static T createPortsWithSpecial(const Block & header, size_t num_ports) +{ + T res(num_ports, header); + res.emplace_back(Block()); + return res; +} + +PingPongProcessor::PingPongProcessor(const Block & header, size_t num_ports, Order order_) + : IProcessor(createPortsWithSpecial(header, num_ports), + createPortsWithSpecial(header, num_ports)) + , aux_in_port(inputs.back()) + , aux_out_port(outputs.back()) + , order(order_) +{ + assert(order == First || order == Second); + + port_pairs.resize(num_ports); + + auto input_it = inputs.begin(); + auto output_it = outputs.begin(); + for (size_t i = 0; i < num_ports; ++i) + { + port_pairs[i].input_port = &*input_it; + ++input_it; + + port_pairs[i].output_port = &*output_it; + ++output_it; + } +} + +void PingPongProcessor::finishPair(PortsPair & pair) +{ + if (!pair.is_finished) + { + pair.output_port->finish(); + pair.input_port->close(); + + pair.is_finished = true; + ++num_finished_pairs; + } +} + +bool PingPongProcessor::processPair(PortsPair & pair) +{ + if (pair.output_port->isFinished()) + { + finishPair(pair); + return false; + } + + if (pair.input_port->isFinished()) + { + finishPair(pair); + return false; + } + + if (!pair.output_port->canPush()) + { + pair.input_port->setNotNeeded(); + return false; + } + + pair.input_port->setNeeded(); + if (pair.input_port->hasData()) + { + Chunk chunk = pair.input_port->pull(true); + ready_to_send |= consume(chunk); + pair.output_port->push(std::move(chunk)); + } + + return true; +} + +bool PingPongProcessor::isPairsFinished() const +{ + return num_finished_pairs == port_pairs.size(); +} + +IProcessor::Status PingPongProcessor::processRegularPorts() +{ + if (isPairsFinished()) + return Status::Finished; + + bool need_data = false; + + for (auto & pair : port_pairs) + need_data = processPair(pair) || need_data; + + if (isPairsFinished()) + return Status::Finished; + + if (need_data) + return Status::NeedData; + + return Status::PortFull; +} + +bool PingPongProcessor::sendPing() +{ + if (aux_out_port.canPush()) + { + Chunk chunk(aux_out_port.getHeader().cloneEmpty().getColumns(), 0); + aux_out_port.push(std::move(chunk)); + is_send = true; + aux_out_port.finish(); + return true; + } + return false; +} + +bool PingPongProcessor::recievePing() +{ + if (aux_in_port.hasData()) + { + aux_in_port.pull(); + is_received = true; + aux_in_port.close(); + return true; + } + return false; +} + +bool PingPongProcessor::canSend() const +{ + return !is_send && (ready_to_send || isPairsFinished()); +} + +IProcessor::Status PingPongProcessor::prepare() +{ + if (!set_needed_once && !is_received && !aux_in_port.isFinished()) + { + set_needed_once = true; + aux_in_port.setNeeded(); + } + + if (order == First || is_send) + { + if (!is_received) + { + bool received = recievePing(); + if (!received) + { + return Status::NeedData; + } + } + } + + if (order == Second || is_received) + { + if (!is_send && canSend()) + { + bool sent = sendPing(); + if (!sent) + return Status::PortFull; + } + } + + auto status = processRegularPorts(); + if (status == Status::Finished) + { + if (order == First || is_send) + { + if (!is_received) + { + bool received = recievePing(); + if (!received) + { + return Status::NeedData; + } + } + } + + if (order == Second || is_received) + { + if (!is_send && canSend()) + { + bool sent = sendPing(); + if (!sent) + return Status::PortFull; + } + } + } + return status; 
+} + +std::pair PingPongProcessor::getAuxPorts() +{ + return std::make_pair(&aux_in_port, &aux_out_port); +} + +} diff --git a/src/Processors/PingPongProcessor.h b/src/Processors/PingPongProcessor.h new file mode 100644 index 00000000000..3dbe1178332 --- /dev/null +++ b/src/Processors/PingPongProcessor.h @@ -0,0 +1,105 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/* + * Processor with N inputs and N outputs. Moves data from i-th input to i-th output as is. + * It has a pair of auxiliary ports to notify another instance by sending empty chunk after some condition holds. + * You should use this processor in pair of instances and connect auxiliary ports crosswise. + * + * ╭─┴───┴───┴───┴───┴─╮ ╭─┴───┴───┴───┴───┴─╮ + * │ ├─ aux ⟶│ │ + * │ PingPongProcessor │ │ PingPongProcessor │ + * │ │⟵ aux ─┤ │ + * ╰─┬───┬───┬───┬───┬─╯ ╰─┬───┬───┬───┬───┬─╯ + * + * One of the processors starts processing data, and another waits for notification. + * When `consume` returns true, the first stops processing, sends a ping to another and waits for notification. + * After that, the second one also processes data until `consume`, then send a notification back to the first one. + * After this roundtrip, processors bypass data from regular inputs to outputs. + */ +class PingPongProcessor : public IProcessor +{ +public: + enum class Order : uint8_t + { + /// Processor that starts processing data. + First, + /// Processor that waits for notification. + Second, + }; + + using enum Order; + + PingPongProcessor(const Block & header, size_t num_ports, Order order_); + + Status prepare() override; + + std::pair getAuxPorts(); + + /// Returns `true` when enough data consumed + virtual bool consume(const Chunk & chunk) = 0; + +protected: + struct PortsPair + { + InputPort * input_port = nullptr; + OutputPort * output_port = nullptr; + bool is_finished = false; + }; + + bool sendPing(); + bool recievePing(); + bool canSend() const; + + bool isPairsFinished() const; + bool processPair(PortsPair & pair); + void finishPair(PortsPair & pair); + Status processRegularPorts(); + + std::vector port_pairs; + size_t num_finished_pairs = 0; + + InputPort & aux_in_port; + OutputPort & aux_out_port; + + bool is_send = false; + bool is_received = false; + + bool ready_to_send = false; + + /// Used to set 'needed' flag once for auxiliary input at first `prepare` call. + bool set_needed_once = false; + + Order order; +}; + +/// Reads first N rows from two streams evenly. 
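
The class comment asks for the two instances to be connected "crosswise"; below is a sketch of that wiring using only the API declared in this header. Integration of the regular ports into a QueryPipeline and error handling are omitted, and connectPingPongPair is illustrative only.

#include <Processors/PingPongProcessor.h>
#include <Processors/Port.h>

using namespace DB;

/// Connect the auxiliary ports of two PingPongProcessor instances crosswise,
/// so that each one's "ping" output feeds the other's "ping" input.
void connectPingPongPair(PingPongProcessor & first, PingPongProcessor & second)
{
    auto [first_aux_in, first_aux_out] = first.getAuxPorts();
    auto [second_aux_in, second_aux_out] = second.getAuxPorts();

    connect(*first_aux_out, *second_aux_in);
    connect(*second_aux_out, *first_aux_in);
}

/// Example construction of such a pair (the regular input/output ports still have to be
/// wired into the surrounding pipeline):
///     ReadHeadBalancedProcessor first(header, num_ports, rows_to_wait, PingPongProcessor::First);
///     ReadHeadBalancedProcessor second(header, num_ports, rows_to_wait, PingPongProcessor::Second);
///     connectPingPongPair(first, second);
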
+class ReadHeadBalancedProcessor : public PingPongProcessor +{ +public: + ReadHeadBalancedProcessor(const Block & header, size_t num_ports, size_t size_to_wait_, Order order_) + : PingPongProcessor(header, num_ports, order_) , data_consumed(0) , size_to_wait(size_to_wait_) + { + } + + String getName() const override { return "ReadHeadBalancedProcessor"; } + + bool consume(const Chunk & chunk) override + { + data_consumed += chunk.getNumRows(); + return data_consumed > size_to_wait; + } + +private: + size_t data_consumed; + size_t size_to_wait; +}; + +} diff --git a/src/Processors/Port.cpp b/src/Processors/Port.cpp index 86431dbc6e6..79532dd4d6c 100644 --- a/src/Processors/Port.cpp +++ b/src/Processors/Port.cpp @@ -8,18 +8,18 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void connect(OutputPort & output, InputPort & input) +void connect(OutputPort & output, InputPort & input, bool reconnect) { - if (input.state) + if (!reconnect && input.state) throw Exception(ErrorCodes::LOGICAL_ERROR, "Port is already connected, (header: [{}])", input.header.dumpStructure()); - if (output.state) + if (!reconnect && output.state) throw Exception(ErrorCodes::LOGICAL_ERROR, "Port is already connected, (header: [{}])", output.header.dumpStructure()); - auto out_name = output.getProcessor().getName(); - auto in_name = input.getProcessor().getName(); + auto out_name = output.processor ? output.getProcessor().getName() : "null"; + auto in_name = input.processor ? input.getProcessor().getName() : "null"; - assertCompatibleHeader(output.getHeader(), input.getHeader(), fmt::format(" function connect between {} and {}", out_name, in_name)); + assertCompatibleHeader(output.getHeader(), input.getHeader(), fmt::format("function connect between {} and {}", out_name, in_name)); input.output_port = &output; output.input_port = &input; diff --git a/src/Processors/Port.h b/src/Processors/Port.h index e3fb0e3e342..9163402f600 100644 --- a/src/Processors/Port.h +++ b/src/Processors/Port.h @@ -25,7 +25,7 @@ namespace ErrorCodes class Port { - friend void connect(OutputPort &, InputPort &); + friend void connect(OutputPort &, InputPort &, bool); friend class IProcessor; public: @@ -267,7 +267,7 @@ protected: /// * You can pull only if port hasData(). class InputPort : public Port { - friend void connect(OutputPort &, InputPort &); + friend void connect(OutputPort &, InputPort &, bool); private: OutputPort * output_port = nullptr; @@ -390,7 +390,7 @@ public: /// * You can push only if port doesn't hasData(). 
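The headers above only define the machinery, so here is a minimal wiring sketch (not part of the patch; `header`, `num_ports` and `size_to_wait` are placeholders, and the regular input/output ports of both processors would still have to be connected to the surrounding pipeline). Two ReadHeadBalancedProcessor instances are paired by connecting their auxiliary ports crosswise with the `connect()` helper from the Port.cpp hunk, so whichever instance consumes `size_to_wait` rows first pings the other:

#include <Processors/PingPongProcessor.h>
#include <Processors/Port.h>

using namespace DB;

static void wirePairOfBalancers(const Block & header, size_t num_ports, size_t size_to_wait)
{
    /// One instance starts processing (First), the other waits for a ping (Second).
    auto left = std::make_shared<ReadHeadBalancedProcessor>(header, num_ports, size_to_wait, PingPongProcessor::First);
    auto right = std::make_shared<ReadHeadBalancedProcessor>(header, num_ports, size_to_wait, PingPongProcessor::Second);

    /// getAuxPorts() returns {&aux_in_port, &aux_out_port}; connect them crosswise,
    /// so each instance's notification arrives at the opposite instance.
    auto [left_aux_in, left_aux_out] = left->getAuxPorts();
    auto [right_aux_in, right_aux_out] = right->getAuxPorts();

    connect(*left_aux_out, *right_aux_in);
    connect(*right_aux_out, *left_aux_in);
}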
class OutputPort : public Port { - friend void connect(OutputPort &, InputPort &); + friend void connect(OutputPort &, InputPort &, bool); private: InputPort * input_port = nullptr; @@ -483,6 +483,6 @@ using InputPorts = std::list; using OutputPorts = std::list; -void connect(OutputPort & output, InputPort & input); +void connect(OutputPort & output, InputPort & input, bool reconnect = false); } diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp index f0374d2419b..03f346d8f72 100644 --- a/src/Processors/QueryPlan/AggregatingStep.cpp +++ b/src/Processors/QueryPlan/AggregatingStep.cpp @@ -182,6 +182,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B transform_params->params.min_free_disk_space, transform_params->params.compile_aggregate_expressions, transform_params->params.min_count_to_compile_aggregate_expression, + transform_params->params.max_block_size, /* only_merge */ false, transform_params->params.stats_collecting_params}; auto transform_params_for_set = std::make_shared(src_header, std::move(params_for_set), final); @@ -250,14 +251,17 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B outputs.push_back(grouping_node); const auto & missing_columns = grouping_sets_params[set_counter].missing_keys; + const auto & used_keys = grouping_sets_params[set_counter].used_keys; auto to_nullable_function = FunctionFactory::instance().get("toNullable", nullptr); for (size_t i = 0; i < output_header.columns(); ++i) { auto & col = output_header.getByPosition(i); - const auto it = std::find_if( + const auto missing_it = std::find_if( missing_columns.begin(), missing_columns.end(), [&](const auto & missing_col) { return missing_col == col.name; }); - if (it != missing_columns.end()) + const auto used_it = std::find_if( + used_keys.begin(), used_keys.end(), [&](const auto & used_col) { return used_col == col.name; }); + if (missing_it != missing_columns.end()) { auto column_with_default = col.column->cloneEmpty(); col.type->insertDefaultInto(*column_with_default); @@ -269,7 +273,7 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B else { const auto * column_node = dag->getOutputs()[header.getPositionByName(col.name)]; - if (group_by_use_nulls && column_node->result_type->canBeInsideNullable()) + if (used_it != used_keys.end() && group_by_use_nulls && column_node->result_type->canBeInsideNullable()) outputs.push_back(&dag->addFunction(to_nullable_function, { column_node }, col.name)); else outputs.push_back(column_node); @@ -376,16 +380,15 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B }); /// We add the explicit resize here, but not in case of aggregating in order, since AIO don't use two-level hash tables and thus returns only buckets with bucket_number = -1. - pipeline.resize(should_produce_results_in_order_of_bucket_number ? 1 : pipeline.getNumStreams(), true /* force */); + pipeline.resize(should_produce_results_in_order_of_bucket_number ? 1 : params.max_threads, true /* force */); aggregating = collector.detachProcessors(0); } else { - pipeline.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header, transform_params); - }); + pipeline.addSimpleTransform([&](const Block & header) { return std::make_shared(header, transform_params); }); + + pipeline.resize(should_produce_results_in_order_of_bucket_number ? 
1 : params.max_threads, false /* force */); aggregating = collector.detachProcessors(0); } diff --git a/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.cpp b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.cpp new file mode 100644 index 00000000000..e42642ceff8 --- /dev/null +++ b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.cpp @@ -0,0 +1,205 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +static void connectAllInputs(OutputPortRawPtrs ports, InputPorts & inputs, size_t num_ports) +{ + auto input_it = inputs.begin(); + for (size_t i = 0; i < num_ports; ++i) + { + connect(*ports[i], *input_it); + input_it++; + } +} + +static ColumnsWithTypeAndName getColumnSubset(const Block & block, const Names & column_names) +{ + ColumnsWithTypeAndName result; + for (const auto & name : column_names) + result.emplace_back(block.getByName(name)); + return result; +} + +static ITransformingStep::Traits getTraits() +{ + return ITransformingStep::Traits + { + { + .preserves_distinct_columns = true, + .returns_single_stream = false, + .preserves_number_of_streams = true, + .preserves_sorting = true, + }, + { + .preserves_number_of_rows = false, + } + }; +} + +class CreateSetAndFilterOnTheFlyStep::CrosswiseConnection : public boost::noncopyable +{ +public: + using PortPair = std::pair; + + /// Remember ports passed on the first call and connect with ones from second call. + /// Thread-safe. + void connectPorts(PortPair rhs_ports, IProcessor * proc) + { + assert(!rhs_ports.first->isConnected() && !rhs_ports.second->isConnected()); + + std::lock_guard lock(mux); + if (input_port || output_port) + { + assert(input_port && output_port); + assert(!input_port->isConnected()); + connect(*rhs_ports.second, *input_port); + connect(*output_port, *rhs_ports.first, /* reconnect= */ true); + } + else + { + std::tie(input_port, output_port) = rhs_ports; + assert(input_port && output_port); + assert(!input_port->isConnected() && !output_port->isConnected()); + + dummy_input_port = std::make_unique(output_port->getHeader(), proc); + connect(*output_port, *dummy_input_port); + } + } + +private: + std::mutex mux; + InputPort * input_port = nullptr; + OutputPort * output_port = nullptr; + + /// Output ports should always be connected, and we can't add a step to the pipeline without them. + /// So, connect the port from the first processor to this dummy port and then reconnect to the second processor. 
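The `reconnect` flag added to `connect()` in the Port.cpp hunk is what makes this dummy-port trick work. A condensed sketch of the semantics, with placeholder names: `balancer_out` is an auxiliary output of the first stream's balancer, `dummy_in` the dummy InputPort created above, and `second_balancer_in` the auxiliary input of the balancer built later for the other stream.

connect(balancer_out, dummy_in);                                  /// keeps the output "connected" so the pipeline stays valid
connect(balancer_out, second_balancer_in, /* reconnect= */ true); /// re-points the output instead of throwing "Port is already connected"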
+ std::unique_ptr dummy_input_port; +}; + +CreateSetAndFilterOnTheFlyStep::CrosswiseConnectionPtr CreateSetAndFilterOnTheFlyStep::createCrossConnection() +{ + return std::make_shared(); +} + +CreateSetAndFilterOnTheFlyStep::CreateSetAndFilterOnTheFlyStep( + const DataStream & input_stream_, + const Names & column_names_, + size_t max_rows_in_set_, + CrosswiseConnectionPtr crosswise_connection_, + JoinTableSide position_) + : ITransformingStep(input_stream_, input_stream_.header, getTraits()) + , column_names(column_names_) + , max_rows_in_set(max_rows_in_set_) + , own_set(std::make_shared(SizeLimits(max_rows_in_set, 0, OverflowMode::BREAK), false, true)) + , filtering_set(nullptr) + , crosswise_connection(crosswise_connection_) + , position(position_) +{ + if (crosswise_connection == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Crosswise connection is not initialized"); + + if (input_streams.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Step requires exactly one input stream, got {}", input_streams.size()); + + own_set->setHeader(getColumnSubset(input_streams[0].header, column_names)); +} + +void CreateSetAndFilterOnTheFlyStep::transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) +{ + size_t num_streams = pipeline.getNumStreams(); + pipeline.addSimpleTransform([this, num_streams](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr + { + if (stream_type != QueryPipelineBuilder::StreamType::Main) + return nullptr; + auto res = std::make_shared(header, column_names, num_streams, own_set); + res->setDescription(this->getStepDescription()); + return res; + }); + + Block input_header = pipeline.getHeader(); + auto pipeline_transform = [&input_header, this](OutputPortRawPtrs ports) + { + Processors result_transforms; + + size_t num_ports = ports.size(); + + /// Add balancing transform + auto idx = position == JoinTableSide::Left ? PingPongProcessor::First : PingPongProcessor::Second; + auto stream_balancer = std::make_shared(input_header, num_ports, max_rows_in_set, idx); + stream_balancer->setDescription(getStepDescription()); + + /// Regular inputs just bypass data for respective ports + connectAllInputs(ports, stream_balancer->getInputs(), num_ports); + + /// Connect auxiliary ports + crosswise_connection->connectPorts(stream_balancer->getAuxPorts(), stream_balancer.get()); + + if (!filtering_set) + { + LOG_DEBUG(log, "Skip filtering {} stream", position); + result_transforms.emplace_back(std::move(stream_balancer)); + return result_transforms; + } + + /// Add filtering transform, ports just connected respectively + auto & outputs = stream_balancer->getOutputs(); + auto output_it = outputs.begin(); + for (size_t i = 0; i < outputs.size() - 1; ++i) + { + auto & port = *output_it++; + auto transform = std::make_shared(port.getHeader(), column_names, filtering_set); + transform->setDescription(this->getStepDescription()); + connect(port, transform->getInputPort()); + result_transforms.emplace_back(std::move(transform)); + } + assert(output_it == std::prev(outputs.end())); + result_transforms.emplace_back(std::move(stream_balancer)); + + return result_transforms; + }; + + /// Auxiliary port stream_balancer can be connected later (by crosswise_connection). 
+ /// So, use unsafe `transform` with `check_ports = false` to avoid assertions + pipeline.transform(std::move(pipeline_transform), /* check_ports= */ false); +} + +void CreateSetAndFilterOnTheFlyStep::describeActions(JSONBuilder::JSONMap & map) const +{ + map.add(getName(), true); +} + +void CreateSetAndFilterOnTheFlyStep::describeActions(FormatSettings & settings) const +{ + String prefix(settings.offset, ' '); + settings.out << prefix << getName(); + + settings.out << '\n'; +} + +void CreateSetAndFilterOnTheFlyStep::updateOutputStream() +{ + if (input_streams.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "{} requires exactly one input stream, got {}", getName(), input_streams.size()); + + own_set->setHeader(getColumnSubset(input_streams[0].header, column_names)); + + output_stream = input_streams[0]; +} + + +} diff --git a/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h new file mode 100644 index 00000000000..8c2eef00af0 --- /dev/null +++ b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h @@ -0,0 +1,59 @@ +#pragma once +#include +#include +#include + + +namespace DB +{ + +/* + * Used to optimize JOIN when joining a small table over a large table. + * Currently applied only for the full sorting join. + * It tries to build a set for each stream. + * Once one stream is finished, it starts to filter another stream with this set. + */ +class CreateSetAndFilterOnTheFlyStep : public ITransformingStep +{ +public: + /// Two instances of step need some shared state to connect processors crosswise + class CrosswiseConnection; + using CrosswiseConnectionPtr = std::shared_ptr; + static CrosswiseConnectionPtr createCrossConnection(); + + CreateSetAndFilterOnTheFlyStep( + const DataStream & input_stream_, + const Names & column_names_, + size_t max_rows_in_set_, + CrosswiseConnectionPtr crosswise_connection_, + JoinTableSide position_); + + String getName() const override { return "CreateSetAndFilterOnTheFlyStep"; } + void transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override; + + void describeActions(JSONBuilder::JSONMap & map) const override; + void describeActions(FormatSettings & settings) const override; + + SetWithStatePtr getSet() const { return own_set; } + + /// Set for another stream. 
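The two step instances for the left and right streams are expected to be paired by the code that builds the join plan, which is not part of this hunk. A hypothetical sketch of that pairing (`left_stream`, `right_stream`, the key name lists and `max_rows_in_set` are placeholders); `setFiltering`, declared just below, hands each side the set built by the opposite side:

auto crosswise_connection = CreateSetAndFilterOnTheFlyStep::createCrossConnection();

auto left_step = std::make_unique<CreateSetAndFilterOnTheFlyStep>(
    left_stream, left_key_names, max_rows_in_set, crosswise_connection, JoinTableSide::Left);
auto right_step = std::make_unique<CreateSetAndFilterOnTheFlyStep>(
    right_stream, right_key_names, max_rows_in_set, crosswise_connection, JoinTableSide::Right);

/// Each side filters its own stream by the set that the other side is building.
left_step->setFiltering(right_step->getSet());
right_step->setFiltering(left_step->getSet());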
+ void setFiltering(SetWithStatePtr filtering_set_) { filtering_set = filtering_set_; } + +private: + void updateOutputStream() override; + + Names column_names; + + size_t max_rows_in_set; + + SetWithStatePtr own_set; + SetWithStatePtr filtering_set; + + CrosswiseConnectionPtr crosswise_connection; + + JoinTableSide position; + + Poco::Logger * log = &Poco::Logger::get("CreateSetAndFilterOnTheFlyStep"); +}; + +} diff --git a/src/Processors/QueryPlan/DistinctStep.cpp b/src/Processors/QueryPlan/DistinctStep.cpp index 7da2b5252f5..e3d29256c23 100644 --- a/src/Processors/QueryPlan/DistinctStep.cpp +++ b/src/Processors/QueryPlan/DistinctStep.cpp @@ -108,7 +108,7 @@ void DistinctStep::transformPipeline(QueryPipelineBuilder & pipeline, const Buil return; } /// final distinct for sorted stream (sorting inside and among chunks) - if (input_stream.sort_mode == DataStream::SortMode::Stream) + if (input_stream.sort_scope == DataStream::SortScope::Global) { assert(input_stream.has_single_port); diff --git a/src/Processors/QueryPlan/IQueryPlanStep.h b/src/Processors/QueryPlan/IQueryPlanStep.h index a66582ff06d..c5bd64d66be 100644 --- a/src/Processors/QueryPlan/IQueryPlanStep.h +++ b/src/Processors/QueryPlan/IQueryPlanStep.h @@ -31,18 +31,18 @@ public: /// QueryPipeline has single port. Totals or extremes ports are not counted. bool has_single_port = false; - /// How data is sorted. - enum class SortMode + /// Sorting scope + enum class SortScope { None, Chunk, /// Separate chunks are sorted - Port, /// Data from each port is sorted - Stream, /// Data is globally sorted + Stream, /// Each data steam is sorted + Global, /// Data is globally sorted }; /// It is not guaranteed that header has columns from sort_description. SortDescription sort_description = {}; - SortMode sort_mode = SortMode::None; + SortScope sort_scope = SortScope::None; /// Things which may be added: /// * limit @@ -54,7 +54,7 @@ public: return distinct_columns == other.distinct_columns && has_single_port == other.has_single_port && sort_description == other.sort_description - && (sort_description.empty() || sort_mode == other.sort_mode); + && (sort_description.empty() || sort_scope == other.sort_scope); } bool hasEqualHeaderWith(const DataStream & other) const diff --git a/src/Processors/QueryPlan/ITransformingStep.cpp b/src/Processors/QueryPlan/ITransformingStep.cpp index 9b9797b6540..64ad2ec5626 100644 --- a/src/Processors/QueryPlan/ITransformingStep.cpp +++ b/src/Processors/QueryPlan/ITransformingStep.cpp @@ -29,7 +29,7 @@ DataStream ITransformingStep::createOutputStream( if (stream_traits.preserves_sorting) { output_stream.sort_description = input_stream.sort_description; - output_stream.sort_mode = input_stream.sort_mode; + output_stream.sort_scope = input_stream.sort_scope; } return output_stream; diff --git a/src/Processors/QueryPlan/JoinStep.cpp b/src/Processors/QueryPlan/JoinStep.cpp index 909933fbed2..6e212a53bc6 100644 --- a/src/Processors/QueryPlan/JoinStep.cpp +++ b/src/Processors/QueryPlan/JoinStep.cpp @@ -34,8 +34,12 @@ QueryPipelineBuilderPtr JoinStep::updatePipeline(QueryPipelineBuilders pipelines throw Exception(ErrorCodes::LOGICAL_ERROR, "JoinStep expect two input steps"); if (join->pipelineType() == JoinPipelineType::YShaped) - return QueryPipelineBuilder::joinPipelinesYShaped( + { + auto joined_pipeline = QueryPipelineBuilder::joinPipelinesYShaped( std::move(pipelines[0]), std::move(pipelines[1]), join, output_stream->header, max_block_size, &processors); + joined_pipeline->resize(max_streams); + return 
joined_pipeline; + } return QueryPipelineBuilder::joinPipelinesRightLeft( std::move(pipelines[0]), diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 904f30e84b0..f45200f3026 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -63,7 +63,7 @@ inline const auto & getOptimizations() {tryMergeExpressions, "mergeExpressions", &QueryPlanOptimizationSettings::optimize_plan}, {tryPushDownFilter, "pushDownFilter", &QueryPlanOptimizationSettings::filter_push_down}, {tryExecuteFunctionsAfterSorting, "liftUpFunctions", &QueryPlanOptimizationSettings::optimize_plan}, - {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_plan} + {tryReuseStorageOrderingForWindowFunctions, "reuseStorageOrderingForWindowFunctions", &QueryPlanOptimizationSettings::optimize_read_in_window_order} }}; return optimizations; diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp index 1472fb87a89..f9707b973e4 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.cpp @@ -11,6 +11,7 @@ QueryPlanOptimizationSettings QueryPlanOptimizationSettings::fromSettings(const settings.optimize_plan = from.query_plan_enable_optimizations; settings.max_optimizations_to_apply = from.query_plan_max_optimizations_to_apply; settings.filter_push_down = from.query_plan_filter_push_down; + settings.optimize_read_in_window_order = from.optimize_read_in_window_order; return settings; } diff --git a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h index b5a37bf69d6..99e52b60a73 100644 --- a/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h +++ b/src/Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h @@ -21,6 +21,9 @@ struct QueryPlanOptimizationSettings /// If filter push down optimization is enabled. 
bool filter_push_down = true; + /// window functions read in order optimization + bool optimize_read_in_window_order = true; + static QueryPlanOptimizationSettings fromSettings(const Settings & from); static QueryPlanOptimizationSettings fromContext(ContextPtr from); }; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 6c6c8954ea4..dbf389163be 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -22,6 +23,7 @@ #include #include #include +#include namespace DB::ErrorCodes { @@ -134,10 +136,24 @@ tryAddNewFilterStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, con static size_t tryAddNewFilterStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, const Names & allowed_inputs, - bool can_remove_filter = true) + bool can_remove_filter = true, size_t child_idx = 0) { - if (auto split_filter = splitFilter(parent_node, allowed_inputs, 0)) - return tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, 0); + if (auto split_filter = splitFilter(parent_node, allowed_inputs, child_idx)) + return tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, child_idx); + return 0; +} + + +/// Push down filter through specified type of step +template +static size_t simplePushDownOverStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, QueryPlanStepPtr & child) +{ + if (typeid_cast(child.get())) + { + Names allowed_inputs = child->getOutputStream().header.getNames(); + if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) + return updated_steps; + } return 0; } @@ -234,12 +250,8 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return updated_steps; } - if (auto * distinct = typeid_cast(child.get())) - { - Names allowed_inputs = distinct->getOutputStream().header.getNames(); - if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) - return updated_steps; - } + if (auto updated_steps = simplePushDownOverStep(parent_node, nodes, child)) + return updated_steps; if (auto * join = typeid_cast(child.get())) { @@ -290,7 +302,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes const size_t updated_steps = tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, child_idx); if (updated_steps > 0) { - LOG_DEBUG(&Poco::Logger::get("QueryPlanOptimizations"), "Pushed down filter to {} side of join", kind); + LOG_DEBUG(&Poco::Logger::get("QueryPlanOptimizations"), "Pushed down filter {} to the {} side of join", split_filter_column_name, kind); } return updated_steps; }; @@ -321,12 +333,11 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes // { // } - if (typeid_cast(child.get())) - { - Names allowed_inputs = child->getOutputStream().header.getNames(); - if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) - return updated_steps; - } + if (auto updated_steps = simplePushDownOverStep(parent_node, nodes, child)) + return updated_steps; + + if (auto updated_steps = simplePushDownOverStep(parent_node, nodes, child)) + return updated_steps; if (auto * union_step = typeid_cast(child.get())) { diff --git a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp 
b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp index 2d7b0313955..8377b62c947 100644 --- a/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp +++ b/src/Processors/QueryPlan/Optimizations/reuseStorageOrderingForWindowFunctions.cpp @@ -29,7 +29,7 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, { /// Find the following sequence of steps, add InputOrderInfo and apply prefix sort description to /// SortingStep: - /// WindowStep <- SortingStep <- [Expression] <- [SettingQuotaAndLimits] <- ReadFromMergeTree + /// WindowStep <- SortingStep <- [Expression] <- ReadFromMergeTree auto * window_node = parent_node; auto * window = typeid_cast(window_node->step.get()); @@ -61,12 +61,7 @@ size_t tryReuseStorageOrderingForWindowFunctions(QueryPlan::Node * parent_node, return 0; } - auto context = read_from_merge_tree->getContext(); - if (!context->getSettings().optimize_read_in_window_order) - { - return 0; - } - + const auto context = read_from_merge_tree->getContext(); const auto & query_info = read_from_merge_tree->getQueryInfo(); const auto * select_query = query_info.query->as(); diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index c27c0c0d318..9b4c0a6e920 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -333,8 +333,8 @@ static void explainStep( { if (step.hasOutputStream()) { - settings.out << prefix << "Sorting (" << step.getOutputStream().sort_mode << ")"; - if (step.getOutputStream().sort_mode != DataStream::SortMode::None) + settings.out << prefix << "Sorting (" << step.getOutputStream().sort_scope << ")"; + if (step.getOutputStream().sort_scope != DataStream::SortScope::None) { settings.out << ": "; dumpSortDescription(step.getOutputStream().sort_description, settings.out); diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1f6c6ee2a3f..60bf8d6a15c 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -143,9 +143,9 @@ ReadFromMergeTree::ReadFromMergeTree( { auto const & settings = context->getSettingsRef(); if ((settings.optimize_read_in_order || settings.optimize_aggregation_in_order) && query_info.getInputOrderInfo()) - output_stream->sort_mode = DataStream::SortMode::Port; + output_stream->sort_scope = DataStream::SortScope::Stream; else - output_stream->sort_mode = DataStream::SortMode::Chunk; + output_stream->sort_scope = DataStream::SortScope::Chunk; } output_stream->sort_description = std::move(sort_description); @@ -179,7 +179,6 @@ Pipe ReadFromMergeTree::readFromPool( sum_marks, min_marks_for_concurrent_read, std::move(parts_with_range), - data, storage_snapshot, prewhere_info, required_columns, diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp index 38e02eebd44..9bad6a02d53 100644 --- a/src/Processors/QueryPlan/SortingStep.cpp +++ b/src/Processors/QueryPlan/SortingStep.cpp @@ -55,7 +55,7 @@ SortingStep::SortingStep( { /// TODO: check input_stream is partially sorted by the same description. output_stream->sort_description = result_description; - output_stream->sort_mode = DataStream::SortMode::Stream; + output_stream->sort_scope = DataStream::SortScope::Global; } SortingStep::SortingStep( @@ -73,7 +73,7 @@ SortingStep::SortingStep( { /// TODO: check input_stream is sorted by prefix_description. 
output_stream->sort_description = result_description; - output_stream->sort_mode = DataStream::SortMode::Stream; + output_stream->sort_scope = DataStream::SortScope::Global; } SortingStep::SortingStep( @@ -89,14 +89,14 @@ SortingStep::SortingStep( { /// TODO: check input_stream is partially sorted (each port) by the same description. output_stream->sort_description = result_description; - output_stream->sort_mode = DataStream::SortMode::Stream; + output_stream->sort_scope = DataStream::SortScope::Global; } void SortingStep::updateOutputStream() { output_stream = createOutputStream(input_streams.front(), input_streams.front().header, getDataStreamTraits()); output_stream->sort_description = result_description; - output_stream->sort_mode = DataStream::SortMode::Stream; + output_stream->sort_scope = DataStream::SortScope::Global; } void SortingStep::updateLimit(size_t limit_) @@ -256,23 +256,23 @@ void SortingStep::transformPipeline(QueryPipelineBuilder & pipeline, const Build return; } - const auto input_sort_mode = input_streams.front().sort_mode; + const auto input_sort_mode = input_streams.front().sort_scope; const SortDescription & input_sort_desc = input_streams.front().sort_description; if (optimize_sorting_by_input_stream_properties) { /// skip sorting if stream is already sorted - if (input_sort_mode == DataStream::SortMode::Stream && input_sort_desc.hasPrefix(result_description)) + if (input_sort_mode == DataStream::SortScope::Global && input_sort_desc.hasPrefix(result_description)) return; /// merge sorted - if (input_sort_mode == DataStream::SortMode::Port && input_sort_desc.hasPrefix(result_description)) + if (input_sort_mode == DataStream::SortScope::Stream && input_sort_desc.hasPrefix(result_description)) { mergingSorted(pipeline, result_description, limit); return; } /// if chunks already sorted according to result_sort_desc, then we can skip chunk sorting - if (input_sort_mode == DataStream::SortMode::Chunk && input_sort_desc.hasPrefix(result_description)) + if (input_sort_mode == DataStream::SortScope::Chunk && input_sort_desc.hasPrefix(result_description)) { const bool skip_partial_sort = true; fullSort(pipeline, result_description, limit, skip_partial_sort); diff --git a/src/Processors/ResizeProcessor.h b/src/Processors/ResizeProcessor.h index f9c188e041a..364d1b4c883 100644 --- a/src/Processors/ResizeProcessor.h +++ b/src/Processors/ResizeProcessor.h @@ -85,6 +85,13 @@ public: { } + StrictResizeProcessor(InputPorts inputs_, OutputPorts outputs_) + : IProcessor(inputs_, outputs_) + , current_input(inputs.begin()) + , current_output(outputs.begin()) + { + } + String getName() const override { return "StrictResize"; } Status prepare(const PortNumbers &, const PortNumbers &) override; diff --git a/src/Processors/Sources/SQLiteSource.cpp b/src/Processors/Sources/SQLiteSource.cpp index 814480b63e3..60d39966659 100644 --- a/src/Processors/Sources/SQLiteSource.cpp +++ b/src/Processors/Sources/SQLiteSource.cpp @@ -39,7 +39,7 @@ SQLiteSource::SQLiteSource( if (status != SQLITE_OK) throw Exception(ErrorCodes::SQLITE_ENGINE_ERROR, - "Cannot prepate sqlite statement. Status: {}. Message: {}", + "Cannot prepare sqlite statement. Status: {}. 
Message: {}", status, sqlite3_errstr(status)); compiled_statement = std::unique_ptr(compiled_stmt, StatementDeleter()); diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index 0d160b8d32d..6a813a770cf 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -38,7 +38,8 @@ TTLAggregationAlgorithm::TTLAggregationAlgorithm( settings.max_threads, settings.min_free_disk_space_for_temporary_data, settings.compile_aggregate_expressions, - settings.min_count_to_compile_aggregate_expression); + settings.min_count_to_compile_aggregate_expression, + settings.max_block_size); aggregator = std::make_unique(header, params); diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 2c7a4e23119..c2de0c3a23a 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -182,7 +182,8 @@ void AggregatingInOrderTransform::consume(Chunk chunk) if (cur_block_size >= max_block_size || cur_block_bytes + current_memory_usage >= max_block_bytes) { if (group_by_key) - group_by_block = params->aggregator.prepareBlockAndFillSingleLevel(variants, /* final= */ false); + group_by_block + = params->aggregator.prepareBlockAndFillSingleLevel(variants, /* final= */ false); cur_block_bytes += current_memory_usage; finalizeCurrentChunk(std::move(chunk), key_end); return; @@ -293,7 +294,8 @@ void AggregatingInOrderTransform::generate() if (cur_block_size && is_consume_finished) { if (group_by_key) - group_by_block = params->aggregator.prepareBlockAndFillSingleLevel(variants, /* final= */ false); + group_by_block + = params->aggregator.prepareBlockAndFillSingleLevel(variants, /* final= */ false); else params->aggregator.addSingleKeyToAggregateColumns(variants, res_aggregate_columns); variants.invalidate(); diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index 7f5896f5e97..4e55081ca48 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -203,7 +203,7 @@ public: { auto & output = outputs.front(); - if (finished && !has_input) + if (finished && single_level_chunks.empty()) { output.finish(); return Status::Finished; @@ -230,7 +230,7 @@ public: if (!processors.empty()) return Status::ExpandPipeline; - if (has_input) + if (!single_level_chunks.empty()) return preparePushToOutput(); /// Single level case. 
@@ -244,11 +244,14 @@ public: private: IProcessor::Status preparePushToOutput() { - auto & output = outputs.front(); - output.push(std::move(current_chunk)); - has_input = false; + if (single_level_chunks.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Some ready chunks expected"); - if (finished) + auto & output = outputs.front(); + output.push(std::move(single_level_chunks.back())); + single_level_chunks.pop_back(); + + if (finished && single_level_chunks.empty()) { output.finish(); return Status::Finished; @@ -268,17 +271,17 @@ private: { auto chunk = input.pull(); auto bucket = getInfoFromChunk(chunk)->bucket_num; - chunks[bucket] = std::move(chunk); + two_level_chunks[bucket] = std::move(chunk); } } if (!shared_data->is_bucket_processed[current_bucket_num]) return Status::NeedData; - if (!chunks[current_bucket_num]) + if (!two_level_chunks[current_bucket_num]) return Status::NeedData; - output.push(std::move(chunks[current_bucket_num])); + output.push(std::move(two_level_chunks[current_bucket_num])); ++current_bucket_num; if (current_bucket_num == NUM_BUCKETS) @@ -298,27 +301,16 @@ private: size_t num_threads; bool is_initialized = false; - bool has_input = false; bool finished = false; - Chunk current_chunk; + Chunks single_level_chunks; UInt32 current_bucket_num = 0; static constexpr Int32 NUM_BUCKETS = 256; - std::array chunks; + std::array two_level_chunks; Processors processors; - void setCurrentChunk(Chunk chunk) - { - if (has_input) - throw Exception("Current chunk was already set in " - "ConvertingAggregatedToChunksTransform.", ErrorCodes::LOGICAL_ERROR); - - has_input = true; - current_chunk = std::move(chunk); - } - void initialize() { is_initialized = true; @@ -339,7 +331,7 @@ private: auto block = params->aggregator.prepareBlockAndFillWithoutKey( *first, params->final, first->type != AggregatedDataVariants::Type::without_key); - setCurrentChunk(convertToChunk(block)); + single_level_chunks.emplace_back(convertToChunk(block)); } } @@ -364,9 +356,10 @@ private: else throw Exception("Unknown aggregated data variant.", ErrorCodes::UNKNOWN_AGGREGATED_DATA_VARIANT); - auto block = params->aggregator.prepareBlockAndFillSingleLevel(*first, params->final); + auto blocks = params->aggregator.prepareBlockAndFillSingleLevel(*first, params->final); + for (auto & block : blocks) + single_level_chunks.emplace_back(convertToChunk(block)); - setCurrentChunk(convertToChunk(block)); finished = true; } diff --git a/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.cpp b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.cpp new file mode 100644 index 00000000000..4278eb8e8b2 --- /dev/null +++ b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.cpp @@ -0,0 +1,195 @@ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace +{ + +std::vector getColumnIndices(const Block & block, const Names & column_names) +{ + std::vector indices; + for (const auto & name : column_names) + indices.push_back(block.getPositionByName(name)); + return indices; +} + +Columns getColumnsByIndices(const Chunk & chunk, const std::vector & indices) +{ + Columns columns; + const Columns & all_cols = chunk.getColumns(); + for (const auto & index : indices) + columns.push_back(all_cols.at(index)); + return columns; +} + +ColumnsWithTypeAndName getColumnsByIndices(const Block & sample_block, const Chunk & chunk, const std::vector & indices) +{ 
+ Block block = sample_block.cloneEmpty(); + block.setColumns(getColumnsByIndices(chunk, indices)); + return block.getColumnsWithTypeAndName(); +} + +} + +CreatingSetsOnTheFlyTransform::CreatingSetsOnTheFlyTransform( + const Block & header_, const Names & column_names_, size_t num_streams_, SetWithStatePtr set_) + : ISimpleTransform(header_, header_, true) + , column_names(column_names_) + , key_column_indices(getColumnIndices(inputs.front().getHeader(), column_names)) + , num_streams(num_streams_) + , set(set_) +{ +} + +IProcessor::Status CreatingSetsOnTheFlyTransform::prepare() +{ + IProcessor::Status status = ISimpleTransform::prepare(); + + if (!set || status != Status::Finished) + /// Nothing to do with set + return status; + + /// Finalize set + if (set->state == SetWithState::State::Creating) + { + if (input.isFinished()) + { + set->finished_count++; + if (set->finished_count != num_streams) + /// Not all instances of processor are finished + return status; + + set->finishInsert(); + set->state = SetWithState::State::Finished; + LOG_DEBUG(log, "{}: finish building set for [{}] with {} rows, set size is {}", + getDescription(), fmt::join(column_names, ", "), set->getTotalRowCount(), + formatReadableSizeWithBinarySuffix(set->getTotalByteCount())); + set.reset(); + } + else + { + /// Should not happen because processor inserted before join that reads all the data + throw Exception(ErrorCodes::LOGICAL_ERROR, "Processor finished, but not all input was read"); + } + } + + return status; +} + +void CreatingSetsOnTheFlyTransform::transform(Chunk & chunk) +{ + if (!set || set->state != SetWithState::State::Creating) + { + /// If set building suspended by another processor, release pointer + if (set != nullptr) + set.reset(); + return; + } + + if (chunk.getNumRows()) + { + Columns key_columns = getColumnsByIndices(chunk, key_column_indices); + bool limit_exceeded = !set->insertFromBlock(key_columns); + if (limit_exceeded) + { + auto prev_state = set->state.exchange(SetWithState::State::Suspended); + /// Print log only after first state switch + if (prev_state == SetWithState::State::Creating) + { + LOG_DEBUG(log, "{}: set limit exceeded, give up building set, after reading {} rows and using {}", + getDescription(), set->getTotalRowCount(), formatReadableSizeWithBinarySuffix(set->getTotalByteCount())); + } + /// Probaply we need to clear set here, because it's unneeded anymore + /// But now `Set` doesn't have such method, so reset pointer in all processors and then it should be freed + set.reset(); + } + } +} + +FilterBySetOnTheFlyTransform::FilterBySetOnTheFlyTransform(const Block & header_, const Names & column_names_, SetWithStatePtr set_) + : ISimpleTransform(header_, header_, true) + , column_names(column_names_) + , key_column_indices(getColumnIndices(inputs.front().getHeader(), column_names)) + , set(set_) +{ + const auto & header = inputs.front().getHeader(); + for (size_t idx : key_column_indices) + key_sample_block.insert(header.getByPosition(idx)); +} + +IProcessor::Status FilterBySetOnTheFlyTransform::prepare() +{ + auto status = ISimpleTransform::prepare(); + + if (set && set->state == SetWithState::State::Suspended) + set.reset(); + + if (status == Status::Finished) + { + bool has_filter = set && set->state == SetWithState::State::Finished; + if (has_filter) + { + LOG_DEBUG(log, "Finished {} by [{}]: consumed {} rows in total, {} rows bypassed, result {} rows, {:.2f}% filtered", + Poco::toLower(getDescription()), fmt::join(column_names, ", "), + stat.consumed_rows, 
stat.consumed_rows_before_set, stat.result_rows, + 100 - 100.0 * stat.result_rows / stat.consumed_rows); + } + else + { + LOG_DEBUG(log, "Finished {}: bypass {} rows", Poco::toLower(getDescription()), stat.consumed_rows); + } + + /// Release set to free memory + set = nullptr; + } + return status; +} + +void FilterBySetOnTheFlyTransform::transform(Chunk & chunk) +{ + stat.consumed_rows += chunk.getNumRows(); + stat.result_rows += chunk.getNumRows(); + + bool can_filter = set && set->state == SetWithState::State::Finished; + if (!can_filter) + stat.consumed_rows_before_set += chunk.getNumRows(); + + if (can_filter && chunk.getNumRows()) + { + auto key_columns = getColumnsByIndices(key_sample_block, chunk, key_column_indices); + ColumnPtr mask_col = set->execute(key_columns, false); + const auto & mask = assert_cast(mask_col.get())->getData(); + + stat.result_rows -= chunk.getNumRows(); + + Columns columns = chunk.detachColumns(); + size_t result_num_rows = 0; + for (auto & col : columns) + { + col = col->filter(mask, /* negative */ false); + result_num_rows = col->size(); + } + stat.result_rows += result_num_rows; + + chunk.setColumns(std::move(columns), result_num_rows); + } +} + +} diff --git a/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h new file mode 100644 index 00000000000..d214a310a8c --- /dev/null +++ b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h @@ -0,0 +1,114 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +struct SetWithState : public Set +{ + using Set::Set; + + /// Flow: Creating -> Finished or Suspended + enum class State + { + /// Set is not yet created, + /// Creating processor continues to build set. + /// Filtering bypasses data. + Creating, + + /// Set is finished. + /// Creating processor is finished. + /// Filtering filter stream with this set. + Finished, + + /// Set building is canceled (due to limit exceeded). + /// Creating and filtering processors bypass data. + Suspended, + }; + + std::atomic state = State::Creating; + + /// Track number of processors that are currently working on this set. + /// Last one finalizes set. + std::atomic_size_t finished_count = 0; +}; + +using SetWithStatePtr = std::shared_ptr; + +/* + * Create a set on the fly for incoming stream. + * The set is created from the key columns of the input block. + * Data is not changed and returned as is. + * Can be executed in parallel, but blocks on operations with set. + */ +class CreatingSetsOnTheFlyTransform : public ISimpleTransform +{ +public: + CreatingSetsOnTheFlyTransform(const Block & header_, const Names & column_names_, size_t num_streams_, SetWithStatePtr set_); + + String getName() const override { return "CreatingSetsOnTheFlyTransform"; } + + Status prepare() override; + + void transform(Chunk & chunk) override; + +private: + Names column_names; + std::vector key_column_indices; + + size_t num_streams; + + /// Set to fill + SetWithStatePtr set; + + Poco::Logger * log = &Poco::Logger::get("CreatingSetsOnTheFlyTransform"); +}; + +/* + * Filter the input chunk by the set. + * When set building is not completed, just return the source data. 
+ */ +class FilterBySetOnTheFlyTransform : public ISimpleTransform +{ +public: + FilterBySetOnTheFlyTransform(const Block & header_, const Names & column_names_, SetWithStatePtr set_); + + String getName() const override { return "FilterBySetOnTheFlyTransform"; } + + Status prepare() override; + + void transform(Chunk & chunk) override; + +private: + /// Set::execute requires ColumnsWithTypesAndNames, so we need to convert Chunk to that format + Block key_sample_block; + + Names column_names; + std::vector key_column_indices; + + /// Filter by this set when it's created + SetWithStatePtr set; + + /// Statistics to log + struct Stat + { + /// Total number of rows + size_t consumed_rows = 0; + + /// Number of bypassed rows (processed before set is created) + size_t consumed_rows_before_set = 0; + + /// Number of rows that passed the filter + size_t result_rows = 0; + } stat; + + Poco::Logger * log = &Poco::Logger::get("FilterBySetOnTheFlyTransform"); +}; + +} diff --git a/src/Processors/Transforms/IntersectOrExceptTransform.cpp b/src/Processors/Transforms/IntersectOrExceptTransform.cpp index 1ac82e99cf2..31a3e304505 100644 --- a/src/Processors/Transforms/IntersectOrExceptTransform.cpp +++ b/src/Processors/Transforms/IntersectOrExceptTransform.cpp @@ -111,7 +111,10 @@ size_t IntersectOrExceptTransform::buildFilter( for (size_t i = 0; i < rows; ++i) { auto find_result = state.findKey(method.data, i, variants.string_pool); - filter[i] = current_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT ? !find_result.isFound() : find_result.isFound(); + filter[i] = (current_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT_ALL + || current_operator == ASTSelectIntersectExceptQuery::Operator::EXCEPT_DISTINCT) + ? !find_result.isFound() + : find_result.isFound(); if (filter[i]) ++new_rows_num; } diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp index 6f842bec939..315fc4810ba 100644 --- a/src/Processors/Transforms/MergeJoinTransform.cpp +++ b/src/Processors/Transforms/MergeJoinTransform.cpp @@ -513,7 +513,7 @@ MergeJoinAlgorithm::Status MergeJoinAlgorithm::allJoin(JoinKind kind) Columns lcols; if (!left_to_right_key_remap.empty()) { - /// If we have remapped columns, then we need to get values from right columns insead of defaults + /// If we have remapped columns, then we need to get values from right columns instead of defaults const auto & indices = idx_map[0]; const auto & left_src = cursors[0]->getCurrent().getColumns(); diff --git a/src/QueryPipeline/Pipe.cpp b/src/QueryPipeline/Pipe.cpp index ae342abeea5..291739079a2 100644 --- a/src/QueryPipeline/Pipe.cpp +++ b/src/QueryPipeline/Pipe.cpp @@ -770,7 +770,7 @@ void Pipe::setSinks(const Pipe::ProcessorGetterWithStreamKind & getter) header.clear(); } -void Pipe::transform(const Transformer & transformer) +void Pipe::transform(const Transformer & transformer, bool check_ports) { if (output_ports.empty()) throw Exception("Cannot transform empty Pipe", ErrorCodes::LOGICAL_ERROR); @@ -784,6 +784,9 @@ void Pipe::transform(const Transformer & transformer) for (const auto & port : output_ports) { + if (!check_ports) + break; + if (!port->isConnected()) throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -799,6 +802,9 @@ void Pipe::transform(const Transformer & transformer) { for (const auto & port : processor->getInputs()) { + if (!check_ports) + break; + if (!port.isConnected()) throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -806,7 +812,7 @@ void Pipe::transform(const Transformer & 
transformer) processor->getName()); const auto * connected_processor = &port.getOutputPort().getProcessor(); - if (!set.contains(connected_processor)) + if (check_ports && !set.contains(connected_processor)) throw Exception( ErrorCodes::LOGICAL_ERROR, "Transformation of Pipe is not valid because processor {} has input port which is connected with unknown processor {}", @@ -823,7 +829,7 @@ void Pipe::transform(const Transformer & transformer) } const auto * connected_processor = &port.getInputPort().getProcessor(); - if (!set.contains(connected_processor)) + if (check_ports && !set.contains(connected_processor)) throw Exception( ErrorCodes::LOGICAL_ERROR, "Transformation of Pipe is not valid because processor {} has output port which is connected with unknown processor {}", diff --git a/src/QueryPipeline/Pipe.h b/src/QueryPipeline/Pipe.h index 52059f4ad19..79d19a18193 100644 --- a/src/QueryPipeline/Pipe.h +++ b/src/QueryPipeline/Pipe.h @@ -85,13 +85,13 @@ public: /// Add chain to every output port. void addChains(std::vector chains); - /// Changes the number of output ports if needed. Adds ResizeTransform. + /// Changes the number of output ports if needed. Adds (Strict)ResizeProcessor. void resize(size_t num_streams, bool force = false, bool strict = false); using Transformer = std::function; /// Transform Pipe in general way. - void transform(const Transformer & transformer); + void transform(const Transformer & transformer, bool check_ports = true); /// Unite several pipes together. They should have same header. static Pipe unitePipes(Pipes pipes); diff --git a/src/QueryPipeline/QueryPipelineBuilder.cpp b/src/QueryPipeline/QueryPipelineBuilder.cpp index 82907d883bc..877c19a493e 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.cpp +++ b/src/QueryPipeline/QueryPipelineBuilder.cpp @@ -159,10 +159,10 @@ void QueryPipelineBuilder::addChain(Chain chain) pipe.addChains(std::move(chains)); } -void QueryPipelineBuilder::transform(const Transformer & transformer) +void QueryPipelineBuilder::transform(const Transformer & transformer, bool check_ports) { checkInitializedAndNotCompleted(); - pipe.transform(transformer); + pipe.transform(transformer, check_ports); } void QueryPipelineBuilder::setSinks(const Pipe::ProcessorGetterWithStreamKind & getter) @@ -348,8 +348,7 @@ std::unique_ptr QueryPipelineBuilder::joinPipelinesYShaped left->pipe.dropExtremes(); right->pipe.dropExtremes(); - - if (left->pipe.output_ports.size() != 1 || right->pipe.output_ports.size() != 1) + if (left->getNumStreams() != 1 || right->getNumStreams() != 1) throw Exception("Join is supported only for pipelines with one output port", ErrorCodes::LOGICAL_ERROR); if (left->hasTotals() || right->hasTotals()) @@ -359,8 +358,7 @@ std::unique_ptr QueryPipelineBuilder::joinPipelinesYShaped auto joining = std::make_shared(join, inputs, out_header, max_block_size); - auto result = mergePipelines(std::move(left), std::move(right), std::move(joining), collected_processors); - return result; + return mergePipelines(std::move(left), std::move(right), std::move(joining), collected_processors); } std::unique_ptr QueryPipelineBuilder::joinPipelinesRightLeft( diff --git a/src/QueryPipeline/QueryPipelineBuilder.h b/src/QueryPipeline/QueryPipelineBuilder.h index 100a2e07341..4edae83fe86 100644 --- a/src/QueryPipeline/QueryPipelineBuilder.h +++ b/src/QueryPipeline/QueryPipelineBuilder.h @@ -69,7 +69,7 @@ public: using Transformer = std::function; /// Transform pipeline in general way. 
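As a usage note for the `check_ports` flag threaded through `Pipe::transform` and the builder method declared next: a transformer is a callback that receives the pipe's current output ports and returns the processors it created, and `check_ports = false` skips the connectivity assertions for processors whose ports (like the auxiliary ones above) are connected only later. A minimal sketch with a placeholder lambda and `pipeline` standing for a QueryPipelineBuilder:

pipeline.transform([](OutputPortRawPtrs ports) -> Processors
{
    Processors result;
    /// ... create processors, connect `ports` to their inputs, collect them into `result` ...
    return result;
}, /* check_ports= */ false);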
- void transform(const Transformer & transformer); + void transform(const Transformer & transformer, bool check_ports = true); /// Add TotalsHavingTransform. Resize pipeline to single input. Adds totals port. void addTotalsHavingTransform(ProcessorPtr transform); diff --git a/src/QueryPipeline/RemoteInserter.cpp b/src/QueryPipeline/RemoteInserter.cpp index cd0485ec8e3..b8a878b56c3 100644 --- a/src/QueryPipeline/RemoteInserter.cpp +++ b/src/QueryPipeline/RemoteInserter.cpp @@ -31,22 +31,6 @@ RemoteInserter::RemoteInserter( { ClientInfo modified_client_info = client_info_; modified_client_info.query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - if (CurrentThread::isInitialized()) - { - auto& thread_trace_context = CurrentThread::get().thread_trace_context; - - if (thread_trace_context.trace_id != UUID()) - { - // overwrite the trace context only if current thread trace context is available - modified_client_info.client_trace_context = thread_trace_context; - } - else - { - // if the trace on the thread local is not enabled(for example running in a background thread) - // we should not clear the trace context on the client info because the client info may hold trace context - // and this trace context should be propagated to the remote server so that the tracing of distributed table insert is complete. - } - } Settings settings = settings_; /// With current protocol it is impossible to avoid deadlock in case of send_logs_level!=none. diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 44e844fbe40..e42b0141a27 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -242,10 +242,6 @@ void RemoteQueryExecutor::sendQuery(ClientInfo::QueryKind query_kind) auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(settings); ClientInfo modified_client_info = context->getClientInfo(); modified_client_info.query_kind = query_kind; - if (CurrentThread::isInitialized()) - { - modified_client_info.client_trace_context = CurrentThread::get().thread_trace_context; - } { std::lock_guard lock(duplicated_part_uuids_mutex); diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp index 165dc7e1344..f9b3041ad40 100644 --- a/src/Server/GRPCServer.cpp +++ b/src/Server/GRPCServer.cpp @@ -662,6 +662,7 @@ namespace std::optional session; ContextMutablePtr query_context; std::optional query_scope; + OpenTelemetry::TracingContextHolderPtr thread_trace_context; String query_text; ASTPtr ast; ASTInsertQuery * insert_query = nullptr; @@ -840,6 +841,12 @@ namespace query_context->setCurrentQueryId(query_info.query_id()); query_scope.emplace(query_context); + /// Set up tracing context for this query on current thread + thread_trace_context = std::make_unique("GRPCServer", + query_context->getClientInfo().client_trace_context, + query_context->getSettingsRef(), + query_context->getOpenTelemetrySpanLog()); + /// Prepare for sending exceptions and logs. 
const Settings & settings = query_context->getSettingsRef(); send_exception_with_stacktrace = settings.calculate_text_stack_trace; @@ -1359,6 +1366,7 @@ namespace io = {}; query_scope.reset(); query_context.reset(); + thread_trace_context.reset(); session.reset(); } diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index 5b8e17eb279..442233ab408 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -534,19 +534,7 @@ void HTTPHandler::processQuery( session->makeSessionContext(session_id, session_timeout, session_check == "1"); } - // Parse the OpenTelemetry traceparent header. - ClientInfo client_info = session->getClientInfo(); - if (request.has("traceparent")) - { - std::string opentelemetry_traceparent = request.get("traceparent"); - std::string error; - if (!client_info.client_trace_context.parseTraceparentHeader(opentelemetry_traceparent, error)) - { - LOG_DEBUG(log, "Failed to parse OpenTelemetry traceparent header '{}': {}", opentelemetry_traceparent, error); - } - client_info.client_trace_context.tracestate = request.get("tracestate", ""); - } - + auto client_info = session->getClientInfo(); auto context = session->makeQueryContext(std::move(client_info)); /// The client can pass a HTTP header indicating supported compression method (gzip or deflate). @@ -945,6 +933,13 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse /// In case of exception, send stack trace to client. bool with_stacktrace = false; + OpenTelemetry::TracingContextHolderPtr thread_trace_context; + SCOPE_EXIT({ + // make sure the response status is recorded + if (thread_trace_context) + thread_trace_context->root_span.addAttribute("clickhouse.http_status", response.getStatus()); + }); + try { if (request.getMethod() == HTTPServerRequest::HTTP_OPTIONS) @@ -952,6 +947,28 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse processOptionsRequest(response, server.config()); return; } + + // Parse the OpenTelemetry traceparent header. + ClientInfo& client_info = session->getClientInfo(); + if (request.has("traceparent")) + { + std::string opentelemetry_traceparent = request.get("traceparent"); + std::string error; + if (!client_info.client_trace_context.parseTraceparentHeader(opentelemetry_traceparent, error)) + { + LOG_DEBUG(log, "Failed to parse OpenTelemetry traceparent header '{}': {}", opentelemetry_traceparent, error); + } + client_info.client_trace_context.tracestate = request.get("tracestate", ""); + } + + // Setup tracing context for this thread + auto context = session->sessionOrGlobalContext(); + thread_trace_context = std::make_unique("HTTPHandler", + client_info.client_trace_context, + context->getSettingsRef(), + context->getOpenTelemetrySpanLog()); + thread_trace_context->root_span.addAttribute("clickhouse.uri", request.getURI()); + response.setContentType("text/plain; charset=UTF-8"); response.set("X-ClickHouse-Server-Display-Name", server_display_name); /// For keep-alive to work. @@ -987,6 +1004,9 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse /// cannot write in it anymore. So, just log this exception. 
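The same tracing pattern recurs in the GRPCServer, HTTPHandler and TCPHandler hunks, so a condensed sketch may help (assumed names: `query_context` is the per-query context of whichever handler sets this up, the span name and attribute are illustrative). The handler creates a TracingContextHolder on the current thread from the client-supplied parent context, attaches attributes to its root span while the query runs, and resets the holder when the query finishes:

OpenTelemetry::TracingContextHolderPtr thread_trace_context
    = std::make_unique<OpenTelemetry::TracingContextHolder>(
        "SomeHandler",                                         /// span name, placeholder
        query_context->getClientInfo().client_trace_context,   /// parent trace parsed from the client
        query_context->getSettingsRef(),
        query_context->getOpenTelemetrySpanLog());

thread_trace_context->root_span.addAttribute("clickhouse.query_id", query_id);

/// ... execute the query; child spans (e.g. OpenTelemetry::SpanHolder) attach to this thread context ...

thread_trace_context.reset();   /// ends the root span and detaches the context from the thread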
if (used_output.isFinalized()) { + if (thread_trace_context) + thread_trace_context->root_span.addAttribute("clickhouse.exception", "Cannot flush data to client"); + tryLogCurrentException(log, "Cannot flush data to client"); return; } @@ -1000,6 +1020,9 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse int exception_code = getCurrentExceptionCode(); trySendExceptionToClient(exception_message, exception_code, request, response, used_output); + + if (thread_trace_context) + thread_trace_context->root_span.addAttribute("clickhouse.exception_code", exception_code); } used_output.finalize(); diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 1fc88168b35..550ae1bff31 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -219,6 +219,7 @@ void TCPHandler::runImpl() /// Initialized later. std::optional query_scope; + OpenTelemetry::TracingContextHolderPtr thread_trace_context; /** An exception during the execution of request (it must be sent over the network to the client). * The client will be able to accept it, if it did not happen while sending another packet and the client has not disconnected yet. @@ -244,6 +245,12 @@ void TCPHandler::runImpl() if (state.empty() && state.part_uuids_to_ignore && !receivePacket()) continue; + /// Set up tracing context for this query on current thread + thread_trace_context = std::make_unique("TCPHandler", + query_context->getClientInfo().client_trace_context, + query_context->getSettingsRef(), + query_context->getOpenTelemetrySpanLog()); + query_scope.emplace(query_context); /// If query received, then settings in query_context has been updated. @@ -419,6 +426,7 @@ void TCPHandler::runImpl() /// (i.e. deallocations from the Aggregator with two-level aggregation) state.reset(); query_scope.reset(); + thread_trace_context.reset(); } catch (const Exception & e) { @@ -484,6 +492,9 @@ void TCPHandler::runImpl() { if (exception) { + if (thread_trace_context) + thread_trace_context->root_span.addAttribute(*exception); + try { /// Try to send logs to client, but it could be risky too @@ -532,6 +543,7 @@ void TCPHandler::runImpl() /// (i.e. deallocations from the Aggregator with two-level aggregation) state.reset(); query_scope.reset(); + thread_trace_context.reset(); } catch (...) 
{ diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 7f99abf31fc..d370a67bfcc 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -1274,7 +1274,7 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const throw Exception{"Table doesn't have SAMPLE BY, cannot remove", ErrorCodes::BAD_ARGUMENTS}; } - /// Collect default expressions for MODIFY and ADD comands + /// Collect default expressions for MODIFY and ADD commands if (command.type == AlterCommand::MODIFY_COLUMN || command.type == AlterCommand::ADD_COLUMN) { if (command.default_expression) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 1a90312e076..d2490858a72 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -780,7 +780,7 @@ void ColumnsDescription::addSubcolumns(const String & name_in_storage, const Dat "Cannot add subcolumn {}: column with this name already exists", subcolumn.name); subcolumns.get<0>().insert(std::move(subcolumn)); - }, {type_in_storage->getDefaultSerialization(), type_in_storage, nullptr, nullptr}); + }, ISerialization::SubstreamData(type_in_storage->getDefaultSerialization()).withType(type_in_storage)); } void ColumnsDescription::removeSubcolumns(const String & name_in_storage) diff --git a/src/Storages/Distributed/DirectoryMonitor.cpp b/src/Storages/Distributed/DirectoryMonitor.cpp index ff5a38fcc52..16981d26146 100644 --- a/src/Storages/Distributed/DirectoryMonitor.cpp +++ b/src/Storages/Distributed/DirectoryMonitor.cpp @@ -609,6 +609,8 @@ bool StorageDistributedDirectoryMonitor::processFiles(const std::mapgetSettingsRef()); @@ -627,6 +629,10 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa formatReadableQuantity(distributed_header.rows), formatReadableSizeWithBinarySuffix(distributed_header.bytes)); + thread_trace_context = std::make_unique(__PRETTY_FUNCTION__, + distributed_header.client_info.client_trace_context, + this->storage.getContext()->getOpenTelemetrySpanLog()); + RemoteInserter remote{*connection, timeouts, distributed_header.insert_query, distributed_header.insert_settings, @@ -637,10 +643,20 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa } catch (Exception & e) { + if (thread_trace_context) + thread_trace_context->root_span.addAttribute(std::current_exception()); + e.addMessage(fmt::format("While sending {}", file_path)); maybeMarkAsBroken(file_path, e); throw; } + catch (...) 
+ { + if (thread_trace_context) + thread_trace_context->root_span.addAttribute(std::current_exception()); + + throw; + } auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path); markAsSend(file_path); @@ -854,6 +870,10 @@ private: ReadBufferFromFile in(file_path->second); const auto & distributed_header = readDistributedHeader(in, parent.log); + OpenTelemetry::TracingContextHolder thread_trace_context(__PRETTY_FUNCTION__, + distributed_header.client_info.client_trace_context, + parent.storage.getContext()->getOpenTelemetrySpanLog()); + if (!remote) { remote = std::make_unique(connection, timeouts, @@ -888,6 +908,11 @@ private: ReadBufferFromFile in(file_path->second); const auto & distributed_header = readDistributedHeader(in, parent.log); + // this function is called in a separated thread, so we set up the trace context from the file + OpenTelemetry::TracingContextHolder thread_trace_context(__PRETTY_FUNCTION__, + distributed_header.client_info.client_trace_context, + parent.storage.getContext()->getOpenTelemetrySpanLog()); + RemoteInserter remote(connection, timeouts, distributed_header.insert_query, distributed_header.insert_settings, diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 8ea2954368e..ae72fdd84e2 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -336,7 +336,7 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si if (rows == 0) return; - OpenTelemetrySpanHolder span(__PRETTY_FUNCTION__); + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("clickhouse.shard_num", shard_info.shard_num); span.addAttribute("clickhouse.written_rows", rows); @@ -419,7 +419,7 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si void DistributedSink::writeSync(const Block & block) { - OpenTelemetrySpanHolder span(__PRETTY_FUNCTION__); + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); const Settings & settings = context->getSettingsRef(); const auto & shards_info = cluster->getShardsInfo(); @@ -610,7 +610,7 @@ void DistributedSink::writeSplitAsync(const Block & block) void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) { - OpenTelemetrySpanHolder span("DistributedSink::writeAsyncImpl()"); + OpenTelemetry::SpanHolder span("DistributedSink::writeAsyncImpl()"); const auto & shard_info = cluster->getShardsInfo()[shard_id]; const auto & settings = context->getSettingsRef(); @@ -652,7 +652,7 @@ void DistributedSink::writeAsyncImpl(const Block & block, size_t shard_id) void DistributedSink::writeToLocal(const Block & block, size_t repeats) { - OpenTelemetrySpanHolder span(__PRETTY_FUNCTION__); + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); span.addAttribute("db.statement", this->query_string); InterpreterInsertQuery interp(query_ast, context, allow_materialized); @@ -668,7 +668,7 @@ void DistributedSink::writeToLocal(const Block & block, size_t repeats) void DistributedSink::writeToShard(const Block & block, const std::vector & dir_names) { - OpenTelemetrySpanHolder span(__PRETTY_FUNCTION__); + OpenTelemetry::SpanHolder span(__PRETTY_FUNCTION__); const auto & settings = context->getSettingsRef(); const auto & distributed_settings = storage.getDistributedSettingsRef(); @@ -737,11 +737,11 @@ void DistributedSink::writeToShard(const Block & block, const std::vectorgetSettingsRef().write(header_buf); - if 
(context->getClientInfo().client_trace_context.trace_id != UUID() && CurrentThread::isInitialized()) + if (OpenTelemetry::CurrentContext().isTraceEnabled()) { // if the distributed tracing is enabled, use the trace context in current thread as parent of next span auto client_info = context->getClientInfo(); - client_info.client_trace_context = CurrentThread::get().thread_trace_context; + client_info.client_trace_context = OpenTelemetry::CurrentContext(); client_info.write(header_buf, DBMS_TCP_PROTOCOL_VERSION); } else diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index fab810a1e49..4aebcd6f6ab 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -41,8 +41,9 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory(read_settings_.remote_fs_buffer_size) + size_t read_until_position_, + bool use_external_buffer_) + : BufferWithOwnMemory(use_external_buffer_ ? 0 : read_settings_.remote_fs_buffer_size) , hdfs_uri(hdfs_uri_) , hdfs_file_path(hdfs_file_path_) , builder(createHDFSBuilder(hdfs_uri_, config_)) @@ -132,10 +133,12 @@ ReadBufferFromHDFS::ReadBufferFromHDFS( const String & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, const ReadSettings & read_settings_, - size_t read_until_position_) + size_t read_until_position_, + bool use_external_buffer_) : ReadBufferFromFileBase(read_settings_.remote_fs_buffer_size, nullptr, 0) , impl(std::make_unique( - hdfs_uri_, hdfs_file_path_, config_, read_settings_, read_until_position_)) + hdfs_uri_, hdfs_file_path_, config_, read_settings_, read_until_position_, use_external_buffer_)) + , use_external_buffer(use_external_buffer_) { } @@ -146,7 +149,18 @@ size_t ReadBufferFromHDFS::getFileSize() bool ReadBufferFromHDFS::nextImpl() { - impl->position() = impl->buffer().begin() + offset(); + if (use_external_buffer) + { + impl->set(internal_buffer.begin(), internal_buffer.size()); + assert(working_buffer.begin() != nullptr); + assert(!internal_buffer.empty()); + } + else + { + impl->position() = impl->buffer().begin() + offset(); + assert(!impl->hasPendingData()); + } + auto result = impl->next(); if (result) diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.h b/src/Storages/HDFS/ReadBufferFromHDFS.h index 41493c31882..c3b859f0566 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromHDFS.h @@ -29,7 +29,8 @@ public: const String & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, const ReadSettings & read_settings_, - size_t read_until_position_ = 0); + size_t read_until_position_ = 0, + bool use_external_buffer = false); ~ReadBufferFromHDFS() override; @@ -49,6 +50,7 @@ public: private: std::unique_ptr impl; + bool use_external_buffer; }; } diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp index b51457ba5d5..06ce4fb308d 100644 --- a/src/Storages/Kafka/StorageKafka.cpp +++ b/src/Storages/Kafka/StorageKafka.cpp @@ -810,7 +810,7 @@ void registerStorageKafka(StorageFactory & factory) /** Arguments of engine is following: * - Kafka broker list * - List of topics - * - Group ID (may be a constaint expression with a string result) + * - Group ID (may be a constant expression with a string result) * - Message format (string) * - Row delimiter * - Schema (optional, if the format supports it) diff --git a/src/Storages/LiveView/StorageLiveView.cpp b/src/Storages/LiveView/StorageLiveView.cpp index 6a079aa832f..b37dec6cbf1 100644 --- 
a/src/Storages/LiveView/StorageLiveView.cpp +++ b/src/Storages/LiveView/StorageLiveView.cpp @@ -382,7 +382,7 @@ bool StorageLiveView::getNewBlocks() BlocksMetadataPtr new_blocks_metadata = std::make_shared(); /// can't set mergeable_blocks here or anywhere else outside the writeIntoLiveView function - /// as there could be a race codition when the new block has been inserted into + /// as there could be a race condition when the new block has been inserted into /// the source table by the PushingToViews chain and this method /// called before writeIntoLiveView function is called which can lead to /// the same block added twice to the mergeable_blocks leading to diff --git a/src/Storages/MergeTree/AlterConversions.h b/src/Storages/MergeTree/AlterConversions.h new file mode 100644 index 00000000000..0d58499d424 --- /dev/null +++ b/src/Storages/MergeTree/AlterConversions.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +/// Alter conversions which should be applied on-fly for part. Build from of +/// the most recent mutation commands for part. Now we have only rename_map +/// here (from ALTER_RENAME) command, because for all other type of alters +/// we can deduce conversions for part from difference between +/// part->getColumns() and storage->getColumns(). +struct AlterConversions +{ + /// Rename map new_name -> old_name + std::unordered_map rename_map; + + bool isColumnRenamed(const std::string & new_name) const { return rename_map.count(new_name) > 0; } + std::string getColumnOldName(const std::string & new_name) const { return rename_map.at(new_name); } +}; + +} diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp index 0154fd6e281..894eec12f0c 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp @@ -650,23 +650,31 @@ bool DataPartStorageOnDisk::shallParticipateInMerges(const IStoragePolicy & stor } void DataPartStorageOnDisk::backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; auto disk = volume->getDisk(); - auto temp_dir_it = temp_dirs.find(disk); - if (temp_dir_it == temp_dirs.end()) - temp_dir_it = temp_dirs.emplace(disk, std::make_shared(disk, "tmp/")).first; - auto temp_dir_owner = temp_dir_it->second; - fs::path temp_dir = temp_dir_owner->getPath(); - fs::path temp_part_dir = temp_dir / part_path_in_backup.relative_path(); - disk->createDirectories(temp_part_dir); + + fs::path temp_part_dir; + std::shared_ptr temp_dir_owner; + if (make_temporary_hard_links) + { + assert(temp_dirs); + auto temp_dir_it = temp_dirs->find(disk); + if (temp_dir_it == temp_dirs->end()) + temp_dir_it = temp_dirs->emplace(disk, std::make_shared(disk, "tmp/")).first; + temp_dir_owner = temp_dir_it->second; + fs::path temp_dir = temp_dir_owner->getPath(); + temp_part_dir = temp_dir / part_path_in_backup.relative_path(); + disk->createDirectories(temp_part_dir); + } /// For example, /// part_path_in_backup = /data/test/table/0_1_1_0 @@ -683,13 +691,18 @@ void DataPartStorageOnDisk::backup( continue; /// Skip *.proj files - they're actually directories and will be handled. 
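Aside: the new AlterConversions header above carries only a rename map (new name to old name). A minimal usage sketch; the column names are invented and the include path is assumed from the file location.

#include <Storages/MergeTree/AlterConversions.h>

#include <iostream>
#include <string>

void resolveNameForOldPart()
{
    DB::AlterConversions conversions;

    /// Recorded from e.g. ALTER TABLE t RENAME COLUMN hits TO hits_new (new_name -> old_name).
    conversions.rename_map["hits_new"] = "hits";

    if (conversions.isColumnRenamed("hits_new"))
    {
        /// Old parts still store the column under its previous name, so readers
        /// translate the requested name back before touching the part.
        std::cout << conversions.getColumnOldName("hits_new") << '\n';   // prints "hits"
    }
}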
String filepath_on_disk = part_path_on_disk / filepath; String filepath_in_backup = part_path_in_backup / filepath; - String hardlink_filepath = temp_part_dir / filepath; - disk->createHardLink(filepath_on_disk, hardlink_filepath); + if (make_temporary_hard_links) + { + String hardlink_filepath = temp_part_dir / filepath; + disk->createHardLink(filepath_on_disk, hardlink_filepath); + filepath_on_disk = hardlink_filepath; + } + UInt128 file_hash{checksum.file_hash.first, checksum.file_hash.second}; backup_entries.emplace_back( filepath_in_backup, - std::make_unique(disk, hardlink_filepath, checksum.file_size, file_hash, temp_dir_owner)); + std::make_unique(disk, filepath_on_disk, checksum.file_size, file_hash, temp_dir_owner)); } for (const auto & filepath : files_without_checksums) diff --git a/src/Storages/MergeTree/DataPartStorageOnDisk.h b/src/Storages/MergeTree/DataPartStorageOnDisk.h index 5b5cff8e636..f02ef26f811 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDisk.h +++ b/src/Storages/MergeTree/DataPartStorageOnDisk.h @@ -89,11 +89,12 @@ public: bool shallParticipateInMerges(const IStoragePolicy &) const override; void backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const override; + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const override; DataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 02821de8629..e10881c1eb3 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -399,7 +399,7 @@ MergeTreeData::DataPartPtr Service::findPart(const String & name) throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "No part {} in table", name); } -MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( +MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart( const StorageMetadataPtr & metadata_snapshot, ContextPtr context, const String & part_name, @@ -420,6 +420,11 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( if (blocker.isCancelled()) throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); + const auto data_settings = data.getSettings(); + + if (data.canUseZeroCopyReplication() && !try_zero_copy) + LOG_INFO(log, "Zero copy replication enabled, but trying to fetch part {} without zero copy", part_name); + /// It should be "tmp-fetch_" and not "tmp_fetch_", because we can fetch part to detached/, /// but detached part name prefix should not contain underscore. static const String TMP_PREFIX = "tmp-fetch_"; @@ -429,7 +434,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( /// Validation of the input that may come from malicious replica. 
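Aside: the reworked backup() above creates hard links only when make_temporary_hard_links is set; the links exist so that backup entries stay readable even if the original part directory disappears while the backup is still being written. A self-contained illustration of that property, with invented paths.

#include <filesystem>
#include <fstream>
#include <iostream>

int main()
{
    namespace fs = std::filesystem;
    fs::create_directories("part_dir");
    fs::create_directories("tmp_backup");

    { std::ofstream out("part_dir/data.bin"); out << "column data"; }

    // Same inode, second directory entry: this is what backup() creates in its temp dir.
    fs::remove("tmp_backup/data.bin");   // keep the example re-runnable
    fs::create_hard_link("part_dir/data.bin", "tmp_backup/data.bin");

    // The "part" is dropped while the backup is still in progress...
    fs::remove("part_dir/data.bin");

    // ...but the bytes remain reachable through the hard link.
    std::ifstream in("tmp_backup/data.bin");
    std::cout << in.rdbuf() << '\n';
    return 0;
}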
auto part_info = MergeTreePartInfo::fromPartName(part_name, data.format_version); - const auto data_settings = data.getSettings(); Poco::URI uri; uri.setScheme(interserver_scheme); @@ -465,6 +469,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( capability.push_back(toString(disk->getDataSourceDescription().type)); } } + if (!capability.empty()) { ::sort(capability.begin(), capability.end()); @@ -474,6 +479,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( } else { + if (data.canUseZeroCopyReplication()) + LOG_INFO(log, "Cannot select any zero-copy disk for {}", part_name); + try_zero_copy = false; } @@ -585,7 +593,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( temporary_directory_lock = {}; /// Try again but without zero-copy - return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts, + return fetchSelectedPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts, user, password, interserver_scheme, throttler, to_detached, tmp_prefix, nullptr, false, disk); } } @@ -773,6 +781,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( ThrottlerPtr throttler) { assert(!tmp_prefix.empty()); + const auto data_settings = data.getSettings(); /// We will remove directory if it's already exists. Make precautions. if (tmp_prefix.empty() //-V560 @@ -800,7 +809,14 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( { LOG_WARNING(log, "Directory {} already exists, probably result of a failed fetch. Will remove it before fetching part.", data_part_storage_builder->getFullPath()); - data_part_storage_builder->removeRecursive(); + + /// Even if it's a temporary part it could be downloaded with zero copy replication and this function + /// is executed as a callback. + /// + /// We don't control the amount of refs for temporary parts so we cannot decide can we remove blobs + /// or not. So we are not doing it + bool keep_shared = disk->supportZeroCopyReplication() && data_settings->allow_remote_fs_zero_copy_replication; + data_part_storage_builder->removeSharedRecursive(keep_shared); } data_part_storage_builder->createDirectories(); diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 0e19bf4cdcd..e2582c42dfb 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -66,7 +66,7 @@ public: explicit Fetcher(StorageReplicatedMergeTree & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {} /// Downloads a part to tmp_directory. If to_detached - downloads to the `detached` directory. - MergeTreeData::MutableDataPartPtr fetchPart( + MergeTreeData::MutableDataPartPtr fetchSelectedPart( const StorageMetadataPtr & metadata_snapshot, ContextPtr context, const String & part_name, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 946c1c5fd47..9da8a5eae03 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -177,11 +177,12 @@ public: /// Also creates a new tmp_dir for internal disk (if disk is mentioned the first time). 
using TemporaryFilesOnDisks = std::map>; virtual void backup( - TemporaryFilesOnDisks & temp_dirs, const MergeTreeDataPartChecksums & checksums, const NameSet & files_without_checksums, const String & path_in_backup, - BackupEntries & backup_entries) const = 0; + BackupEntries & backup_entries, + bool make_temporary_hard_links, + TemporaryFilesOnDisks * temp_dirs) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 32c2c09a392..e9d900c6d54 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -445,11 +445,11 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const column_name_to_position.clear(); column_name_to_position.reserve(new_columns.size()); size_t pos = 0; - for (const auto & column : columns) - column_name_to_position.emplace(column.name, pos++); for (const auto & column : columns) { + column_name_to_position.emplace(column.name, pos++); + auto it = serialization_infos.find(column.name); auto serialization = it == serialization_infos.end() ? IDataType::getSerialization(column) @@ -461,7 +461,7 @@ void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns, const { auto full_name = Nested::concatenateName(column.name, subname); serializations.emplace(full_name, subdata.serialization); - }, {serialization, nullptr, nullptr, nullptr}); + }, ISerialization::SubstreamData(serialization)); } columns_description = ColumnsDescription(columns); @@ -534,14 +534,11 @@ void IMergeTreeDataPart::removeIfNeeded() } catch (...) { + tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("while removing part {} with path {}", name, path)); + /// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime). /// If it's tmp_merge_ or tmp_fetch_, /// then all future attempts to execute part producing operation will fail with "directory already exists". - /// Seems like it's especially important for remote disks, because removal may fail due to network issues. 
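Aside: both here and in ColumnsDescription::addSubcolumns earlier in this patch, the positional initializer {serialization, type, nullptr, nullptr} gives way to a named builder. A hedged sketch of the construction; only the pieces visible in this diff are assumed, and members that are not set simply stay empty.

// Before: argument order had to be remembered and unused slots passed as nullptr.
// After: start from the serialization and attach only what the caller actually has.
auto subdata = ISerialization::SubstreamData(type_in_storage->getDefaultSerialization())
                   .withType(type_in_storage);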
- tryLogCurrentException(__PRETTY_FUNCTION__, "while removiong path: " + path); - assert(!is_temp); - assert(state != MergeTreeDataPartState::DeleteOnDestroy); - assert(state != MergeTreeDataPartState::Temporary); } } @@ -1355,7 +1352,6 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const } } - void IMergeTreeDataPart::appendFilesOfColumns(Strings & files) { files.push_back("columns.txt"); @@ -1412,7 +1408,10 @@ std::pair IMergeTreeDataPart::canRemovePart() const { /// NOTE: It's needed for zero-copy replication if (force_keep_shared_data) + { + LOG_DEBUG(storage.log, "Blobs for part {} cannot be removed because it's forced to be keeped", name); return std::make_pair(false, NameSet{}); + } return storage.unlockSharedData(*this); } @@ -1436,6 +1435,12 @@ void IMergeTreeDataPart::remove() const auto [can_remove, files_not_to_remove] = canRemovePart(); + if (!can_remove) + LOG_TRACE(storage.log, "Blobs of part {} cannot be removed", name); + + if (!files_not_to_remove.empty()) + LOG_TRACE(storage.log, "Some blobs ({}) of part {} cannot be removed", fmt::join(files_not_to_remove, ", "), name); + if (!isStoredOnDisk()) return; diff --git a/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h new file mode 100644 index 00000000000..28f834d661d --- /dev/null +++ b/src/Storages/MergeTree/IMergeTreeDataPartInfoForReader.h @@ -0,0 +1,68 @@ +#pragma once +#include +#include +#include + +namespace DB +{ + +class IDataPartStorage; +using DataPartStoragePtr = std::shared_ptr; +class MergeTreeIndexGranularity; +struct MergeTreeDataPartChecksums; +struct MergeTreeIndexGranularityInfo; +class ISerialization; +using SerializationPtr = std::shared_ptr; + +/** + * A class which contains all information about a data part that is required + * in order to use MergeTreeDataPartReader's. + * It is a separate interface and not a simple struct because + * otherwise it will need to copy all the information which might not + * be even used (for example, an IndexGranulary class object is quite heavy). 
+ */ +class IMergeTreeDataPartInfoForReader : public WithContext +{ +public: + explicit IMergeTreeDataPartInfoForReader(ContextPtr context_) : WithContext(context_) {} + + virtual ~IMergeTreeDataPartInfoForReader() = default; + + virtual bool isCompactPart() const = 0; + + virtual bool isWidePart() const = 0; + + virtual bool isInMemoryPart() const = 0; + + virtual bool isProjectionPart() const = 0; + + virtual const DataPartStoragePtr & getDataPartStorage() const = 0; + + virtual const NamesAndTypesList & getColumns() const = 0; + + virtual std::optional getColumnPosition(const String & column_name) const = 0; + + virtual String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const = 0; + + virtual const MergeTreeDataPartChecksums & getChecksums() const = 0; + + virtual AlterConversions getAlterConversions() const = 0; + + virtual size_t getMarksCount() const = 0; + + virtual size_t getFileSizeOrZero(const std::string & file_name) const = 0; + + virtual const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const = 0; + + virtual const MergeTreeIndexGranularity & getIndexGranularity() const = 0; + + virtual SerializationPtr getSerialization(const NameAndTypePair & column) const = 0; + + virtual const SerializationInfoByName & getSerializationInfos() const = 0; + + virtual void reportBroken() = 0; +}; + +using MergeTreeDataPartInfoForReaderPtr = std::shared_ptr; + +} diff --git a/src/Storages/MergeTree/IMergeTreeReader.cpp b/src/Storages/MergeTree/IMergeTreeReader.cpp index 8c861248580..8711664d531 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.cpp +++ b/src/Storages/MergeTree/IMergeTreeReader.cpp @@ -23,7 +23,7 @@ namespace ErrorCodes IMergeTreeReader::IMergeTreeReader( - const MergeTreeData::DataPartPtr & data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, const NamesAndTypesList & columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -31,19 +31,18 @@ IMergeTreeReader::IMergeTreeReader( const MarkRanges & all_mark_ranges_, const MergeTreeReaderSettings & settings_, const ValueSizeMap & avg_value_size_hints_) - : data_part(data_part_) + : data_part_info_for_read(data_part_info_for_read_) , avg_value_size_hints(avg_value_size_hints_) , uncompressed_cache(uncompressed_cache_) , mark_cache(mark_cache_) , settings(settings_) - , storage(data_part_->storage) , metadata_snapshot(metadata_snapshot_) , all_mark_ranges(all_mark_ranges_) - , alter_conversions(storage.getAlterConversionsForPart(data_part)) + , alter_conversions(data_part_info_for_read->getAlterConversions()) /// For wide parts convert plain arrays of Nested to subcolumns /// to allow to use shared offset column from cache. - , requested_columns(isWidePart(data_part) ? Nested::convertToSubcolumns(columns_) : columns_) - , part_columns(isWidePart(data_part) ? Nested::collect(data_part->getColumns()) : data_part->getColumns()) + , requested_columns(data_part_info_for_read->isWidePart() ? Nested::convertToSubcolumns(columns_) : columns_) + , part_columns(data_part_info_for_read->isWidePart() ? 
Nested::collect(data_part_info_for_read->getColumns()) : data_part_info_for_read->getColumns()) { columns_to_read.reserve(requested_columns.size()); serializations.reserve(requested_columns.size()); @@ -64,14 +63,20 @@ void IMergeTreeReader::fillMissingColumns(Columns & res_columns, bool & should_e { try { - DB::fillMissingColumns(res_columns, num_rows, requested_columns, metadata_snapshot); + NamesAndTypesList available_columns(columns_to_read.begin(), columns_to_read.end()); + DB::fillMissingColumns( + res_columns, num_rows, + Nested::convertToSubcolumns(requested_columns), + Nested::convertToSubcolumns(available_columns), + partially_read_columns, metadata_snapshot); + should_evaluate_missing_defaults = std::any_of( res_columns.begin(), res_columns.end(), [](const auto & column) { return column == nullptr; }); } catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } @@ -99,13 +104,13 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns } auto dag = DB::evaluateMissingDefaults( - additional_columns, requested_columns, metadata_snapshot->getColumns(), storage.getContext()); + additional_columns, requested_columns, metadata_snapshot->getColumns(), data_part_info_for_read->getContext()); if (dag) { dag->addMaterializingOutputActions(); auto actions = std::make_shared< ExpressionActions>(std::move(dag), - ExpressionActionsSettings::fromSettings(storage.getContext()->getSettingsRef())); + ExpressionActionsSettings::fromSettings(data_part_info_for_read->getContext()->getSettingsRef())); actions->execute(additional_columns); } @@ -117,7 +122,7 @@ void IMergeTreeReader::evaluateMissingDefaults(Block additional_columns, Columns catch (Exception & e) { /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } @@ -151,7 +156,7 @@ SerializationPtr IMergeTreeReader::getSerializationInPart(const NameAndTypePair if (!column_in_part) return IDataType::getSerialization(required_column); - const auto & infos = data_part->getSerializationInfos(); + const auto & infos = data_part_info_for_read->getSerializationInfos(); if (auto it = infos.find(column_in_part->getNameInStorage()); it != infos.end()) return IDataType::getSerialization(*column_in_part, *it->second); @@ -187,7 +192,7 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const copy_block.insert({res_columns[pos], getColumnInPart(*name_and_type).type, name_and_type->name}); } - DB::performRequiredConversions(copy_block, requested_columns, storage.getContext()); + DB::performRequiredConversions(copy_block, requested_columns, data_part_info_for_read->getContext()); /// Move columns from block. name_and_type = requested_columns.begin(); @@ -197,25 +202,61 @@ void IMergeTreeReader::performRequiredConversions(Columns & res_columns) const catch (Exception & e) { /// Better diagnostics. 
- e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + ")"); + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + ")"); throw; } } -IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const String & column_name) const +IMergeTreeReader::ColumnPosition IMergeTreeReader::findColumnForOffsets(const NameAndTypePair & required_column) const { - String table_name = Nested::extractTableName(column_name); - for (const auto & part_column : data_part->getColumns()) + auto get_offsets_streams = [](const auto & serialization, const auto & name_in_storage) { - if (typeid_cast(part_column.type.get())) + Names offsets_streams; + serialization->enumerateStreams([&](const auto & subpath) { - auto position = data_part->getColumnPosition(part_column.getNameInStorage()); - if (position && Nested::extractTableName(part_column.name) == table_name) - return position; + if (subpath.empty() || subpath.back().type != ISerialization::Substream::ArraySizes) + return; + + auto subname = ISerialization::getSubcolumnNameForStream(subpath); + auto full_name = Nested::concatenateName(name_in_storage, subname); + offsets_streams.push_back(full_name); + }); + + return offsets_streams; + }; + + auto required_name_in_storage = Nested::extractTableName(required_column.getNameInStorage()); + auto required_offsets_streams = get_offsets_streams(getSerializationInPart(required_column), required_name_in_storage); + + size_t max_matched_streams = 0; + ColumnPosition position; + + /// Find column that has maximal number of matching + /// offsets columns with required_column. + for (const auto & part_column : data_part_info_for_read->getColumns()) + { + auto name_in_storage = Nested::extractTableName(part_column.name); + if (name_in_storage != required_name_in_storage) + continue; + + auto offsets_streams = get_offsets_streams(data_part_info_for_read->getSerialization(part_column), name_in_storage); + NameSet offsets_streams_set(offsets_streams.begin(), offsets_streams.end()); + + size_t i = 0; + for (; i < required_offsets_streams.size(); ++i) + { + if (!offsets_streams_set.contains(required_offsets_streams[i])) + break; + } + + if (i && (!position || i > max_matched_streams)) + { + max_matched_streams = i; + position = data_part_info_for_read->getColumnPosition(part_column.name); } } - return {}; + return position; } void IMergeTreeReader::checkNumberOfColumns(size_t num_columns_to_read) const diff --git a/src/Storages/MergeTree/IMergeTreeReader.h b/src/Storages/MergeTree/IMergeTreeReader.h index 453563522a5..16db13692aa 100644 --- a/src/Storages/MergeTree/IMergeTreeReader.h +++ b/src/Storages/MergeTree/IMergeTreeReader.h @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace DB { @@ -20,7 +22,7 @@ public: using DeserializeBinaryBulkStateMap = std::map; IMergeTreeReader( - const MergeTreeData::DataPartPtr & data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, const NamesAndTypesList & columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -57,7 +59,7 @@ public: size_t getFirstMarkToRead() const { return all_mark_ranges.front().begin; } - MergeTreeData::DataPartPtr data_part; + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read; protected: /// Returns actual column name in part, which can differ from table metadata. 
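Aside: the rewritten findColumnForOffsets() above no longer takes the first column from the same Nested table; it scores every candidate by how many of the required column's offsets streams it can serve and keeps the best match. A toy, self-contained rendering of that rule with invented stream names.

#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

int main()
{
    // Offsets streams needed to read the requested (missing) column, outermost first.
    std::vector<std::string> required = {"n.size0", "n.a.size1"};

    // Candidate columns that physically exist in the part, with the offsets streams they provide.
    std::vector<std::pair<std::string, std::vector<std::string>>> candidates = {
        {"n.b", {"n.size0"}},                 // can serve only the outer offsets
        {"n.c", {"n.size0", "n.a.size1"}},    // can serve both levels -> preferred
    };

    size_t best_matched = 0;
    std::string best_column;

    for (const auto & [name, streams] : candidates)
    {
        std::unordered_set<std::string> available(streams.begin(), streams.end());

        size_t matched = 0;
        while (matched < required.size() && available.count(required[matched]))
            ++matched;

        if (matched > best_matched)   // "i && (!position || i > max_matched_streams)" in the patch
        {
            best_matched = matched;
            best_column = name;
        }
    }

    std::cout << "borrow offsets from: " << best_column << '\n';   // prints "n.c"
    return 0;
}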
@@ -86,16 +88,17 @@ protected: MergeTreeReaderSettings settings; - const MergeTreeData & storage; StorageMetadataPtr metadata_snapshot; MarkRanges all_mark_ranges; using ColumnPosition = std::optional; - ColumnPosition findColumnForOffsets(const String & column_name) const; + ColumnPosition findColumnForOffsets(const NameAndTypePair & column) const; + + NameSet partially_read_columns; private: /// Alter conversions, which must be applied on fly if required - MergeTreeData::AlterConversions alter_conversions; + AlterConversions alter_conversions; /// Columns that are requested to read. NamesAndTypesList requested_columns; diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 04007ce9356..f5eeb4ed35c 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -297,8 +297,10 @@ public: assert(indexes_mapping.size() == data_types.size()); for (size_t i = 0; i < indexes_mapping.size(); ++i) + { if (!candidate_set->areTypesEqual(indexes_mapping[i].tuple_index, data_types[i])) return false; + } return true; }; diff --git a/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h new file mode 100644 index 00000000000..a16aaa728ae --- /dev/null +++ b/src/Storages/MergeTree/LoadedMergeTreeDataPartInfoForReader.h @@ -0,0 +1,55 @@ +#pragma once +#include +#include + + +namespace DB +{ + +class LoadedMergeTreeDataPartInfoForReader final : public IMergeTreeDataPartInfoForReader +{ +public: + explicit LoadedMergeTreeDataPartInfoForReader(MergeTreeData::DataPartPtr data_part_) + : IMergeTreeDataPartInfoForReader(data_part_->storage.getContext()) + , data_part(data_part_) + {} + + bool isCompactPart() const override { return DB::isCompactPart(data_part); } + + bool isWidePart() const override { return DB::isWidePart(data_part); } + + bool isInMemoryPart() const override { return DB::isInMemoryPart(data_part); } + + bool isProjectionPart() const override { return data_part->isProjectionPart(); } + + const DataPartStoragePtr & getDataPartStorage() const override { return data_part->data_part_storage; } + + const NamesAndTypesList & getColumns() const override { return data_part->getColumns(); } + + std::optional getColumnPosition(const String & column_name) const override { return data_part->getColumnPosition(column_name); } + + AlterConversions getAlterConversions() const override { return data_part->storage.getAlterConversionsForPart(data_part); } + + String getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const override { return data_part->getColumnNameWithMinimumCompressedSize(with_subcolumns); } + + const MergeTreeDataPartChecksums & getChecksums() const override { return data_part->checksums; } + + void reportBroken() override { data_part->storage.reportBrokenPart(data_part); } + + size_t getMarksCount() const override { return data_part->getMarksCount(); } + + size_t getFileSizeOrZero(const std::string & file_name) const override { return data_part->getFileSizeOrZero(file_name); } + + const MergeTreeIndexGranularityInfo & getIndexGranularityInfo() const override { return data_part->index_granularity_info; } + + const MergeTreeIndexGranularity & getIndexGranularity() const override { return data_part->index_granularity; } + + const SerializationInfoByName & getSerializationInfos() const override { return data_part->getSerializationInfos(); } + + SerializationPtr getSerialization(const NameAndTypePair & column) const override { return 
data_part->getSerialization(column.name); } + +private: + MergeTreeData::DataPartPtr data_part; +}; + +} diff --git a/src/Storages/MergeTree/MarkRange.cpp b/src/Storages/MergeTree/MarkRange.cpp index 343c4ecaf22..903940efa94 100644 --- a/src/Storages/MergeTree/MarkRange.cpp +++ b/src/Storages/MergeTree/MarkRange.cpp @@ -36,4 +36,16 @@ size_t getLastMark(const MarkRanges & ranges) return current_task_last_mark; } +std::string toString(const MarkRanges & ranges) +{ + std::string result; + for (const auto & mark_range : ranges) + { + if (!result.empty()) + result += ", "; + result += "(" + std::to_string(mark_range.begin) + ", " + std::to_string(mark_range.end) + ")"; + } + return result; +} + } diff --git a/src/Storages/MergeTree/MarkRange.h b/src/Storages/MergeTree/MarkRange.h index 4f32be6ab14..fe02eb056b7 100644 --- a/src/Storages/MergeTree/MarkRange.h +++ b/src/Storages/MergeTree/MarkRange.h @@ -32,4 +32,6 @@ using MarkRanges = std::deque; */ size_t getLastMark(const MarkRanges & ranges); +std::string toString(const MarkRanges & ranges); + } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index a2f97d6bbc6..0fc888dd6ad 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -155,7 +155,7 @@ private: * We use boost::circular_buffer as a container for queues not to do any allocations. * * Another nuisance that we faces with is than background operations always interact with an associated Storage. - * So, when a Storage want to shutdown, it must wait until all its background operaions are finished. + * So, when a Storage want to shutdown, it must wait until all its background operations are finished. */ template class MergeTreeBackgroundExecutor final : boost::noncopyable diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index e2cd797ab92..475407a402b 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -43,6 +43,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( , storage(storage_) , storage_snapshot(storage_snapshot_) , prewhere_info(prewhere_info_) + , prewhere_actions(getPrewhereActions(prewhere_info, actions_settings)) , max_block_size_rows(max_block_size_rows_) , preferred_block_size_bytes(preferred_block_size_bytes_) , preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_) @@ -72,7 +73,12 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( header_without_virtual_columns.erase(*it); } } +} + +std::unique_ptr MergeTreeBaseSelectProcessor::getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings) +{ + std::unique_ptr prewhere_actions; if (prewhere_info) { prewhere_actions = std::make_unique(); @@ -100,6 +106,8 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( prewhere_actions->steps.emplace_back(std::move(prewhere_step)); } + + return prewhere_actions; } @@ -262,45 +270,62 @@ void MergeTreeBaseSelectProcessor::initializeMergeTreeReadersForPart( void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task) { - MergeTreeRangeReader* prev_reader = nullptr; + return initializeRangeReadersImpl( + current_task.range_reader, current_task.pre_range_readers, prewhere_info, prewhere_actions.get(), + reader.get(), current_task.data_part->hasLightweightDelete(), 
reader_settings, + pre_reader_for_step, lightweight_delete_filter_step, non_const_virtual_column_names); +} + +void MergeTreeBaseSelectProcessor::initializeRangeReadersImpl( + MergeTreeRangeReader & range_reader, std::deque & pre_range_readers, + PrewhereInfoPtr prewhere_info, const PrewhereExprInfo * prewhere_actions, + IMergeTreeReader * reader, bool has_lightweight_delete, const MergeTreeReaderSettings & reader_settings, + const std::vector> & pre_reader_for_step, + const PrewhereExprStep & lightweight_delete_filter_step, const Names & non_const_virtual_column_names) +{ + MergeTreeRangeReader * prev_reader = nullptr; bool last_reader = false; size_t pre_readers_shift = 0; /// Add filtering step with lightweight delete mask - if (reader_settings.apply_deleted_mask && current_task.data_part->hasLightweightDelete()) + if (reader_settings.apply_deleted_mask && has_lightweight_delete) { - current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names)); - prev_reader = ¤t_task.pre_range_readers.back(); + MergeTreeRangeReader pre_range_reader(pre_reader_for_step[0].get(), prev_reader, &lightweight_delete_filter_step, last_reader, non_const_virtual_column_names); + pre_range_readers.push_back(std::move(pre_range_reader)); + prev_reader = &pre_range_readers.back(); pre_readers_shift++; } if (prewhere_info) { if (prewhere_actions->steps.size() + pre_readers_shift != pre_reader_for_step.size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "PREWHERE steps count mismatch, actions: {}, readers: {}", - prewhere_actions->steps.size(), pre_reader_for_step.size()); + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "PREWHERE steps count mismatch, actions: {}, readers: {}", + prewhere_actions->steps.size(), pre_reader_for_step.size()); + } for (size_t i = 0; i < prewhere_actions->steps.size(); ++i) { last_reader = reader->getColumns().empty() && (i + 1 == prewhere_actions->steps.size()); - current_task.pre_range_readers.push_back( - MergeTreeRangeReader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names)); - prev_reader = ¤t_task.pre_range_readers.back(); + MergeTreeRangeReader current_reader(pre_reader_for_step[i + pre_readers_shift].get(), prev_reader, &prewhere_actions->steps[i], last_reader, non_const_virtual_column_names); + + pre_range_readers.push_back(std::move(current_reader)); + prev_reader = &pre_range_readers.back(); } } if (!last_reader) { - current_task.range_reader = MergeTreeRangeReader(reader.get(), prev_reader, nullptr, true, non_const_virtual_column_names); + range_reader = MergeTreeRangeReader(reader, prev_reader, nullptr, true, non_const_virtual_column_names); } else { /// If all columns are read by pre_range_readers than move last pre_range_reader into range_reader - current_task.range_reader = std::move(current_task.pre_range_readers.back()); - current_task.pre_range_readers.pop_back(); + range_reader = std::move(pre_range_readers.back()); + pre_range_readers.pop_back(); } } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index aa1b9d3541e..051854d8bc1 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -89,6 +89,20 @@ protected: static void injectVirtualColumns(Block & block, size_t row_count, MergeTreeReadTask * task, 
const DataTypePtr & partition_value_type, const Names & virtual_columns); + static std::unique_ptr getPrewhereActions(PrewhereInfoPtr prewhere_info, const ExpressionActionsSettings & actions_settings); + + static void initializeRangeReadersImpl( + MergeTreeRangeReader & range_reader, + std::deque & pre_range_readers, + PrewhereInfoPtr prewhere_info, + const PrewhereExprInfo * prewhere_actions, + IMergeTreeReader * reader, + bool has_lightweight_delete, + const MergeTreeReaderSettings & reader_settings, + const std::vector> & pre_reader_for_step, + const PrewhereExprStep & lightweight_delete_filter_step, + const Names & non_const_virtual_column_names); + /// Sets up data readers for each step of prewhere and where void initializeMergeTreeReadersForPart( MergeTreeData::DataPartPtr & data_part, diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index 62fac84fc36..c3f069498be 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -28,8 +29,8 @@ namespace bool injectRequiredColumnsRecursively( const String & column_name, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::AlterConversions & alter_conversions, - const MergeTreeData::DataPartPtr & part, + const AlterConversions & alter_conversions, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const GetColumnsOptions & options, Names & columns, NameSet & required_columns, @@ -47,7 +48,7 @@ bool injectRequiredColumnsRecursively( if (alter_conversions.isColumnRenamed(column_name_in_part)) column_name_in_part = alter_conversions.getColumnOldName(column_name_in_part); - auto column_in_part = part->getColumns().tryGetByName(column_name_in_part); + auto column_in_part = data_part_info_for_reader.getColumns().tryGetByName(column_name_in_part); if (column_in_part && (!column_in_storage->isSubcolumn() @@ -78,7 +79,7 @@ bool injectRequiredColumnsRecursively( bool result = false; for (const auto & identifier : identifiers) result |= injectRequiredColumnsRecursively( - identifier, storage_snapshot, alter_conversions, part, + identifier, storage_snapshot, alter_conversions, data_part_info_for_reader, options, columns, required_columns, injected_columns); return result; @@ -87,9 +88,8 @@ bool injectRequiredColumnsRecursively( } NameSet injectRequiredColumns( - const MergeTreeData & storage, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::DataPartPtr & part, bool with_subcolumns, Names & columns) { @@ -97,9 +97,9 @@ NameSet injectRequiredColumns( NameSet injected_columns; bool have_at_least_one_physical_column = false; - MergeTreeData::AlterConversions alter_conversions; - if (!part->isProjectionPart()) - alter_conversions = storage.getAlterConversionsForPart(part); + AlterConversions alter_conversions; + if (!data_part_info_for_reader.isProjectionPart()) + alter_conversions = data_part_info_for_reader.getAlterConversions(); auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical) .withExtendedObjects() @@ -115,7 +115,7 @@ NameSet injectRequiredColumns( have_at_least_one_physical_column |= injectRequiredColumnsRecursively( columns[i], storage_snapshot, alter_conversions, - part, options, columns, required_columns, injected_columns); + data_part_info_for_reader, options, columns, required_columns, injected_columns); } /** Add 
a column of the minimum size. @@ -124,7 +124,7 @@ NameSet injectRequiredColumns( */ if (!have_at_least_one_physical_column) { - const auto minimum_size_column_name = part->getColumnNameWithMinimumCompressedSize(with_subcolumns); + const auto minimum_size_column_name = data_part_info_for_reader.getColumnNameWithMinimumCompressedSize(with_subcolumns); columns.push_back(minimum_size_column_name); /// correctly report added column injected_columns.insert(columns.back()); @@ -135,13 +135,22 @@ NameSet injectRequiredColumns( MergeTreeReadTask::MergeTreeReadTask( - const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, size_t part_index_in_query_, - const Names & ordered_names_, const NameSet & column_name_set_, const MergeTreeReadTaskColumns & task_columns_, + const MergeTreeData::DataPartPtr & data_part_, + const MarkRanges & mark_ranges_, + size_t part_index_in_query_, + const Names & ordered_names_, + const NameSet & column_name_set_, + const MergeTreeReadTaskColumns & task_columns_, bool remove_prewhere_column_, MergeTreeBlockSizePredictorPtr && size_predictor_) - : data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_}, - ordered_names{ordered_names_}, column_name_set{column_name_set_}, task_columns{task_columns_}, - remove_prewhere_column{remove_prewhere_column_}, size_predictor{std::move(size_predictor_)} + : data_part{data_part_} + , mark_ranges{mark_ranges_} + , part_index_in_query{part_index_in_query_} + , ordered_names{ordered_names_} + , column_name_set{column_name_set_} + , task_columns{task_columns_} + , remove_prewhere_column{remove_prewhere_column_} + , size_predictor{std::move(size_predictor_)} { } @@ -270,9 +279,8 @@ void MergeTreeBlockSizePredictor::update(const Block & sample_block, const Colum MergeTreeReadTaskColumns getReadTaskColumns( - const MergeTreeData & storage, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, const StorageSnapshotPtr & storage_snapshot, - const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, const Names & system_columns, const PrewhereInfoPtr & prewhere_info, @@ -284,13 +292,13 @@ MergeTreeReadTaskColumns getReadTaskColumns( /// Read system columns such as lightweight delete mask "_row_exists" if it is persisted in the part for (const auto & name : system_columns) { - if (data_part->getColumns().contains(name)) + if (data_part_info_for_reader.getColumns().contains(name)) column_names.push_back(name); } /// inject columns required for defaults evaluation injectRequiredColumns( - storage, storage_snapshot, data_part, with_subcolumns, column_names); + data_part_info_for_reader, storage_snapshot, with_subcolumns, column_names); MergeTreeReadTaskColumns result; auto options = GetColumnsOptions(GetColumnsOptions::All) @@ -316,7 +324,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( Names all_pre_column_names = prewhere_info->prewhere_actions->getRequiredColumnsNames(); const auto injected_pre_columns = injectRequiredColumns( - storage, storage_snapshot, data_part, with_subcolumns, all_pre_column_names); + data_part_info_for_reader, storage_snapshot, with_subcolumns, all_pre_column_names); for (const auto & name : all_pre_column_names) { diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 5a36955b4d3..e1c06869bb7 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -12,6 +12,7 @@ namespace DB class MergeTreeData; struct 
MergeTreeReadTask; struct MergeTreeBlockSizePredictor; +class IMergeTreeDataPartInfoForReader; using MergeTreeReadTaskPtr = std::unique_ptr; using MergeTreeBlockSizePredictorPtr = std::shared_ptr; @@ -23,9 +24,8 @@ using MergeTreeBlockSizePredictorPtr = std::shared_ptr #include #include +#include #include #include #include @@ -1046,29 +1047,43 @@ void MergeTreeData::loadDataPartsFromDisk( throw; broken = true; - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path)); } catch (...) { broken = true; - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path)); } /// Ignore broken parts that can appear as a result of hard server restart. if (broken) { - /// NOTE: getBytesOnDisk() cannot be used here, since it maybe zero of checksums.txt will not exist - size_t size_of_part = data_part_storage->calculateTotalSizeOnDisk(); + std::optional size_of_part; + try + { + /// NOTE: getBytesOnDisk() cannot be used here, since it maybe zero of checksums.txt will not exist + size_of_part = data_part_storage->calculateTotalSizeOnDisk(); + } + catch (...) + { + tryLogCurrentException(log, fmt::format("while calculating part size {} on path {}", part->name, part_path)); + } + + std::string part_size_str = "failed to calculate size"; + if (size_of_part.has_value()) + part_size_str = formatReadableSizeWithBinarySuffix(*size_of_part); + LOG_ERROR(log, "Detaching broken part {}{} (size: {}). " - "If it happened after update, it is likely because of backward incompability. " + "If it happened after update, it is likely because of backward incompatibility. " "You need to resolve this manually", - getFullPathOnDisk(part_disk_ptr), part_name, formatReadableSizeWithBinarySuffix(size_of_part)); + getFullPathOnDisk(part_disk_ptr), part_name, part_size_str); std::lock_guard loading_lock(mutex); broken_parts_to_detach.push_back(part); ++suspicious_broken_parts; - suspicious_broken_parts_bytes += size_of_part; + if (size_of_part.has_value()) + suspicious_broken_parts_bytes += *size_of_part; return; } if (!part->index_granularity_info.is_adaptive) @@ -1177,14 +1192,10 @@ void MergeTreeData::loadDataPartsFromDisk( void MergeTreeData::loadDataPartsFromWAL( DataPartsVector & /* broken_parts_to_detach */, DataPartsVector & duplicate_parts_to_remove, - MutableDataPartsVector & parts_from_wal, - DataPartsLock & part_lock) + MutableDataPartsVector & parts_from_wal) { for (auto & part : parts_from_wal) { - if (getActiveContainingPart(part->info, DataPartState::Active, part_lock)) - continue; - part->modification_time = time(nullptr); /// Assume that all parts are Active, covered parts will be detected and marked as Outdated later part->setState(DataPartState::Active); @@ -1212,7 +1223,6 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) auto metadata_snapshot = getInMemoryMetadataPtr(); const auto settings = getSettings(); - MutableDataPartsVector parts_from_wal; Strings part_file_names; auto disks = getStoragePolicy()->getDisks(); @@ -1269,16 +1279,14 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) /// Collect part names by disk. 
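Aside: when a broken part is found above, the size calculation itself is now treated as fallible: the size becomes optional, and a failure only changes the log message instead of aborting the load. A minimal standalone sketch of the pattern, with the failing call simulated.

#include <iostream>
#include <optional>
#include <stdexcept>
#include <string>

std::optional<size_t> calculateSizeOrNullopt()
{
    try
    {
        throw std::runtime_error("I/O error");   // stands in for calculateTotalSizeOnDisk()
    }
    catch (...)
    {
        return std::nullopt;   // the caller logs and carries on
    }
}

int main()
{
    auto size = calculateSizeOrNullopt();
    std::string size_str = size ? std::to_string(*size) + " bytes" : "failed to calculate size";
    std::cout << "Detaching broken part (size: " << size_str << ")\n";
    return 0;
}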
std::map>> disk_part_map; - std::map disk_wal_part_map; ThreadPool pool(disks.size()); - std::mutex wal_init_lock; + for (const auto & disk_ptr : disks) { if (disk_ptr->isBroken()) continue; auto & disk_parts = disk_part_map[disk_ptr->getName()]; - auto & disk_wal_parts = disk_wal_part_map[disk_ptr->getName()]; pool.scheduleOrThrowOnError([&, disk_ptr]() { @@ -1291,34 +1299,11 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) if (!startsWith(it->name(), MergeTreeWriteAheadLog::WAL_FILE_NAME)) disk_parts.emplace_back(std::make_pair(it->name(), disk_ptr)); - else if (it->name() == MergeTreeWriteAheadLog::DEFAULT_WAL_FILE_NAME && settings->in_memory_parts_enable_wal) - { - std::lock_guard lock(wal_init_lock); - if (write_ahead_log != nullptr) - throw Exception( - "There are multiple WAL files appeared in current storage policy. You need to resolve this manually", - ErrorCodes::CORRUPTED_DATA); - - write_ahead_log = std::make_shared(*this, disk_ptr, it->name()); - for (auto && part : write_ahead_log->restore(metadata_snapshot, getContext())) - disk_wal_parts.push_back(std::move(part)); - } - else if (settings->in_memory_parts_enable_wal) - { - MergeTreeWriteAheadLog wal(*this, disk_ptr, it->name()); - for (auto && part : wal.restore(metadata_snapshot, getContext())) - disk_wal_parts.push_back(std::move(part)); - } } }); } - pool.wait(); - for (auto & [_, disk_wal_parts] : disk_wal_part_map) - parts_from_wal.insert( - parts_from_wal.end(), std::make_move_iterator(disk_wal_parts.begin()), std::make_move_iterator(disk_wal_parts.end())); - size_t num_parts = 0; std::queue>> parts_queue; for (auto & [_, disk_parts] : disk_part_map) @@ -1332,13 +1317,6 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) auto part_lock = lockParts(); data_parts_indexes.clear(); - if (num_parts == 0 && parts_from_wal.empty()) - { - resetObjectColumnsFromActiveParts(part_lock); - LOG_DEBUG(log, "There are no data parts"); - return; - } - DataPartsVector broken_parts_to_detach; DataPartsVector duplicate_parts_to_remove; @@ -1346,8 +1324,65 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) loadDataPartsFromDisk( broken_parts_to_detach, duplicate_parts_to_remove, pool, num_parts, parts_queue, skip_sanity_checks, settings); - if (!parts_from_wal.empty()) - loadDataPartsFromWAL(broken_parts_to_detach, duplicate_parts_to_remove, parts_from_wal, part_lock); + if (settings->in_memory_parts_enable_wal) + { + std::map disk_wal_part_map; + + std::mutex wal_init_lock; + for (const auto & disk_ptr : disks) + { + if (disk_ptr->isBroken()) + continue; + + auto & disk_wal_parts = disk_wal_part_map[disk_ptr->getName()]; + + pool.scheduleOrThrowOnError([&, disk_ptr]() + { + for (auto it = disk_ptr->iterateDirectory(relative_data_path); it->isValid(); it->next()) + { + if (!startsWith(it->name(), MergeTreeWriteAheadLog::WAL_FILE_NAME)) + continue; + + if (it->name() == MergeTreeWriteAheadLog::DEFAULT_WAL_FILE_NAME) + { + std::lock_guard lock(wal_init_lock); + if (write_ahead_log != nullptr) + throw Exception( + "There are multiple WAL files appeared in current storage policy. 
You need to resolve this manually", + ErrorCodes::CORRUPTED_DATA); + + write_ahead_log = std::make_shared(*this, disk_ptr, it->name()); + for (auto && part : write_ahead_log->restore(metadata_snapshot, getContext(), part_lock)) + disk_wal_parts.push_back(std::move(part)); + } + else + { + MergeTreeWriteAheadLog wal(*this, disk_ptr, it->name()); + for (auto && part : wal.restore(metadata_snapshot, getContext(), part_lock)) + disk_wal_parts.push_back(std::move(part)); + } + } + }); + } + + pool.wait(); + + MutableDataPartsVector parts_from_wal; + for (auto & [_, disk_wal_parts] : disk_wal_part_map) + parts_from_wal.insert( + parts_from_wal.end(), std::make_move_iterator(disk_wal_parts.begin()), std::make_move_iterator(disk_wal_parts.end())); + + loadDataPartsFromWAL(broken_parts_to_detach, duplicate_parts_to_remove, parts_from_wal); + + num_parts += parts_from_wal.size(); + } + + if (num_parts == 0) + { + resetObjectColumnsFromActiveParts(part_lock); + LOG_DEBUG(log, "There are no data parts"); + return; + } for (auto & part : broken_parts_to_detach) { @@ -1410,7 +1445,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks) continue; } - /// Check if CSNs were witten after committing transaction, update and write if needed. + /// Check if CSNs were written after committing transaction, update and write if needed. bool version_updated = false; chassert(!version.creation_tid.isEmpty()); if (!part->version.creation_csn) @@ -1592,7 +1627,21 @@ size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lif else { LOG_WARNING(log, "Removing temporary directory {}", full_path); - disk->removeRecursive(it->path()); + + /// Even if it's a temporary part it could be downloaded with zero copy replication and this function + /// is executed as a callback. + /// + /// We don't control the amount of refs for temporary parts so we cannot decide can we remove blobs + /// or not. So we are not doing it + bool keep_shared = false; + if (it->path().find("fetch") != std::string::npos) + { + keep_shared = disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication; + if (keep_shared) + LOG_WARNING(log, "Since zero-copy replication is enabled we are not going to remove blobs from shared storage for {}", full_path); + } + + disk->removeSharedRecursive(it->path(), keep_shared, {}); ++cleared_count; } } @@ -1819,18 +1868,18 @@ size_t MergeTreeData::clearOldPartsFromFilesystem(bool force) void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool throw_on_error, NameSet * parts_failed_to_delete) { - NameSet part_names_successeded; + NameSet part_names_succeed; - auto get_failed_parts = [&part_names_successeded, &parts_failed_to_delete, &parts] () + auto get_failed_parts = [&part_names_succeed, &parts_failed_to_delete, &parts] () { - if (part_names_successeded.size() == parts.size()) + if (part_names_succeed.size() == parts.size()) return; if (parts_failed_to_delete) { for (const auto & part : parts) { - if (!part_names_successeded.contains(part->name)) + if (!part_names_succeed.contains(part->name)) parts_failed_to_delete->insert(part->name); } } @@ -1838,7 +1887,7 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool try { - clearPartsFromFilesystemImpl(parts, &part_names_successeded); + clearPartsFromFilesystemImpl(parts, &part_names_succeed); get_failed_parts(); } catch (...) 
@@ -1850,10 +1899,12 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool } } -void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_to_remove, NameSet * part_names_successed) +void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_to_remove, NameSet * part_names_succeed) { const auto settings = getSettings(); - if (parts_to_remove.size() > 1 && settings->max_part_removal_threads > 1 && parts_to_remove.size() > settings->concurrent_part_removal_threshold) + if (parts_to_remove.size() > 1 + && settings->max_part_removal_threads > 1 + && parts_to_remove.size() > settings->concurrent_part_removal_threshold) { /// Parallel parts removal. size_t num_threads = std::min(settings->max_part_removal_threads, parts_to_remove.size()); @@ -1868,12 +1919,12 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t if (thread_group) CurrentThread::attachToIfDetached(thread_group); - LOG_DEBUG(log, "Removing part from filesystem {}", part->name); + LOG_DEBUG(log, "Removing part from filesystem {} (concurrently)", part->name); part->remove(); - if (part_names_successed) + if (part_names_succeed) { std::lock_guard lock(part_names_mutex); - part_names_successed->insert(part->name); + part_names_succeed->insert(part->name); } }); } @@ -1886,13 +1937,13 @@ void MergeTreeData::clearPartsFromFilesystemImpl(const DataPartsVector & parts_t { LOG_DEBUG(log, "Removing part from filesystem {}", part->name); part->remove(); - if (part_names_successed) - part_names_successed->insert(part->name); + if (part_names_succeed) + part_names_succeed->insert(part->name); } } } -size_t MergeTreeData::clearOldBrokenPartsFromDetachedDirecory() +size_t MergeTreeData::clearOldBrokenPartsFromDetachedDirectory() { /** * Remove old (configured by setting) broken detached parts. @@ -2045,7 +2096,7 @@ void MergeTreeData::rename(const String & new_table_path, const StorageID & new_ { /// Relies on storage path, so we drop it during rename - /// it will be recreated automatiaclly. + /// it will be recreated automatically. std::lock_guard wal_lock(write_ahead_log_mutex); if (write_ahead_log) { @@ -3880,7 +3931,7 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String throw Exception("Volume " + name + " does not exists on policy " + getStoragePolicy()->getName(), ErrorCodes::UNKNOWN_DISK); if (parts.empty()) - throw Exception("Nothing to move (сheck that the partition exists).", ErrorCodes::NO_SUCH_DATA_PART); + throw Exception("Nothing to move (check that the partition exists).", ErrorCodes::NO_SUCH_DATA_PART); std::erase_if(parts, [&](auto part_ptr) { @@ -4059,29 +4110,74 @@ void MergeTreeData::backupData(BackupEntriesCollector & backup_entries_collector else data_parts = getVisibleDataPartsVector(local_context); - backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup)); + backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, local_context)); } -BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup) +BackupEntries MergeTreeData::backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context) { BackupEntries backup_entries; std::map> temp_dirs; + TableLockHolder table_lock; for (const auto & part : data_parts) { + /// Hard links is the default way to ensure that we'll be keeping access to the files of parts. 
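Note on the clearPartsFromFilesystemImpl change earlier in this hunk: removal goes parallel only above concurrent_part_removal_threshold, and the names of successfully removed parts are collected under a mutex so the caller can report the ones that failed. A self-contained sketch of that shape, assuming nothing beyond the standard library (the setting names are reused purely as local variables):

    #include <algorithm>
    #include <iostream>
    #include <mutex>
    #include <set>
    #include <string>
    #include <thread>
    #include <vector>

    int main()
    {
        const std::vector<std::string> parts_to_remove = {"all_1_1_0", "all_2_2_0", "all_3_3_0", "all_4_4_0"};
        const size_t concurrent_part_removal_threshold = 2;
        const size_t max_part_removal_threads = 2;

        std::set<std::string> part_names_succeed;
        std::mutex part_names_mutex;

        auto remove_one = [&](const std::string & name)
        {
            // Real code would delete files here; the sketch only records success.
            std::lock_guard<std::mutex> lock(part_names_mutex);
            part_names_succeed.insert(name);
        };

        if (parts_to_remove.size() > concurrent_part_removal_threshold)
        {
            // Parallel removal, capped by the configured number of threads.
            size_t num_threads = std::min(max_part_removal_threads, parts_to_remove.size());
            std::vector<std::thread> workers;
            for (size_t t = 0; t < num_threads; ++t)
                workers.emplace_back([&, t]
                {
                    for (size_t i = t; i < parts_to_remove.size(); i += num_threads)
                        remove_one(parts_to_remove[i]);
                });
            for (auto & w : workers)
                w.join();
        }
        else
        {
            for (const auto & name : parts_to_remove)
                remove_one(name);
        }

        // Anything not in part_names_succeed is reported as failed to the caller.
        for (const auto & name : parts_to_remove)
            if (!part_names_succeed.count(name))
                std::cout << "failed to remove " << name << '\n';
        std::cout << "removed " << part_names_succeed.size() << " of " << parts_to_remove.size() << " parts\n";
    }
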
+ bool make_temporary_hard_links = true; + bool hold_storage_and_part_ptrs = false; + bool hold_table_lock = false; + + if (getStorageID().hasUUID()) + { + /// Tables in atomic databases have UUIDs. When using atomic database we don't have to create hard links to make a backup, + /// we can just hold smart pointers to a storage and to data parts instead. That's enough to protect those files from deleting + /// until the backup is done (see the calls `part.unique()` in grabOldParts() and table.unique() in DatabaseCatalog). + make_temporary_hard_links = false; + hold_storage_and_part_ptrs = true; + } + else if (supportsReplication() && part->data_part_storage->supportZeroCopyReplication() && getSettings()->allow_remote_fs_zero_copy_replication) + { + /// Hard links don't work correctly with zero copy replication. + make_temporary_hard_links = false; + hold_storage_and_part_ptrs = true; + hold_table_lock = true; + } + + if (hold_table_lock && !table_lock) + table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); + + BackupEntries backup_entries_from_part; part->data_part_storage->backup( - temp_dirs, part->checksums, part->getFileNamesWithoutChecksums(), data_path_in_backup, backup_entries); + part->checksums, + part->getFileNamesWithoutChecksums(), + data_path_in_backup, + backup_entries_from_part, + make_temporary_hard_links, + &temp_dirs); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) { projection_part->data_part_storage->backup( - temp_dirs, projection_part->checksums, projection_part->getFileNamesWithoutChecksums(), fs::path{data_path_in_backup} / part->name, - backup_entries); + backup_entries_from_part, + make_temporary_hard_links, + &temp_dirs); } + + if (hold_storage_and_part_ptrs) + { + /// Wrap backup entries with smart pointers to data parts and to the storage itself + /// (we'll be holding those smart pointers for as long as we'll be using the backup entries). + auto storage_and_part = std::make_pair(shared_from_this(), part); + if (hold_table_lock) + wrapBackupEntriesWith(backup_entries_from_part, std::make_pair(storage_and_part, table_lock)); + else + wrapBackupEntriesWith(backup_entries_from_part, storage_and_part); + } + + insertAtEnd(backup_entries, std::move(backup_entries_from_part)); } return backup_entries; @@ -4975,6 +5071,8 @@ void MergeTreeData::Transaction::rollbackPartsToTemporaryState() void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part, DataPartStorageBuilderPtr builder) { precommitted_parts.insert(part); + if (asInMemoryPart(part)) + has_in_memory_parts = true; part_builders.push_back(builder); } @@ -4997,6 +5095,12 @@ void MergeTreeData::Transaction::rollback() clear(); } +void MergeTreeData::Transaction::clear() +{ + precommitted_parts.clear(); + has_in_memory_parts = false; +} + MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData::DataPartsLock * acquired_parts_lock) { DataPartsVector total_covered_parts; @@ -5004,20 +5108,30 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: if (!isEmpty()) { auto settings = data.getSettings(); - MergeTreeData::WriteAheadLogPtr wal; auto parts_lock = acquired_parts_lock ? MergeTreeData::DataPartsLock() : data.lockParts(); auto * owing_parts_lock = acquired_parts_lock ? 
acquired_parts_lock : &parts_lock; for (auto & builder : part_builders) builder->commit(); - if (txn) + bool commit_to_wal = has_in_memory_parts && settings->in_memory_parts_enable_wal; + if (txn || commit_to_wal) { + MergeTreeData::WriteAheadLogPtr wal; + if (commit_to_wal) + wal = data.getWriteAheadLog(); + for (const DataPartPtr & part : precommitted_parts) { - DataPartPtr covering_part; - DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock); - MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn); + if (txn) + { + DataPartPtr covering_part; + DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock); + MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn); + } + + if (auto part_in_memory = asInMemoryPart(part)) + wal->addPart(part_in_memory); } } @@ -5034,15 +5148,6 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: for (const DataPartPtr & part : precommitted_parts) { - auto part_in_memory = asInMemoryPart(part); - if (part_in_memory && settings->in_memory_parts_enable_wal) - { - if (!wal) - wal = data.getWriteAheadLog(); - - wal->addPart(part_in_memory); - } - DataPartPtr covering_part; DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock); if (covering_part) @@ -5541,6 +5646,10 @@ std::optional MergeTreeData::getQueryProcessingStageWithAgg if (select_query->interpolate() && !select_query->interpolate()->children.empty()) return std::nullopt; + // Currently projections don't support GROUPING SET yet. + if (select_query->group_by_with_grouping_sets) + return std::nullopt; + auto query_options = SelectQueryOptions( QueryProcessingStage::WithMergeableState, /* depth */ 1, @@ -6234,7 +6343,7 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher( { // Store metadata for replicated table. - // Do nothing for non-replocated. + // Do nothing for non-replicated. createAndStoreFreezeMetadata(disk, part, fs::path(backup_part_path) / part->data_part_storage->getPartDirectory()); }; @@ -6547,7 +6656,7 @@ bool MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & moving_tagge auto disk = moving_part.reserved_space->getDisk(); if (supportsReplication() && disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) { - /// If we acuqired lock than let's try to move. After one + /// If we acquired lock than let's try to move. After one /// replica will actually move the part from disk to some /// zero-copy storage other replicas will just fetch /// metainformation. 
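Note on the backupParts change above: when temporary hard links are not used (tables with UUIDs, or zero-copy replication), the backup entries are wrapped so that they co-own the part and the storage, which keeps the underlying files alive until the backup finishes. A rough sketch of that lifetime-extension trick, with BackupEntry reduced to a plain callable and every name a stand-in:

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <string>
    #include <vector>

    // A backup entry is reduced here to a callable producing the bytes to write.
    using BackupEntry = std::function<std::string()>;

    struct Part
    {
        std::string name;
        ~Part() { std::cout << "part " << name << " released\n"; }
    };

    // Re-wrap each entry so it also owns a reference to the part: the part cannot
    // be removed while any wrapped entry is still alive. This mirrors the
    // "hold smart pointers instead of making hard links" path of the hunk.
    void wrapBackupEntriesWith(std::vector<BackupEntry> & entries, std::shared_ptr<Part> holder)
    {
        for (auto & entry : entries)
            entry = [inner = std::move(entry), holder] { return inner(); };
    }

    int main()
    {
        std::vector<BackupEntry> backup_entries;
        {
            auto part = std::make_shared<Part>();
            part->name = "all_1_1_0";
            backup_entries.push_back([name = part->name] { return "data of " + name; });
            wrapBackupEntriesWith(backup_entries, part);
        }   // the local reference is gone, but the wrapped entry keeps the part alive

        std::cout << backup_entries.front()() << '\n';
        backup_entries.clear();   // only now is the part released
    }
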
@@ -6619,7 +6728,7 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S return true; } -MergeTreeData::AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const +AlterConversions MergeTreeData::getAlterConversionsForPart(const MergeTreeDataPartPtr part) const { MutationCommands commands = getFirstAlterMutationCommandsForPart(part); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 68ec191412b..3a35daf4c90 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -167,20 +168,6 @@ public: STRONG_TYPEDEF(String, PartitionID) - /// Alter conversions which should be applied on-fly for part. Build from of - /// the most recent mutation commands for part. Now we have only rename_map - /// here (from ALTER_RENAME) command, because for all other type of alters - /// we can deduce conversions for part from difference between - /// part->getColumns() and storage->getColumns(). - struct AlterConversions - { - /// Rename map new_name -> old_name - std::unordered_map rename_map; - - bool isColumnRenamed(const String & new_name) const { return rename_map.count(new_name) > 0; } - String getColumnOldName(const String & new_name) const { return rename_map.at(new_name); } - }; - struct LessDataPart { using is_transparent = void; @@ -290,8 +277,9 @@ public: DataParts precommitted_parts; std::vector part_builders; DataParts locked_parts; + bool has_in_memory_parts = false; - void clear() { precommitted_parts.clear(); } + void clear(); }; using TransactionUniquePtr = std::unique_ptr; @@ -634,7 +622,7 @@ public: /// Delete WAL files containing parts, that all already stored on disk. size_t clearOldWriteAheadLogs(); - size_t clearOldBrokenPartsFromDetachedDirecory(); + size_t clearOldBrokenPartsFromDetachedDirectory(); /// Delete all directories which names begin with "tmp" /// Must be called with locked lockForShare() because it's using relative_data_path. @@ -761,7 +749,7 @@ public: const ColumnsDescription & getObjectColumns() const { return object_columns; } - /// Creates desciprion of columns of data type Object from the range of data parts. + /// Creates description of columns of data type Object from the range of data parts. static ColumnsDescription getObjectColumns( const DataPartsVector & parts, const ColumnsDescription & storage_columns); @@ -1083,7 +1071,7 @@ protected: DataPartsIndexes::index::type & data_parts_by_info; DataPartsIndexes::index::type & data_parts_by_state_and_info; - /// Current descriprion of columns of data type Object. + /// Current description of columns of data type Object. /// It changes only when set of parts is changed and is /// protected by @data_parts_mutex. ColumnsDescription object_columns; @@ -1125,7 +1113,7 @@ protected: return {begin, end}; } - /// Creates desciprion of columns of data type Object from the range of data parts. + /// Creates description of columns of data type Object from the range of data parts. static ColumnsDescription getObjectColumns( boost::iterator_range range, const ColumnsDescription & storage_columns); @@ -1243,7 +1231,7 @@ protected: bool movePartsToSpace(const DataPartsVector & parts, SpacePtr space); /// Makes backup entries to backup the parts of this table. 
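Note on the Transaction changes in this header hunk: has_in_memory_parts is added next to precommitted_parts and clear() becomes an out-of-line function, because the flag has to be reset together with the set. A tiny sketch of that invariant (an illustrative class, not the real Transaction):

    #include <cassert>
    #include <set>
    #include <string>

    class Transaction
    {
    public:
        void addPart(const std::string & name, bool in_memory)
        {
            precommitted_parts.insert(name);
            if (in_memory)
                has_in_memory_parts = true;   // remembered so commit() can decide whether a WAL write is needed
        }

        // The flag is derived from the set, so both must be reset together.
        void clear()
        {
            precommitted_parts.clear();
            has_in_memory_parts = false;
        }

        bool needsWal(bool in_memory_parts_enable_wal) const
        {
            return has_in_memory_parts && in_memory_parts_enable_wal;
        }

    private:
        std::set<std::string> precommitted_parts;
        bool has_in_memory_parts = false;
    };

    int main()
    {
        Transaction txn;
        txn.addPart("all_1_1_0", /*in_memory=*/ true);
        assert(txn.needsWal(/*in_memory_parts_enable_wal=*/ true));
        txn.clear();
        assert(!txn.needsWal(true));
    }
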
- static BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup); + BackupEntries backupParts(const DataPartsVector & data_parts, const String & data_path_in_backup, const ContextPtr & local_context); class RestoredPartsHolder; @@ -1263,7 +1251,7 @@ private: void checkPartCanBeAddedToTable(MutableDataPartPtr & part, DataPartsLock & lock) const; /// Preparing itself to be committed in memory: fill some fields inside part, add it to data_parts_indexes - /// in precommitted state and to transasction + /// in precommitted state and to transaction void preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, DataPartStorageBuilderPtr builder); /// Low-level method for preparing parts for commit (in-memory). @@ -1339,8 +1327,7 @@ private: void loadDataPartsFromWAL( DataPartsVector & broken_parts_to_detach, DataPartsVector & duplicate_parts_to_remove, - MutableDataPartsVector & parts_from_wal, - DataPartsLock & part_lock); + MutableDataPartsVector & parts_from_wal); void resetObjectColumnsFromActiveParts(const DataPartsLock & lock); void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock); @@ -1352,7 +1339,7 @@ private: /// Remove parts from disk calling part->remove(). Can do it in parallel in case of big set of parts and enabled settings. /// If we fail to remove some part and throw_on_error equal to `true` will throw an exception on the first failed part. /// Otherwise, in non-parallel case will break and return. - void clearPartsFromFilesystemImpl(const DataPartsVector & parts, NameSet * part_names_successed); + void clearPartsFromFilesystemImpl(const DataPartsVector & parts, NameSet * part_names_succeed); TemporaryParts temporary_parts; }; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 851153cd619..50ebd80c1b2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -46,12 +47,11 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartCompact::getReader( const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const { - auto ptr = std::static_pointer_cast(shared_from_this()); - auto context = storage.getContext(); - auto * load_marks_threadpool = reader_settings.read_settings.load_marks_asynchronously ? &context->getLoadMarksThreadpool() : nullptr; + auto read_info = std::make_shared(shared_from_this()); + auto * load_marks_threadpool = reader_settings.read_settings.load_marks_asynchronously ? 
&read_info->getContext()->getLoadMarksThreadpool() : nullptr; return std::make_unique( - ptr, columns_to_read, metadata_snapshot, uncompressed_cache, + read_info, columns_to_read, metadata_snapshot, uncompressed_cache, mark_cache, mark_ranges, reader_settings, load_marks_threadpool, avg_value_size_hints, profile_callback); } @@ -94,39 +94,44 @@ void MergeTreeDataPartCompact::calculateEachColumnSizes(ColumnSizeByName & /*eac total_size.marks += mrk_checksum->second.file_size; } -void MergeTreeDataPartCompact::loadIndexGranularity() +void MergeTreeDataPartCompact::loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_, + const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_) { - //String full_path = getRelativePath(); - - if (columns.empty()) - throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); - - if (!index_granularity_info.is_adaptive) + if (!index_granularity_info_.is_adaptive) throw Exception("MergeTreeDataPartCompact cannot be created with non-adaptive granulary.", ErrorCodes::NOT_IMPLEMENTED); - auto marks_file_path = index_granularity_info.getMarksFilePath("data"); - if (!data_part_storage->exists(marks_file_path)) + auto marks_file_path = index_granularity_info_.getMarksFilePath("data"); + if (!data_part_storage_->exists(marks_file_path)) throw Exception( ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); - size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path); - auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); + auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { /// Skip offsets for columns - buffer->seek(columns.size() * sizeof(MarkInCompressedFile), SEEK_CUR); + buffer->seek(columns_.size() * sizeof(MarkInCompressedFile), SEEK_CUR); size_t granularity; readIntBinary(granularity, *buffer); - index_granularity.appendMark(granularity); + index_granularity_.appendMark(granularity); } - if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes(columns.size()) != marks_file_size) + if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes(columns_.size()) != marks_file_size) throw Exception("Cannot read all marks from file " + marks_file_path, ErrorCodes::CANNOT_READ_ALL_DATA); - index_granularity.setInitialized(); + index_granularity_.setInitialized(); +} + +void MergeTreeDataPartCompact::loadIndexGranularity() +{ + if (columns.empty()) + throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + + loadIndexGranularityImpl(index_granularity, index_granularity_info, columns, data_part_storage); } bool MergeTreeDataPartCompact::hasColumnFiles(const NameAndTypePair & column) const diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index b1c0851afde..26c335f4324 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -65,6 +65,11 @@ public: ~MergeTreeDataPartCompact() override; 
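Note on loadIndexGranularityImpl above: it is a static helper that receives all of its state (granularity object, granularity info, columns, storage) through parameters, so it can be reused outside the concrete part class. Its marks loop can be sketched standalone as below, assuming one 16-byte offset pair per column followed by an 8-byte granularity per mark; the exact sizes are an assumption of this sketch, not taken from the patch:

    #include <cstdint>
    #include <iostream>
    #include <sstream>
    #include <stdexcept>
    #include <string>
    #include <vector>

    int main()
    {
        const size_t num_columns = 3;
        const std::vector<uint64_t> granularities = {8192, 8192, 1024};

        // Build a fake compact marks stream in memory.
        std::ostringstream out(std::ios::binary);
        for (uint64_t granularity : granularities)
        {
            for (size_t c = 0; c < num_columns; ++c)
            {
                uint64_t offset_in_compressed_file = 0;
                uint64_t offset_in_decompressed_block = 0;
                out.write(reinterpret_cast<const char *>(&offset_in_compressed_file), sizeof(uint64_t));
                out.write(reinterpret_cast<const char *>(&offset_in_decompressed_block), sizeof(uint64_t));
            }
            out.write(reinterpret_cast<const char *>(&granularity), sizeof(uint64_t));
        }
        const std::string data = out.str();

        // Read it back the way the hunk does: skip the per-column offsets,
        // then read one granularity value per mark.
        std::istringstream in(data, std::ios::binary);
        std::vector<uint64_t> index_granularity;
        while (in.peek() != std::char_traits<char>::eof())
        {
            in.seekg(static_cast<std::streamoff>(num_columns * 2 * sizeof(uint64_t)), std::ios::cur);
            uint64_t granularity = 0;
            in.read(reinterpret_cast<char *>(&granularity), sizeof(uint64_t));
            index_granularity.push_back(granularity);
        }

        // Consistency check from the hunk: marks_count * mark_size must match the file size.
        const size_t mark_size = num_columns * 2 * sizeof(uint64_t) + sizeof(uint64_t);
        if (index_granularity.size() * mark_size != data.size())
            throw std::runtime_error("Cannot read all marks from file");

        for (uint64_t granularity : index_granularity)
            std::cout << granularity << '\n';
    }
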
+protected: + static void loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_, + const NamesAndTypesList & columns_, const DataPartStoragePtr & data_part_storage_); + private: void checkConsistency(bool require_part_metadata) const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index 1c5006f4211..c7c831c23ec 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -48,9 +49,10 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartInMemory::getReader( const ValueSizeMap & /* avg_value_size_hints */, const ReadBufferFromFileBase::ProfileCallback & /* profile_callback */) const { + auto read_info = std::make_shared(shared_from_this()); auto ptr = std::static_pointer_cast(shared_from_this()); return std::make_unique( - ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings); + read_info, ptr, columns_to_read, metadata_snapshot, mark_ranges, reader_settings); } IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( diff --git a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h index 71ef6edd7fb..3080e285ac8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h +++ b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.h @@ -16,7 +16,7 @@ struct MergeTreeDataPartTTLInfo time_t max = 0; /// This TTL was computed on completely expired part. It doesn't make sense - /// to select such parts for TTL again. But make sense to recalcuate TTL + /// to select such parts for TTL again. But make sense to recalculate TTL /// again for merge with multiple parts. 
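Note on the ttl_finished flag that this comment refers to: it is an optional whose unset state means "never computed" and is treated as not finished. A minimal illustration, assuming the elided template argument is bool (which the value_or(false) in finished() implies):

    #include <cassert>
    #include <optional>

    struct TTLInfo
    {
        // Unset = not computed yet; value_or(false) makes that read as "not finished".
        std::optional<bool> ttl_finished;
        bool finished() const { return ttl_finished.value_or(false); }
    };

    int main()
    {
        TTLInfo info;
        assert(!info.finished());   // unknown -> not finished
        info.ttl_finished = true;   // computed on a completely expired part
        assert(info.finished());
    }
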
std::optional ttl_finished; bool finished() const { return ttl_finished.value_or(false); } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index c7b6ff0c4dd..58a0e48caab 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -47,9 +48,9 @@ IMergeTreeDataPart::MergeTreeReaderPtr MergeTreeDataPartWide::getReader( const ValueSizeMap & avg_value_size_hints, const ReadBufferFromFileBase::ProfileCallback & profile_callback) const { - auto ptr = std::static_pointer_cast(shared_from_this()); + auto read_info = std::make_shared(shared_from_this()); return std::make_unique( - ptr, columns_to_read, + read_info, columns_to_read, metadata_snapshot, uncompressed_cache, mark_cache, mark_ranges, reader_settings, avg_value_size_hints, profile_callback); @@ -103,46 +104,52 @@ ColumnSize MergeTreeDataPartWide::getColumnSizeImpl( return size; } -void MergeTreeDataPartWide::loadIndexGranularity() +void MergeTreeDataPartWide::loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, + const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name) { - index_granularity_info.changeGranularityIfRequired(data_part_storage); - - - if (columns.empty()) - throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + index_granularity_info_.changeGranularityIfRequired(data_part_storage_); /// We can use any column, it doesn't matter - std::string marks_file_path = index_granularity_info.getMarksFilePath(getFileNameForColumn(columns.front())); - if (!data_part_storage->exists(marks_file_path)) + std::string marks_file_path = index_granularity_info_.getMarksFilePath(any_column_file_name); + if (!data_part_storage_->exists(marks_file_path)) throw Exception( ErrorCodes::NO_FILE_IN_DATA_PART, "Marks file '{}' doesn't exist", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); - size_t marks_file_size = data_part_storage->getFileSize(marks_file_path); + size_t marks_file_size = data_part_storage_->getFileSize(marks_file_path); - if (!index_granularity_info.is_adaptive) + if (!index_granularity_info_.is_adaptive) { - size_t marks_count = marks_file_size / index_granularity_info.getMarkSizeInBytes(); - index_granularity.resizeWithFixedGranularity(marks_count, index_granularity_info.fixed_index_granularity); /// all the same + size_t marks_count = marks_file_size / index_granularity_info_.getMarkSizeInBytes(); + index_granularity_.resizeWithFixedGranularity(marks_count, index_granularity_info_.fixed_index_granularity); /// all the same } else { - auto buffer = data_part_storage->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); + auto buffer = data_part_storage_->readFile(marks_file_path, ReadSettings().adjustBufferSize(marks_file_size), marks_file_size, std::nullopt); while (!buffer->eof()) { buffer->seek(sizeof(size_t) * 2, SEEK_CUR); /// skip offset_in_compressed file and offset_in_decompressed_block size_t granularity; readIntBinary(granularity, *buffer); - index_granularity.appendMark(granularity); + index_granularity_.appendMark(granularity); } - if (index_granularity.getMarksCount() * index_granularity_info.getMarkSizeInBytes() != 
marks_file_size) + if (index_granularity_.getMarksCount() * index_granularity_info_.getMarkSizeInBytes() != marks_file_size) throw Exception( ErrorCodes::CANNOT_READ_ALL_DATA, "Cannot read all marks from file {}", - std::string(fs::path(data_part_storage->getFullPath()) / marks_file_path)); + std::string(fs::path(data_part_storage_->getFullPath()) / marks_file_path)); } - index_granularity.setInitialized(); + index_granularity_.setInitialized(); +} + +void MergeTreeDataPartWide::loadIndexGranularity() +{ + if (columns.empty()) + throw Exception("No columns in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + + loadIndexGranularityImpl(index_granularity, index_granularity_info, data_part_storage, getFileNameForColumn(columns.front())); } bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 325193557b3..52afa9e82d4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -61,6 +61,11 @@ public: bool hasColumnFiles(const NameAndTypePair & column) const override; +protected: + static void loadIndexGranularityImpl( + MergeTreeIndexGranularity & index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, + const DataPartStoragePtr & data_part_storage_, const std::string & any_column_file_name); + private: void checkConsistency(bool require_part_metadata) const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 771248b99c6..44fe50815da 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -66,8 +66,7 @@ void MergeTreeDataPartWriterCompact::addStreams(const NameAndTypePair & column, compressed_streams.emplace(stream_name, stream); }; - ISerialization::SubstreamPath path; - data_part->getSerialization(column.name)->enumerateStreams(path, callback, column.type); + data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); } namespace diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index 3d4aa0a7707..99bf188f03c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -121,7 +121,7 @@ void MergeTreeDataPartWriterWide::addStreams( }; ISerialization::SubstreamPath path; - data_part->getSerialization(column.name)->enumerateStreams(path, callback, column.type); + data_part->getSerialization(column.name)->enumerateStreams(callback, column.type); } @@ -255,10 +255,9 @@ void MergeTreeDataPartWriterWide::write(const Block & block, const IColumn::Perm void MergeTreeDataPartWriterWide::writeSingleMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - size_t number_of_rows, - ISerialization::SubstreamPath & path) + size_t number_of_rows) { - StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns, path); + StreamsWithMarks marks = getCurrentMarksForColumn(column, offset_columns); for (const auto & mark : marks) flushMarkToFile(mark, number_of_rows); } @@ -274,8 +273,7 @@ void MergeTreeDataPartWriterWide::flushMarkToFile(const StreamNameAndMark & stre StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & 
path) + WrittenOffsetColumns & offset_columns) { StreamsWithMarks result; data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) @@ -300,7 +298,7 @@ StreamsWithMarks MergeTreeDataPartWriterWide::getCurrentMarksForColumn( stream_with_mark.mark.offset_in_decompressed_block = stream.compressed.offset(); result.push_back(stream_with_mark); - }, path); + }); return result; } @@ -328,7 +326,7 @@ void MergeTreeDataPartWriterWide::writeSingleGranule( return; column_streams[stream_name]->compressed.nextIfAtEnd(); - }, serialize_settings.path); + }); } /// Column must not be empty. (column.size() !== 0) @@ -366,7 +364,7 @@ void MergeTreeDataPartWriterWide::writeColumn( { if (last_non_written_marks.contains(name)) throw Exception(ErrorCodes::LOGICAL_ERROR, "We have to add new mark for column, but already have non written mark. Current mark {}, total marks {}, offset {}", getCurrentMark(), index_granularity.getMarksCount(), rows_written_in_last_mark); - last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns, serialize_settings.path); + last_non_written_marks[name] = getCurrentMarksForColumn(name_and_type, offset_columns); } writeSingleGranule( @@ -390,7 +388,7 @@ void MergeTreeDataPartWriterWide::writeColumn( } } - serialization->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) + serialization->enumerateStreams([&](const ISerialization::SubstreamPath & substream_path) { bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; if (is_offsets) @@ -398,7 +396,7 @@ void MergeTreeDataPartWriterWide::writeColumn( String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); offset_columns.insert(stream_name); } - }, serialize_settings.path); + }); } @@ -553,7 +551,7 @@ void MergeTreeDataPartWriterWide::fillDataChecksums(IMergeTreeDataPart::Checksum } if (write_final_mark) - writeFinalMark(*it, offset_columns, serialize_settings.path); + writeFinalMark(*it, offset_columns); } } @@ -618,10 +616,9 @@ void MergeTreeDataPartWriterWide::finish(bool sync) void MergeTreeDataPartWriterWide::writeFinalMark( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path) + WrittenOffsetColumns & offset_columns) { - writeSingleMark(column, offset_columns, 0, path); + writeSingleMark(column, offset_columns, 0); /// Memoize information about offsets data_part->getSerialization(column.name)->enumerateStreams([&] (const ISerialization::SubstreamPath & substream_path) { @@ -631,7 +628,7 @@ void MergeTreeDataPartWriterWide::writeFinalMark( String stream_name = ISerialization::getFileNameForStream(column, substream_path); offset_columns.insert(stream_name); } - }, path); + }); } static void fillIndexGranularityImpl( diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index a3517f3aa88..08815d9930a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -61,8 +61,7 @@ private: /// Take offsets from column and return as MarkInCompressed file with stream name StreamsWithMarks getCurrentMarksForColumn( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path); + WrittenOffsetColumns & offset_columns); /// Write mark to disk using stream and rows count void flushMarkToFile( @@ -73,13 
+72,11 @@ private: void writeSingleMark( const NameAndTypePair & column, WrittenOffsetColumns & offset_columns, - size_t number_of_rows, - ISerialization::SubstreamPath & path); + size_t number_of_rows); void writeFinalMark( const NameAndTypePair & column, - WrittenOffsetColumns & offset_columns, - ISerialization::SubstreamPath & path); + WrittenOffsetColumns & offset_columns); void addStreams( const NameAndTypePair & column, diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index c024e5da7b5..709a8babcdd 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -313,6 +313,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( settings.min_free_disk_space_for_temporary_data, settings.compile_aggregate_expressions, settings.min_count_to_compile_aggregate_expression, + settings.max_block_size, only_merge); return std::make_pair(params, only_merge); diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h index dbb027c244e..a5adc919f4f 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h @@ -29,6 +29,8 @@ public: MergeTreeIndexGranularityInfo(const MergeTreeData & storage, MergeTreeDataPartType type_); + MergeTreeIndexGranularityInfo(MergeTreeDataPartType type_, bool is_adaptive_, size_t index_granularity_, size_t index_granularity_bytes_); + void changeGranularityIfRequired(const DataPartStoragePtr & data_part_storage); String getMarksFilePath(const String & path_prefix) const diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 79854785016..1a5a4d91806 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -83,7 +83,7 @@ MergeTreeRangeReader::DelayedStream::DelayedStream( : current_mark(from_mark), current_offset(0), num_delayed_rows(0) , current_task_last_mark(current_task_last_mark_) , merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , continue_reading(false), is_finished(false) { } @@ -181,7 +181,7 @@ MergeTreeRangeReader::Stream::Stream( : current_mark(from_mark), offset_after_current_mark(0) , last_mark(to_mark) , merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , current_mark_index_granularity(index_granularity->getMarkRows(from_mark)) , stream(from_mark, current_task_last_mark, merge_tree_reader) { @@ -652,7 +652,7 @@ MergeTreeRangeReader::MergeTreeRangeReader( bool last_reader_in_chain_, const Names & non_const_virtual_column_names_) : merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , index_granularity(&(merge_tree_reader->data_part_info_for_read->getIndexGranularity())) , prev_reader(prev_reader_) , prewhere_info(prewhere_info_) , last_reader_in_chain(last_reader_in_chain_) @@ -946,7 +946,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t result.addRows(stream.finalize(result.columns)); /// Last granule may be incomplete. 
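Note on the change that follows: startReadingChain now calls adjustLastGranule() only when the result actually contains granules, since adjusting an empty result would touch the last element of an empty container. A standalone sketch of why that guard matters (adjustLastGranule here is a simplified stand-in, not the real ReadResult method):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Shrink the last granule to the number of rows actually read.
    void adjustLastGranule(std::vector<size_t> & rows_per_granule, size_t total_rows_read)
    {
        size_t expected = 0;
        for (size_t rows : rows_per_granule)
            expected += rows;
        if (expected > total_rows_read)
            rows_per_granule.back() -= expected - total_rows_read;   // last granule was incomplete
    }

    int main()
    {
        std::vector<size_t> rows_per_granule = {8192, 8192, 8192};
        adjustLastGranule(rows_per_granule, 8192 * 2 + 100);
        assert(rows_per_granule.back() == 100);

        // The guard added by the hunk: an empty result must be left alone,
        // otherwise back() would be undefined behaviour.
        std::vector<size_t> empty;
        if (!empty.empty())
            adjustLastGranule(empty, 0);
        assert(empty.empty());
    }
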
- result.adjustLastGranule(); + if (!result.rowsPerGranule().empty()) + result.adjustLastGranule(); for (const auto & column_name : non_const_virtual_column_names) { diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index cc2c20eda5a..3f51673a6b1 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -22,7 +23,6 @@ MergeTreeReadPool::MergeTreeReadPool( size_t sum_marks_, size_t min_marks_for_concurrent_read_, RangesInDataParts && parts_, - const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, const Names & column_names_, @@ -32,7 +32,6 @@ MergeTreeReadPool::MergeTreeReadPool( bool do_not_steal_tasks_) : backoff_settings{backoff_settings_} , backoff_state{threads_} - , data{data_} , storage_snapshot{storage_snapshot_} , column_names{column_names_} , virtual_column_names{virtual_column_names_} @@ -214,7 +213,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts & per_part_sum_marks.push_back(sum_marks); auto task_columns = getReadTaskColumns( - data, storage_snapshot, part.data_part, + LoadedMergeTreeDataPartInfoForReader(part.data_part), storage_snapshot, column_names, virtual_column_names, prewhere_info, /*with_subcolumns=*/ true); auto size_predictor = !predict_block_size_bytes ? nullptr diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index 01a1280b6cb..c9fe70d9a78 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -70,11 +70,16 @@ private: public: MergeTreeReadPool( - size_t threads_, size_t sum_marks_, size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageSnapshotPtr & storage_snapshot_, + size_t threads_, + size_t sum_marks_, + size_t min_marks_for_concurrent_read_, + RangesInDataParts && parts_, + const StorageSnapshotPtr & storage_snapshot_, const PrewhereInfoPtr & prewhere_info_, - const Names & column_names_, const Names & virtual_column_names_, - const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, + const Names & column_names_, + const Names & virtual_column_names_, + const BackoffSettings & backoff_settings_, + size_t preferred_block_size_bytes_, bool do_not_steal_tasks_ = false); MergeTreeReadTaskPtr getTask(size_t min_marks_to_read, size_t thread, const Names & ordered_names); @@ -94,7 +99,6 @@ private: size_t threads, size_t sum_marks, std::vector per_part_sum_marks, const RangesInDataParts & parts, size_t min_marks_for_concurrent_read); - const MergeTreeData & data; StorageSnapshotPtr storage_snapshot; const Names column_names; const Names virtual_column_names; diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp index 88237091547..4801c9a4058 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.cpp @@ -15,19 +15,19 @@ namespace ErrorCodes MergeTreeReaderCompact::MergeTreeReaderCompact( - DataPartCompactPtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, MarkCache * mark_cache_, MarkRanges mark_ranges_, MergeTreeReaderSettings settings_, - ThreadPool * 
load_marks_cache_threadpool_, + ThreadPool * load_marks_threadpool_, ValueSizeMap avg_value_size_hints_, const ReadBufferFromFileBase::ProfileCallback & profile_callback_, clockid_t clock_type_) : IMergeTreeReader( - data_part_, + data_part_info_for_read_, columns_, metadata_snapshot_, uncompressed_cache_, @@ -36,50 +36,22 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( settings_, avg_value_size_hints_) , marks_loader( - data_part->data_part_storage, + data_part_info_for_read_->getDataPartStorage(), mark_cache, - data_part->index_granularity_info.getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME), - data_part->getMarksCount(), - data_part->index_granularity_info, + data_part_info_for_read_->getIndexGranularityInfo().getMarksFilePath(MergeTreeDataPartCompact::DATA_FILE_NAME), + data_part_info_for_read_->getMarksCount(), + data_part_info_for_read_->getIndexGranularityInfo(), settings.save_marks_in_cache, settings.read_settings, - load_marks_cache_threadpool_, - data_part->getColumns().size()) + load_marks_threadpool_, + data_part_info_for_read_->getColumns().size()) { try { - size_t columns_num = columns_to_read.size(); - - column_positions.resize(columns_num); - read_only_offsets.resize(columns_num); - - for (size_t i = 0; i < columns_num; ++i) - { - const auto & column_to_read = columns_to_read[i]; - - if (column_to_read.isSubcolumn()) - { - auto storage_column_from_part = getColumnInPart( - {column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()}); - - if (!storage_column_from_part.type->tryGetSubcolumnType(column_to_read.getSubcolumnName())) - continue; - } - - auto position = data_part->getColumnPosition(column_to_read.getNameInStorage()); - if (!position && typeid_cast(column_to_read.type.get())) - { - /// If array of Nested column is missing in part, - /// we have to read its offsets if they exist. - position = findColumnForOffsets(column_to_read.name); - read_only_offsets[i] = (position != std::nullopt); - } - - column_positions[i] = std::move(position); - } + fillColumnPositions(); /// Do not use max_read_buffer_size, but try to lower buffer size with maximal size of granule to avoid reading much data. - auto buffer_size = getReadBufferSize(data_part, marks_loader, column_positions, all_mark_ranges); + auto buffer_size = getReadBufferSize(*data_part_info_for_read, marks_loader, column_positions, all_mark_ranges); if (buffer_size) settings.read_settings = settings.read_settings.adjustBufferSize(buffer_size); @@ -90,10 +62,10 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( if (uncompressed_cache) { auto buffer = std::make_unique( - std::string(fs::path(data_part->data_part_storage->getFullPath()) / path), + std::string(fs::path(data_part_info_for_read->getDataPartStorage()->getFullPath()) / path), [this, path]() { - return data_part->data_part_storage->readFile( + return data_part_info_for_read->getDataPartStorage()->readFile( path, settings.read_settings, std::nullopt, std::nullopt); @@ -115,7 +87,7 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( { auto buffer = std::make_unique( - data_part->data_part_storage->readFile( + data_part_info_for_read->getDataPartStorage()->readFile( path, settings.read_settings, std::nullopt, std::nullopt), @@ -134,11 +106,49 @@ MergeTreeReaderCompact::MergeTreeReaderCompact( } catch (...) 
{ - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } +void MergeTreeReaderCompact::fillColumnPositions() +{ + size_t columns_num = columns_to_read.size(); + + column_positions.resize(columns_num); + read_only_offsets.resize(columns_num); + + for (size_t i = 0; i < columns_num; ++i) + { + const auto & column_to_read = columns_to_read[i]; + + auto position = data_part_info_for_read->getColumnPosition(column_to_read.getNameInStorage()); + bool is_array = isArray(column_to_read.type); + + if (column_to_read.isSubcolumn()) + { + auto storage_column_from_part = getColumnInPart( + {column_to_read.getNameInStorage(), column_to_read.getTypeInStorage()}); + + auto subcolumn_name = column_to_read.getSubcolumnName(); + if (!storage_column_from_part.type->hasSubcolumn(subcolumn_name)) + position.reset(); + } + + if (!position && is_array) + { + /// If array of Nested column is missing in part, + /// we have to read its offsets if they exist. + position = findColumnForOffsets(column_to_read); + read_only_offsets[i] = (position != std::nullopt); + } + + column_positions[i] = std::move(position); + if (read_only_offsets[i]) + partially_read_columns.insert(column_to_read.name); + } +} + size_t MergeTreeReaderCompact::readRows( size_t from_mark, size_t current_task_last_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) { @@ -158,7 +168,7 @@ size_t MergeTreeReaderCompact::readRows( while (read_rows < max_rows_to_read) { - size_t rows_to_read = data_part->index_granularity.getMarkRows(from_mark); + size_t rows_to_read = data_part_info_for_read->getIndexGranularity().getMarkRows(from_mark); for (size_t pos = 0; pos < num_columns; ++pos) { @@ -181,7 +191,7 @@ size_t MergeTreeReaderCompact::readRows( catch (Exception & e) { if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED) - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); /// Better diagnostics. e.addMessage("(while reading column " + columns_to_read[pos].name + ")"); @@ -189,7 +199,7 @@ size_t MergeTreeReaderCompact::readRows( } catch (...) 
{ - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } @@ -216,7 +226,8 @@ void MergeTreeReaderCompact::readData( auto buffer_getter = [&](const ISerialization::SubstreamPath & substream_path) -> ReadBuffer * { - if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != ISerialization::Substream::ArraySizes)) + bool is_offsets = !substream_path.empty() && substream_path.back().type == ISerialization::Substream::ArraySizes; + if (only_offsets && !is_offsets) return nullptr; return data_buffer; @@ -281,7 +292,7 @@ void MergeTreeReaderCompact::seekToMark(size_t row_index, size_t column_index) void MergeTreeReaderCompact::adjustUpperBound(size_t last_mark) { size_t right_offset = 0; - if (last_mark < data_part->getMarksCount()) /// Otherwise read until the end of file + if (last_mark < data_part_info_for_read->getMarksCount()) /// Otherwise read until the end of file right_offset = marks_loader.getMark(last_mark).offset_in_compressed_file; if (right_offset == 0) @@ -309,7 +320,7 @@ bool MergeTreeReaderCompact::isContinuousReading(size_t mark, size_t column_posi return false; const auto & [last_mark, last_column] = *last_read_granule; return (mark == last_mark && column_position == last_column + 1) - || (mark == last_mark + 1 && column_position == 0 && last_column == data_part->getColumns().size() - 1); + || (mark == last_mark + 1 && column_position == 0 && last_column == data_part_info_for_read->getColumns().size() - 1); } namespace @@ -361,16 +372,16 @@ private: } size_t MergeTreeReaderCompact::getReadBufferSize( - const DataPartPtr & part, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges) { size_t buffer_size = 0; size_t columns_num = column_positions.size(); - size_t file_size = part->getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION); + size_t file_size = data_part_info_for_reader.getFileSizeOrZero(MergeTreeDataPartCompact::DATA_FILE_NAME_WITH_EXTENSION); - MarksCounter counter(part->getMarksCount(), part->getColumns().size()); + MarksCounter counter(data_part_info_for_reader.getMarksCount(), data_part_info_for_reader.getColumns().size()); for (const auto & mark_range : mark_ranges) { diff --git a/src/Storages/MergeTree/MergeTreeReaderCompact.h b/src/Storages/MergeTree/MergeTreeReaderCompact.h index 348e4802e2a..ee099755a8e 100644 --- a/src/Storages/MergeTree/MergeTreeReaderCompact.h +++ b/src/Storages/MergeTree/MergeTreeReaderCompact.h @@ -19,14 +19,14 @@ class MergeTreeReaderCompact : public IMergeTreeReader { public: MergeTreeReaderCompact( - DataPartCompactPtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, MarkCache * mark_cache_, MarkRanges mark_ranges_, MergeTreeReaderSettings settings_, - ThreadPool * load_marks_cache_threadpool_, + ThreadPool * load_marks_threadpool_, ValueSizeMap avg_value_size_hints_ = {}, const ReadBufferFromFileBase::ProfileCallback & profile_callback_ = {}, clockid_t clock_type_ = CLOCK_MONOTONIC_COARSE); @@ -40,6 +40,7 @@ public: private: bool isContinuousReading(size_t mark, size_t column_position); + void fillColumnPositions(); ReadBuffer * data_buffer; CompressedReadBufferBase * compressed_data_buffer; @@ -68,7 +69,7 @@ private: /// Returns maximal value of granule size in compressed file from @mark_ranges. 
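Note on isContinuousReading, shown a few lines above for the compact reader: it decides whether a new (mark, column) pair continues exactly where the previous read stopped, i.e. the next column of the same mark, or column 0 of the following mark once the last column was read. A self-contained version of that check (the surrounding reader state is reduced to plain parameters):

    #include <cassert>
    #include <cstddef>
    #include <optional>
    #include <utility>

    bool isContinuousReading(
        const std::optional<std::pair<size_t, size_t>> & last_read_granule,
        size_t mark, size_t column_position, size_t num_columns)
    {
        if (!last_read_granule)
            return false;
        const auto & [last_mark, last_column] = *last_read_granule;
        return (mark == last_mark && column_position == last_column + 1)
            || (mark == last_mark + 1 && column_position == 0 && last_column == num_columns - 1);
    }

    int main()
    {
        const size_t num_columns = 3;
        std::optional<std::pair<size_t, size_t>> last = std::pair<size_t, size_t>{5, 1};
        assert(isContinuousReading(last, 5, 2, num_columns));    // next column, same mark
        last = std::pair<size_t, size_t>{5, 2};
        assert(isContinuousReading(last, 6, 0, num_columns));    // first column of the next mark
        assert(!isContinuousReading(last, 7, 0, num_columns));   // a mark was skipped
        assert(!isContinuousReading(std::nullopt, 5, 0, num_columns));
    }
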
/// This value is used as size of read buffer. static size_t getReadBufferSize( - const DataPartPtr & part, + const IMergeTreeDataPartInfoForReader & data_part_info_for_reader, MergeTreeMarksLoader & marks_loader, const ColumnPositions & column_positions, const MarkRanges & mark_ranges); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp index 766c28c99b9..3b3a6b95cff 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.cpp @@ -16,13 +16,14 @@ namespace ErrorCodes MergeTreeReaderInMemory::MergeTreeReaderInMemory( + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, DataPartInMemoryPtr data_part_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, MarkRanges mark_ranges_, MergeTreeReaderSettings settings_) : IMergeTreeReader( - data_part_, + data_part_info_for_read_, columns_, metadata_snapshot_, nullptr, @@ -32,13 +33,19 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory( {}) , part_in_memory(std::move(data_part_)) { - for (const auto & [name, type] : columns_to_read) + for (const auto & column_to_read : columns_to_read) { /// If array of Nested column is missing in part, /// we have to read its offsets if they exist. - if (!part_in_memory->block.has(name) && typeid_cast(type.get())) - if (auto offset_position = findColumnForOffsets(name)) - positions_for_offsets[name] = *offset_position; + if (typeid_cast(column_to_read.type.get()) + && !tryGetColumnFromBlock(part_in_memory->block, column_to_read)) + { + if (auto offsets_position = findColumnForOffsets(column_to_read)) + { + positions_for_offsets[column_to_read.name] = *offsets_position; + partially_read_columns.insert(column_to_read.name); + } + } } } @@ -48,7 +55,7 @@ size_t MergeTreeReaderInMemory::readRows( if (!continue_reading) total_rows_read = 0; - size_t total_marks = data_part->index_granularity.getMarksCount(); + size_t total_marks = data_part_info_for_read->getIndexGranularity().getMarksCount(); if (from_mark >= total_marks) throw Exception("Mark " + toString(from_mark) + " is out of bound. 
Max mark: " + toString(total_marks), ErrorCodes::ARGUMENT_OUT_OF_BOUND); diff --git a/src/Storages/MergeTree/MergeTreeReaderInMemory.h b/src/Storages/MergeTree/MergeTreeReaderInMemory.h index ff6eb92d9c3..cb67bc46eae 100644 --- a/src/Storages/MergeTree/MergeTreeReaderInMemory.h +++ b/src/Storages/MergeTree/MergeTreeReaderInMemory.h @@ -15,6 +15,7 @@ class MergeTreeReaderInMemory : public IMergeTreeReader { public: MergeTreeReaderInMemory( + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, DataPartInMemoryPtr data_part_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp index 9ac7b5a5c5a..ea367a9502e 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp +++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp @@ -17,7 +17,6 @@ namespace DB namespace { - using OffsetColumns = std::map; constexpr auto DATA_FILE_EXTENSION = ".bin"; } @@ -27,7 +26,7 @@ namespace ErrorCodes } MergeTreeReaderWide::MergeTreeReaderWide( - DataPartWidePtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, @@ -38,7 +37,7 @@ MergeTreeReaderWide::MergeTreeReaderWide( const ReadBufferFromFileBase::ProfileCallback & profile_callback_, clockid_t clock_type_) : IMergeTreeReader( - data_part_, + data_part_info_, columns_, metadata_snapshot_, uncompressed_cache_, @@ -54,7 +53,7 @@ MergeTreeReaderWide::MergeTreeReaderWide( } catch (...) { - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } } @@ -74,7 +73,7 @@ size_t MergeTreeReaderWide::readRows( std::unordered_map caches; std::unordered_set prefetched_streams; - if (data_part->data_part_storage->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) + if (data_part_info_for_read->getDataPartStorage()->isStoredOnRemoteDisk() ? settings.read_settings.remote_fs_prefetch : settings.read_settings.local_fs_prefetch) { /// Request reading of data in advance, /// so if reading can be asynchronous, it will also be performed in parallel for all columns. @@ -137,17 +136,17 @@ size_t MergeTreeReaderWide::readRows( catch (Exception & e) { if (e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED) - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); /// Better diagnostics. - e.addMessage("(while reading from part " + data_part->data_part_storage->getFullPath() + " " + e.addMessage("(while reading from part " + data_part_info_for_read->getDataPartStorage()->getFullPath() + " " "from mark " + toString(from_mark) + " " "with max_rows_to_read = " + toString(max_rows_to_read) + ")"); throw; } catch (...) 
{ - storage.reportBrokenPart(data_part); + data_part_info_for_read->reportBroken(); throw; } @@ -161,35 +160,48 @@ void MergeTreeReaderWide::addStreams( const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type) { + bool has_any_stream = false; + bool has_all_streams = true; + ISerialization::StreamCallback callback = [&] (const ISerialization::SubstreamPath & substream_path) { String stream_name = ISerialization::getFileNameForStream(name_and_type, substream_path); if (streams.contains(stream_name)) + { + has_any_stream = true; return; + } - bool data_file_exists = data_part->checksums.files.contains(stream_name + DATA_FILE_EXTENSION); + bool data_file_exists = data_part_info_for_read->getChecksums().files.contains(stream_name + DATA_FILE_EXTENSION); /** If data file is missing then we will not try to open it. * It is necessary since it allows to add new column to structure of the table without creating new files for old parts. */ if (!data_file_exists) + { + has_all_streams = false; return; + } + has_any_stream = true; bool is_lc_dict = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys; - auto context = data_part->storage.getContext(); + auto context = data_part_info_for_read->getContext(); auto * load_marks_threadpool = settings.read_settings.load_marks_asynchronously ? &context->getLoadMarksThreadpool() : nullptr; streams.emplace(stream_name, std::make_unique( - data_part->data_part_storage, stream_name, DATA_FILE_EXTENSION, - data_part->getMarksCount(), all_mark_ranges, settings, mark_cache, - uncompressed_cache, data_part->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), - &data_part->index_granularity_info, + data_part_info_for_read->getDataPartStorage(), stream_name, DATA_FILE_EXTENSION, + data_part_info_for_read->getMarksCount(), all_mark_ranges, settings, mark_cache, + uncompressed_cache, data_part_info_for_read->getFileSizeOrZero(stream_name + DATA_FILE_EXTENSION), + &data_part_info_for_read->getIndexGranularityInfo(), profile_callback, clock_type, is_lc_dict, load_marks_threadpool)); }; serialization->enumerateStreams(callback); + + if (has_any_stream && !has_all_streams) + partially_read_columns.insert(name_and_type.name); } @@ -287,6 +299,7 @@ void MergeTreeReaderWide::readData( /* seek_to_start = */false, substream_path, streams, name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache); }; + deserialize_settings.continuous_reading = continue_reading; auto & deserialize_state = deserialize_binary_bulk_state_map[name_and_type.name]; diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.h b/src/Storages/MergeTree/MergeTreeReaderWide.h index 2137695b6d7..dbfc0310242 100644 --- a/src/Storages/MergeTree/MergeTreeReaderWide.h +++ b/src/Storages/MergeTree/MergeTreeReaderWide.h @@ -15,7 +15,7 @@ class MergeTreeReaderWide : public IMergeTreeReader { public: MergeTreeReaderWide( - DataPartWidePtr data_part_, + MergeTreeDataPartInfoForReaderPtr data_part_info_for_read_, NamesAndTypesList columns_, const StorageMetadataPtr & metadata_snapshot_, UncompressedCache * uncompressed_cache_, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 3e346df6662..59cbae3f914 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -51,7 +52,7 @@ 
MergeTreeSelectProcessor::MergeTreeSelectProcessor( void MergeTreeSelectProcessor::initializeReaders() { task_columns = getReadTaskColumns( - storage, storage_snapshot, data_part, + LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, required_columns, virt_column_names, prewhere_info, /*with_subcolumns=*/ true); /// Will be used to distinguish between PREWHERE and WHERE columns when applying filter diff --git a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp index 5b9eceece51..9e0c96fd88a 100644 --- a/src/Storages/MergeTree/MergeTreeSequentialSource.cpp +++ b/src/Storages/MergeTree/MergeTreeSequentialSource.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -102,7 +103,7 @@ MergeTreeSequentialSource::MergeTreeSequentialSource( addTotalRowsApprox(data_part->rows_count); /// Add columns because we don't want to read empty blocks - injectRequiredColumns(storage, storage_snapshot, data_part, /*with_subcolumns=*/ false, columns_to_read); + injectRequiredColumns(LoadedMergeTreeDataPartInfoForReader(data_part), storage_snapshot, /*with_subcolumns=*/ false, columns_to_read); NamesAndTypesList columns_for_reader; if (take_column_types_from_storage) diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 83203939903..5eaa8ec8004 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -23,6 +23,7 @@ MergeTreeSink::MergeTreeSink( , metadata_snapshot(metadata_snapshot_) , max_parts_per_block(max_parts_per_block_) , context(context_) + , storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context)) { } @@ -54,7 +55,6 @@ struct MergeTreeSink::DelayedChunk void MergeTreeSink::consume(Chunk chunk) { auto block = getHeader().cloneWithColumns(chunk.detachColumns()); - auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context); storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); diff --git a/src/Storages/MergeTree/MergeTreeSink.h b/src/Storages/MergeTree/MergeTreeSink.h index 65a565d7f57..68f11d86a25 100644 --- a/src/Storages/MergeTree/MergeTreeSink.h +++ b/src/Storages/MergeTree/MergeTreeSink.h @@ -9,6 +9,8 @@ namespace DB class Block; class StorageMergeTree; +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; class MergeTreeSink : public SinkToStorage @@ -32,6 +34,7 @@ private: StorageMetadataPtr metadata_snapshot; size_t max_parts_per_block; ContextPtr context; + StorageSnapshotPtr storage_snapshot; uint64_t chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token /// We can delay processing for previous chunk and start writing a new one. 
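The addStreams() hunk in MergeTreeReaderWide above introduces has_any_stream / has_all_streams bookkeeping: a column whose serialization has some substream data files present in the part's checksums but is missing others gets recorded in partially_read_columns. A minimal standalone sketch of that decision, with the checksum lookup reduced to plain set membership and all file names purely illustrative (this is not the ClickHouse reader code):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

// Mirrors the has_any_stream / has_all_streams bookkeeping: a column counts as
// partially read when at least one of its substream data files exists in the
// part while at least one other is missing.
bool isPartiallyRead(const std::vector<std::string> & substream_files,
                     const std::set<std::string> & files_in_checksums)
{
    bool has_any_stream = false;
    bool has_all_streams = true;

    for (const auto & file : substream_files)
    {
        if (files_in_checksums.count(file))
            has_any_stream = true;
        else
            has_all_streams = false;
    }

    return has_any_stream && !has_all_streams;
}

int main()
{
    // Hypothetical part contents: the sizes substream of a Nested column "n"
    // exists, but its data file does not (e.g. the column was added later).
    const std::set<std::string> checksums = {"n.size0.bin", "x.bin"};

    std::cout << std::boolalpha
              << isPartiallyRead({"n.size0.bin", "n.bin"}, checksums) << '\n'  // true
              << isPartiallyRead({"x.bin"}, checksums) << '\n'                 // false: all streams present
              << isPartiallyRead({"y.bin"}, checksums) << '\n';                // false: no streams at all
}
```

Columns flagged this way can presumably be completed with default values later rather than failing the whole read; the sketch only illustrates the flagging step itself.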
diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp index 9b79f89ff98..c8b3349734e 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -122,7 +123,10 @@ void MergeTreeWriteAheadLog::rotate(const std::unique_lock &) init(); } -MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const StorageMetadataPtr & metadata_snapshot, ContextPtr context) +MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore( + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + std::unique_lock & parts_lock) { std::unique_lock lock(write_mutex); @@ -172,6 +176,9 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor part->uuid = metadata.part_uuid; block = block_in.read(); + + if (storage.getActiveContainingPart(part->info, MergeTreeDataPartState::Active, parts_lock)) + continue; } else { @@ -238,6 +245,15 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor std::copy_if(parts.begin(), parts.end(), std::back_inserter(result), [&dropped_parts](const auto & part) { return dropped_parts.count(part->name) == 0; }); + /// All parts in the WAL have already been committed to disk -> clear the WAL + if (result.empty()) + { + LOG_DEBUG(log, "WAL file '{}' has been completely processed. Removing.", path); + disk->removeFile(path); + init(); + return {}; + } + return result; } diff --git a/src/Storages/MergeTree/MergeTreeWriteAheadLog.h b/src/Storages/MergeTree/MergeTreeWriteAheadLog.h index eb75d374cb1..b54161dbdaa 100644 --- a/src/Storages/MergeTree/MergeTreeWriteAheadLog.h +++ b/src/Storages/MergeTree/MergeTreeWriteAheadLog.h @@ -62,7 +62,10 @@ public: void addPart(DataPartInMemoryPtr & part); void dropPart(const String & part_name); - std::vector restore(const StorageMetadataPtr & metadata_snapshot, ContextPtr context); + std::vector restore( + const StorageMetadataPtr & metadata_snapshot, + ContextPtr context, + std::unique_lock & parts_lock); using MinMaxBlockNumber = std::pair; static std::optional tryParseMinMaxBlockNumber(const String & filename); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index d0b39684ba2..254bcd9f7f9 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1302,7 +1302,7 @@ private: *ctx->source_part->data_part_storage, it->name(), destination); hardlinked_files.insert(it->name()); } - else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir + else if (!endsWith(it->name(), ".tmp_proj")) // ignore projection tmp merge dir { // it's a projection part directory ctx->data_part_storage_builder->createProjection(destination); diff --git a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp index 9930aca2576..5a291373e6c 100644 --- a/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp +++ b/src/Storages/MergeTree/PartMetadataManagerWithCache.cpp @@ -191,6 +191,7 @@ void PartMetadataManagerWithCache::getKeysAndCheckSums(Strings & keys, std::vect { ReadBufferFromString rbuf(values[i]); HashingReadBuffer hbuf(rbuf); + hbuf.ignoreAll(); checksums.push_back(hbuf.getHash()); } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp index
f3e33b6b38b..ba4979e57f2 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp @@ -149,7 +149,7 @@ void ReplicatedMergeTreeAttachThread::runImpl() storage.clearOldTemporaryDirectories(0, {"tmp_", "delete_tmp_", "tmp-fetch_"}); storage.clearOldWriteAheadLogs(); if (storage.getSettings()->merge_tree_enable_clear_old_broken_detached) - storage.clearOldBrokenPartsFromDetachedDirecory(); + storage.clearOldBrokenPartsFromDetachedDirectory(); storage.createNewZooKeeperNodes(); storage.syncPinnedPartUUIDs(); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp index 74e3d0881ff..cc983960847 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp @@ -66,7 +66,7 @@ void ReplicatedMergeTreeCleanupThread::iterate() storage.clearOldWriteAheadLogs(); storage.clearOldTemporaryDirectories(storage.getSettings()->temporary_directories_lifetime.totalSeconds()); if (storage.getSettings()->merge_tree_enable_clear_old_broken_detached) - storage.clearOldBrokenPartsFromDetachedDirecory(); + storage.clearOldBrokenPartsFromDetachedDirectory(); } /// This is loose condition: no problem if we actually had lost leadership at this moment diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index e8362e5cc6b..6d1a3efb01d 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -279,7 +279,7 @@ private: /// Very large queue entries may appear occasionally. /// We cannot process MAX_MULTI_OPS at once because it will fail. /// But we have to process more than one entry at once because otherwise lagged replicas keep up slowly. - /// Let's start with one entry per transaction and icrease it exponentially towards MAX_MULTI_OPS. + /// Let's start with one entry per transaction and increase it exponentially towards MAX_MULTI_OPS. /// It will allow to make some progress before failing and remain operational even in extreme cases. size_t current_multi_batch_size = 1; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 34d64b92d69..6c7fbcb52d8 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -41,23 +41,29 @@ struct ReplicatedMergeTreeSink::DelayedChunk String block_id; }; + DelayedChunk() = default; + explicit DelayedChunk(size_t replicas_num_) : replicas_num(replicas_num_) {} + + size_t replicas_num = 0; + std::vector partitions; }; ReplicatedMergeTreeSink::ReplicatedMergeTreeSink( StorageReplicatedMergeTree & storage_, const StorageMetadataPtr & metadata_snapshot_, - size_t quorum_, + size_t quorum_size, size_t quorum_timeout_ms_, size_t max_parts_per_block_, bool quorum_parallel_, bool deduplicate_, + bool majority_quorum, ContextPtr context_, bool is_attach_) : SinkToStorage(metadata_snapshot_->getSampleBlock()) , storage(storage_) , metadata_snapshot(metadata_snapshot_) - , quorum(quorum_) + , required_quorum_size(majority_quorum ? 
std::nullopt : std::make_optional(quorum_size)) , quorum_timeout_ms(quorum_timeout_ms_) , max_parts_per_block(max_parts_per_block_) , is_attach(is_attach_) @@ -65,15 +71,15 @@ ReplicatedMergeTreeSink::ReplicatedMergeTreeSink( , deduplicate(deduplicate_) , log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)")) , context(context_) + , storage_snapshot(storage.getStorageSnapshot(metadata_snapshot, context)) { /// The quorum value `1` has the same meaning as if it is disabled. - if (quorum == 1) - quorum = 0; + if (required_quorum_size == 1) + required_quorum_size = 0; } ReplicatedMergeTreeSink::~ReplicatedMergeTreeSink() = default; - /// Allow to verify that the session in ZooKeeper is still alive. static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper) { @@ -84,9 +90,11 @@ static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper) throw Exception("ZooKeeper session has been expired.", ErrorCodes::NO_ZOOKEEPER); } - -void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper) +size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper) { + if (!isQuorumEnabled()) + return 0; + quorum_info.status_path = storage.zookeeper_path + "/quorum/status"; Strings replicas = zookeeper->getChildren(fs::path(storage.zookeeper_path) / "replicas"); @@ -104,9 +112,12 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo if (status.get().error == Coordination::Error::ZOK) ++active_replicas; - if (active_replicas < quorum) - throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}).", - active_replicas, quorum); + size_t replicas_number = replicas.size(); + size_t quorum_size = getQuorumSize(replicas_number); + + if (active_replicas < quorum_size) + throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}/{}).", + active_replicas, quorum_size, replicas_number); /** Is there a quorum for the last part for which a quorum is needed? * Write of all the parts with the included quorum is linearly ordered. @@ -132,8 +143,9 @@ void ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zoo quorum_info.is_active_node_value = is_active.data; quorum_info.is_active_node_version = is_active.stat.version; quorum_info.host_node_version = host.stat.version; -} + return replicas_number; +} void ReplicatedMergeTreeSink::consume(Chunk chunk) { @@ -147,10 +159,8 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node). * TODO Too complex logic, you can do better. 
*/ - if (quorum) - checkQuorumPrecondition(zookeeper); + size_t replicas_num = checkQuorumPrecondition(zookeeper); - auto storage_snapshot = storage.getStorageSnapshot(metadata_snapshot, context); storage.writer.deduceTypesOfObjectColumns(storage_snapshot, block); auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block, metadata_snapshot, context); @@ -193,11 +203,11 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) } block_id = temp_part.part->getZeroLevelPartBlockID(block_dedup_token); - LOG_DEBUG(log, "Wrote block with ID '{}', {} rows", block_id, current_block.block.rows()); + LOG_DEBUG(log, "Wrote block with ID '{}', {} rows on {} replicas", block_id, current_block.block.rows(), replicas_num); } else { - LOG_DEBUG(log, "Wrote block with {} rows", current_block.block.rows()); + LOG_DEBUG(log, "Wrote block with {} rows on {} replicas", current_block.block.rows(), replicas_num); } UInt64 elapsed_ns = watch.elapsed(); @@ -211,7 +221,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk) if (streams > max_insert_delayed_streams_for_parallel_write) { finishDelayedChunk(zookeeper); - delayed_chunk = std::make_unique(); + delayed_chunk = std::make_unique(replicas_num); delayed_chunk->partitions = std::move(partitions); finishDelayedChunk(zookeeper); @@ -254,7 +264,7 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe try { - commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder); + commitPart(zookeeper, part, partition.block_id, partition.temp_part.builder, delayed_chunk->replicas_num); last_block_is_duplicate = last_block_is_duplicate || part->is_duplicate; @@ -273,7 +283,6 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe delayed_chunk.reset(); } - void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPtr & part) { /// NOTE: No delay in this case. That's Ok. @@ -281,15 +290,14 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt auto zookeeper = storage.getZooKeeper(); assertSessionIsNotExpired(zookeeper); - if (quorum) - checkQuorumPrecondition(zookeeper); + size_t replicas_num = checkQuorumPrecondition(zookeeper); Stopwatch watch; try { part->version.setCreationTID(Tx::PrehistoricTID, nullptr); - commitPart(zookeeper, part, "", part->data_part_storage->getBuilder()); + commitPart(zookeeper, part, "", part->data_part_storage->getBuilder(), replicas_num); PartLog::addNewPart(storage.getContext(), part, watch.elapsed()); } catch (...) @@ -299,12 +307,12 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt } } - void ReplicatedMergeTreeSink::commitPart( zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id, - DataPartStorageBuilderPtr builder) + DataPartStorageBuilderPtr builder, + size_t replicas_num) { metadata_snapshot->check(part->getColumns()); assertSessionIsNotExpired(zookeeper); @@ -367,7 +375,7 @@ void ReplicatedMergeTreeSink::commitPart( log_entry.source_replica = storage.replica_name; log_entry.new_part_name = part->name; /// TODO maybe add UUID here as well? - log_entry.quorum = quorum; + log_entry.quorum = getQuorumSize(replicas_num); log_entry.block_id = block_id; log_entry.new_part_type = part->getType(); @@ -384,11 +392,11 @@ void ReplicatedMergeTreeSink::commitPart( * but for it the quorum has not yet been reached. * You can not do the next quorum record at this time.) 
*/ - if (quorum) + if (isQuorumEnabled()) { ReplicatedMergeTreeQuorumEntry quorum_entry; quorum_entry.part_name = part->name; - quorum_entry.required_number_of_replicas = quorum; + quorum_entry.required_number_of_replicas = getQuorumSize(replicas_num); quorum_entry.replicas.insert(storage.replica_name); /** At this point, this node will contain information that the current replica received a part. @@ -436,7 +444,7 @@ void ReplicatedMergeTreeSink::commitPart( { part->is_duplicate = true; ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks); - if (quorum) + if (isQuorumEnabled()) { LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it, but checking quorum.", block_id, existing_part_name); @@ -446,7 +454,7 @@ void ReplicatedMergeTreeSink::commitPart( else quorum_path = storage.zookeeper_path + "/quorum/status"; - waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value); + waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value, replicas_num); } else { @@ -593,7 +601,7 @@ void ReplicatedMergeTreeSink::commitPart( break; } - if (quorum) + if (isQuorumEnabled()) { if (is_already_existing_part) { @@ -605,7 +613,7 @@ void ReplicatedMergeTreeSink::commitPart( storage.updateQuorum(part->name, false); } - waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value); + waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value, replicas_num); } } @@ -627,10 +635,11 @@ void ReplicatedMergeTreeSink::waitForQuorum( zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name, const std::string & quorum_path, - const std::string & is_active_node_value) const + const std::string & is_active_node_value, + size_t replicas_num) const { /// We are waiting for quorum to be satisfied. 
- LOG_TRACE(log, "Waiting for quorum"); + LOG_TRACE(log, "Waiting for quorum '{}' for part {} on {} replicas", quorum_path, part_name, replicas_num); try { @@ -654,7 +663,7 @@ void ReplicatedMergeTreeSink::waitForQuorum( if (!event->tryWait(quorum_timeout_ms)) throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED); - LOG_TRACE(log, "Quorum {} updated, will check quorum node still exists", quorum_path); + LOG_TRACE(log, "Quorum {} for part {} updated, will check quorum node still exists", quorum_path, part_name); } /// And what if it is possible that the current replica at this time has ceased to be active @@ -672,8 +681,23 @@ void ReplicatedMergeTreeSink::waitForQuorum( ErrorCodes::UNKNOWN_STATUS_OF_INSERT); } - LOG_TRACE(log, "Quorum satisfied"); + LOG_TRACE(log, "Quorum '{}' for part {} satisfied", quorum_path, part_name); } +size_t ReplicatedMergeTreeSink::getQuorumSize(size_t replicas_num) const +{ + if (!isQuorumEnabled()) + return 0; + + if (required_quorum_size) + return required_quorum_size.value(); + + return replicas_num / 2 + 1; +} + +bool ReplicatedMergeTreeSink::isQuorumEnabled() const +{ + return !required_quorum_size.has_value() || required_quorum_size.value() > 1; +} } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index f7504d2f784..48e94ef5659 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -17,6 +17,8 @@ namespace DB { class StorageReplicatedMergeTree; +struct StorageSnapshot; +using StorageSnapshotPtr = std::shared_ptr; class ReplicatedMergeTreeSink : public SinkToStorage @@ -30,6 +32,7 @@ public: size_t max_parts_per_block_, bool quorum_parallel_, bool deduplicate_, + bool majority_quorum_, ContextPtr context_, // special flag to determine the ALTER TABLE ATTACH PART without the query context, // needed to set the special LogEntryType::ATTACH_PART @@ -66,24 +69,34 @@ private: }; QuorumInfo quorum_info; - void checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper); + + /// Checks active replicas. + /// Returns total number of replicas. + size_t checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper); /// Rename temporary part and commit to ZooKeeper. void commitPart( zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id, - DataPartStorageBuilderPtr part_builder); + DataPartStorageBuilderPtr part_builder, + size_t replicas_num); /// Wait for quorum to be satisfied on path (quorum_path) form part (part_name) /// Also checks that replica still alive. void waitForQuorum( zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name, - const std::string & quorum_path, const std::string & is_active_node_value) const; + const std::string & quorum_path, const std::string & is_active_node_value, size_t replicas_num) const; StorageReplicatedMergeTree & storage; StorageMetadataPtr metadata_snapshot; - size_t quorum; + + /// Empty means use majority quorum. + std::optional required_quorum_size; + + size_t getQuorumSize(size_t replicas_num) const; + bool isQuorumEnabled() const; + size_t quorum_timeout_ms; size_t max_parts_per_block; @@ -96,6 +109,8 @@ private: Poco::Logger * log; ContextPtr context; + StorageSnapshotPtr storage_snapshot; + UInt64 chunk_dedup_seqnum = 0; /// input chunk ordinal number in case of dedup token /// We can delay processing for previous chunk and start writing a new one. 
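The ReplicatedMergeTreeSink hunks above replace the fixed quorum counter with an optional required_quorum_size, where an empty value stands for "majority of replicas" (insert_quorum = 'auto') and the old rule that a quorum of 0 or 1 means "disabled" is kept. A standalone sketch of the resulting calculation, reusing the names from the diff purely for illustration (not the actual ClickHouse class):

```cpp
#include <cassert>
#include <cstddef>
#include <optional>

// Illustration of the quorum-size logic introduced above: an empty optional
// means "majority quorum"; an explicit value of 0 or 1 disables quorum writes.
struct QuorumSizeCalculator
{
    std::optional<size_t> required_quorum_size;

    bool isQuorumEnabled() const
    {
        // Majority quorum (empty optional) is always enabled;
        // an explicit value enables quorum only when it is greater than 1.
        return !required_quorum_size.has_value() || required_quorum_size.value() > 1;
    }

    size_t getQuorumSize(size_t replicas_num) const
    {
        if (!isQuorumEnabled())
            return 0;

        if (required_quorum_size)
            return required_quorum_size.value();

        // Majority: more than half of the replicas must confirm the insert.
        return replicas_num / 2 + 1;
    }
};

int main()
{
    assert(QuorumSizeCalculator{std::nullopt}.getQuorumSize(5) == 3);  // majority of 5
    assert(QuorumSizeCalculator{std::nullopt}.getQuorumSize(4) == 3);  // majority of 4
    assert(QuorumSizeCalculator{2}.getQuorumSize(5) == 2);             // explicit quorum
    assert(!QuorumSizeCalculator{1}.isQuorumEnabled());                // 1 behaves like "disabled"
    assert(QuorumSizeCalculator{1}.getQuorumSize(5) == 0);
}
```

For example, with 5 replicas a majority quorum resolves to 3, while an explicit insert_quorum of 1 still behaves as if quorum writes were disabled.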
diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index 3c1a04c7824..fc3079a7aa7 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -144,6 +144,8 @@ ContextMutablePtr StorageNATS::addSettings(ContextPtr local_context) const modified_context->setSetting("input_format_skip_unknown_fields", true); modified_context->setSetting("input_format_allow_errors_ratio", 0.); modified_context->setSetting("input_format_allow_errors_num", nats_settings->nats_skip_broken_messages.value); + /// Since we are reusing the same context for all queries executed simultaneously, we don't want to use a shared `analyze_count` + modified_context->setSetting("max_analyze_depth", Field{0}); if (!schema_name.empty()) modified_context->setSetting("format_schema", schema_name); diff --git a/src/Storages/PartitionCommands.h b/src/Storages/PartitionCommands.h index 9807c90bc23..4921cf8e53b 100644 --- a/src/Storages/PartitionCommands.h +++ b/src/Storages/PartitionCommands.h @@ -104,7 +104,7 @@ struct PartitionCommandResultInfo using PartitionCommandsResultInfo = std::vector; -/// Convert partition comands result to Source from single Chunk, which will be +/// Convert partition commands result to Source from single Chunk, which will be /// used to print info to the user. Tries to create narrowest table for given /// results. For example, if all commands were FREEZE commands, than /// old_part_name column will be absent. diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h index 91bf5eeccde..37caa66aae5 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.h @@ -126,7 +126,7 @@ private: static void assertCorrectInsertion(StorageData::Buffer & buffer, size_t column_idx); - /// lsn - log sequnce nuumber, like wal offset (64 bit). + /// lsn - log sequence number, like wal offset (64 bit).
static Int64 getLSNValue(const std::string & lsn) { UInt32 upper_half, lower_half; diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index f831f81cd22..70838daec24 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -242,6 +242,8 @@ ContextMutablePtr StorageRabbitMQ::addSettings(ContextPtr local_context) const modified_context->setSetting("input_format_skip_unknown_fields", true); modified_context->setSetting("input_format_allow_errors_ratio", 0.); modified_context->setSetting("input_format_allow_errors_num", rabbitmq_settings->rabbitmq_skip_broken_messages.value); + /// Since we are reusing the same context for all queries executed simultaneously, we don't want to use a shared `analyze_count` + modified_context->setSetting("max_analyze_depth", Field{0}); if (!schema_name.empty()) modified_context->setSetting("format_schema", schema_name); diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp index c9ddd9147b9..31cb2f2f9c2 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include @@ -10,11 +11,15 @@ #include #include +#include #include #include #include #include +#include + +#include #include #include @@ -29,6 +34,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -166,14 +172,21 @@ StorageEmbeddedRocksDB::StorageEmbeddedRocksDB(const StorageID & table_id_, bool attach, ContextPtr context_, const String & primary_key_, - Int32 ttl_) + Int32 ttl_, + String rocksdb_dir_, + bool read_only_) : IStorage(table_id_) , WithContext(context_->getGlobalContext()) , primary_key{primary_key_} + , rocksdb_dir(std::move(rocksdb_dir_)) , ttl(ttl_) + , read_only(read_only_) { setInMemoryMetadata(metadata_); - rocksdb_dir = context_->getPath() + relative_data_path_; + if (rocksdb_dir.empty()) + { + rocksdb_dir = context_->getPath() + relative_data_path_; + } if (!attach) { fs::create_directories(rocksdb_dir); @@ -192,6 +205,92 @@ void StorageEmbeddedRocksDB::truncate(const ASTPtr &, const StorageMetadataPtr & initDB(); } +void StorageEmbeddedRocksDB::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const +{ + if (commands.empty()) + return; + + if (commands.size() > 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Mutations cannot be combined for EmbeddedRocksDB"); + + const auto command_type = commands.front().type; + if (command_type != MutationCommand::Type::UPDATE && command_type != MutationCommand::Type::DELETE) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Only DELETE and UPDATE mutations are supported for EmbeddedRocksDB"); +} + +void StorageEmbeddedRocksDB::mutate(const MutationCommands & commands, ContextPtr context_) +{ + if (commands.empty()) + return; + + assert(commands.size() == 1); + + auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage = getStorageID(); + auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context_); + + if (commands.front().type == MutationCommand::Type::DELETE) + { + auto interpreter = std::make_unique( + storage_ptr, + metadata_snapshot, + commands, + context_, + /*can_execute_*/ true, + /*return_all_columns_*/ true, + /*return_deleted_rows_*/ true); + auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); + PullingPipelineExecutor executor(pipeline); + + auto sink = std::make_shared(*this,
metadata_snapshot); + + Block block; + while (executor.pull(block)) + { + auto column_it = std::find_if(block.begin(), block.end(), [&](const auto & column) { return column.name == primary_key; }); + assert(column_it != block.end()); + + auto column = column_it->column; + auto size = column->size(); + + rocksdb::WriteBatch batch; + WriteBufferFromOwnString wb_key; + for (size_t i = 0; i < size; ++i) + { + wb_key.restart(); + + column_it->type->getDefaultSerialization()->serializeBinary(*column, i, wb_key); + auto status = batch.Delete(wb_key.str()); + if (!status.ok()) + throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR); + } + + auto status = rocksdb_ptr->Write(rocksdb::WriteOptions(), &batch); + if (!status.ok()) + throw Exception("RocksDB write error: " + status.ToString(), ErrorCodes::ROCKSDB_ERROR); + } + + return; + } + + assert(commands.front().type == MutationCommand::Type::UPDATE); + if (commands.front().column_to_update_expression.contains(primary_key)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Primary key cannot be updated"); + + auto interpreter = std::make_unique( + storage_ptr, metadata_snapshot, commands, context_, /*can_execute_*/ true, /*return_all_columns*/ true); + auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute()); + PullingPipelineExecutor executor(pipeline); + + auto sink = std::make_shared(*this, metadata_snapshot); + + Block block; + while (executor.pull(block)) + { + sink->consume(Chunk{block.getColumns(), block.rows()}); + } +} + void StorageEmbeddedRocksDB::initDB() { rocksdb::Status status; @@ -269,7 +368,7 @@ void StorageEmbeddedRocksDB::initDB() if (ttl > 0) { rocksdb::DBWithTTL * db; - status = rocksdb::DBWithTTL::Open(merged, rocksdb_dir, &db, ttl); + status = rocksdb::DBWithTTL::Open(merged, rocksdb_dir, &db, ttl, read_only); if (!status.ok()) { throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", @@ -280,7 +379,14 @@ void StorageEmbeddedRocksDB::initDB() else { rocksdb::DB * db; - status = rocksdb::DB::Open(merged, rocksdb_dir, &db); + if (read_only) + { + status = rocksdb::DB::OpenForReadOnly(merged, rocksdb_dir, &db); + } + else + { + status = rocksdb::DB::Open(merged, rocksdb_dir, &db); + } if (!status.ok()) { throw Exception(ErrorCodes::ROCKSDB_ERROR, "Failed to open rocksdb path at: {}: {}", @@ -351,15 +457,21 @@ static StoragePtr create(const StorageFactory::Arguments & args) { // TODO custom RocksDBSettings, table function auto engine_args = args.engine_args; - if (engine_args.size() > 1) + if (engine_args.size() > 3) { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Engine {} requires at most 1 parameter. ({} given). Correct usage: EmbeddedRocksDB([ttl])", + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Engine {} requires at most 3 parameters. ({} given). 
Correct usage: EmbeddedRocksDB([ttl, rocksdb_dir, read_only])", args.engine_name, engine_args.size()); } Int32 ttl{0}; + String rocksdb_dir; + bool read_only{false}; if (!engine_args.empty()) ttl = checkAndGetLiteralArgument(engine_args[0], "ttl"); + if (engine_args.size() > 1) + rocksdb_dir = checkAndGetLiteralArgument(engine_args[1], "rocksdb_dir"); + if (engine_args.size() > 2) + read_only = checkAndGetLiteralArgument(engine_args[2], "read_only"); StorageInMemoryMetadata metadata; metadata.setColumns(args.columns); @@ -374,7 +486,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) { throw Exception("StorageEmbeddedRocksDB must require one column in primary key", ErrorCodes::BAD_ARGUMENTS); } - return std::make_shared(args.table_id, args.relative_data_path, metadata, args.attach, args.getContext(), primary_key_names[0], ttl); + return std::make_shared(args.table_id, args.relative_data_path, metadata, args.attach, args.getContext(), primary_key_names[0], ttl, std::move(rocksdb_dir), read_only); } std::shared_ptr StorageEmbeddedRocksDB::getRocksDBStatistics() const diff --git a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h index ab87eac3e66..03848510e66 100644 --- a/src/Storages/RocksDB/StorageEmbeddedRocksDB.h +++ b/src/Storages/RocksDB/StorageEmbeddedRocksDB.h @@ -33,7 +33,9 @@ public: bool attach, ContextPtr context_, const String & primary_key_, - Int32 ttl_ = 0); + Int32 ttl_ = 0, + String rocksdb_dir_ = "", + bool read_only_ = false); std::string getName() const override { return "EmbeddedRocksDB"; } @@ -49,6 +51,9 @@ public: SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context) override; void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override; + void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const override; + void mutate(const MutationCommands &, ContextPtr) override; + bool supportsParallelInsert() const override { return true; } bool supportsIndexForIn() const override { return true; } bool mayBenefitFromIndexForIn( @@ -82,6 +87,7 @@ private: mutable std::shared_mutex rocksdb_ptr_mx; String rocksdb_dir; Int32 ttl; + bool read_only; void initDB(); }; diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index f3f1162287f..e4dbfe15095 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -95,7 +95,7 @@ protected: ++name_and_type; } - fillMissingColumns(columns, src.rows(), column_names_and_types, /*metadata_snapshot=*/ nullptr); + fillMissingColumns(columns, src.rows(), column_names_and_types, column_names_and_types, {}, nullptr); assert(std::all_of(columns.begin(), columns.end(), [](const auto & column) { return column != nullptr; })); return Chunk(std::move(columns), src.rows()); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 507047751f3..5adc1974257 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -963,7 +963,7 @@ bool StorageMergeTree::merge( if (!merge_mutate_entry) return false; - /// Copying a vector of columns `deduplicate bu columns. + /// Copying a vector of columns `deduplicate by columns. 
IExecutableTask::TaskResultCallback f = [](bool) {}; auto task = std::make_shared( *this, metadata_snapshot, deduplicate, deduplicate_by_columns, merge_mutate_entry, table_lock_holder, f); @@ -1202,7 +1202,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign cleared_count += clearOldMutations(); cleared_count += clearEmptyParts(); if (getSettings()->merge_tree_enable_clear_old_broken_detached) - cleared_count += clearOldBrokenPartsFromDetachedDirecory(); + cleared_count += clearOldBrokenPartsFromDetachedDirectory(); return cleared_count; /// TODO maybe take into account number of cleared objects when calculating backoff }, common_assignee_trigger, getStorageID()), /* need_trigger */ false); @@ -1785,7 +1785,7 @@ void StorageMergeTree::backupData(BackupEntriesCollector & backup_entries_collec for (const auto & data_part : data_parts) min_data_version = std::min(min_data_version, data_part->info.getDataVersion()); - backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup)); + backup_entries_collector.addBackupEntries(backupParts(data_parts, data_path_in_backup, local_context)); backup_entries_collector.addBackupEntries(backupMutations(min_data_version + 1, data_path_in_backup)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 762c3d52627..b1612e217e8 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2180,7 +2180,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) if (interserver_scheme != address.scheme) throw Exception("Interserver schemas are different '" + interserver_scheme + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::LOGICAL_ERROR); - part_desc->res_part = fetcher.fetchPart( + part_desc->res_part = fetcher.fetchSelectedPart( metadata_snapshot, getContext(), part_desc->found_new_part_name, source_replica_path, address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme, replicated_fetches_throttler, false, TMP_PREFIX + "fetch_"); @@ -2299,7 +2299,7 @@ void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entr + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::LOGICAL_ERROR); - return fetcher.fetchPart( + return fetcher.fetchSelectedPart( metadata_snapshot, getContext(), entry.new_part_name, source_replica_path, address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme, @@ -3641,8 +3641,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_ if (quorum_entry.replicas.size() >= quorum_entry.required_number_of_replicas) { /// The quorum is reached. Delete the node, and update information about the last part that was successfully written with quorum. 
- LOG_TRACE(log, "Got {} replicas confirmed quorum {}, going to remove node", - quorum_entry.replicas.size(), quorum_status_path); + LOG_TRACE(log, "Got {} (of {}) replicas confirmed quorum {}, going to remove node", + quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas, quorum_status_path); Coordination::Requests ops; Coordination::Responses responses; @@ -3690,8 +3690,8 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_ } else { - LOG_TRACE(log, "Quorum {} still not satisfied (have only {} replicas), updating node", - quorum_status_path, quorum_entry.replicas.size()); + LOG_TRACE(log, "Quorum {} still not satisfied (have only {} of {} replicas), updating node", + quorum_status_path, quorum_entry.replicas.size(), quorum_entry.required_number_of_replicas); /// We update the node, registering there one more replica. auto code = zookeeper->trySet(quorum_status_path, quorum_entry.toString(), stat.version); @@ -3831,9 +3831,10 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path); + auto settings_ptr = getSettings(); TableLockHolder table_lock_holder; if (!to_detached) - table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); + table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, settings_ptr->lock_acquire_timeout_for_background_operations); /// Logging Stopwatch stopwatch; @@ -3857,7 +3858,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora covered_part_info.mutation = 0; auto source_part = getActiveContainingPart(covered_part_info); - if (source_part) + /// Fetch for zero-copy replication is cheap and straightforward, so we don't use local clone here + if (source_part && (!settings_ptr->allow_remote_fs_zero_copy_replication || !source_part->data_part_storage->supportZeroCopyReplication())) { auto source_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums( source_part->getColumns(), source_part->checksums); @@ -3897,7 +3899,6 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora part_to_clone = source_part; } } - } ReplicatedMergeTreeAddress address; @@ -3933,7 +3934,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH); - return fetcher.fetchPart( + return fetcher.fetchSelectedPart( metadata_snapshot, getContext(), part_name, @@ -4070,7 +4071,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart( currently_fetching_parts.erase(part_name); }); - LOG_DEBUG(log, "Fetching part {} from {}", part_name, source_replica_path); + LOG_DEBUG(log, "Fetching already known part {} from {}", part_name, source_replica_path); TableLockHolder table_lock_holder = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations); @@ -4100,7 +4101,7 @@ DataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart( + "' != '" + address.scheme + "', can't fetch part from " + address.host, ErrorCodes::INTERSERVER_SCHEME_DOESNT_MATCH); - return fetcher.fetchPart( + return fetcher.fetchSelectedPart( metadata_snapshot, getContext(), part_name, source_replica_path, address.host, address.replication_port, timeouts, credentials->getUser(), credentials->getPassword(), @@ -4304,12 +4305,12 @@ 
ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock StorageReplicatedMerg auto added_parts = part_with_quorum.added_parts; for (const auto & added_part : added_parts) + { if (!getActiveContainingPart(added_part.second)) - throw Exception( - "Replica doesn't have part " + added_part.second - + " which was successfully written to quorum of other replicas." - " Send query to another replica or disable 'select_sequential_consistency' setting.", - ErrorCodes::REPLICA_IS_NOT_IN_QUORUM); + throw Exception(ErrorCodes::REPLICA_IS_NOT_IN_QUORUM, + "Replica doesn't have part '{}' which was successfully written to quorum of other replicas. " + "Send query to another replica or disable 'select_sequential_consistency' setting", added_part.second); + } for (const auto & max_block : part_with_quorum.getMaxInsertedBlocks()) max_added_blocks[max_block.first] = max_block.second; @@ -4430,13 +4431,13 @@ SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, con bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate; // TODO: should we also somehow pass list of columns to deduplicate on to the ReplicatedMergeTreeSink? - // TODO: insert_quorum = 'auto' would be supported in https://github.com/ClickHouse/ClickHouse/pull/39970, now it's same as 0. return std::make_shared( *this, metadata_snapshot, query_settings.insert_quorum.valueOr(0), query_settings.insert_quorum_timeout.totalMilliseconds(), query_settings.max_partitions_per_insert_block, query_settings.insert_quorum_parallel, deduplicate, + query_settings.insert_quorum.is_auto, local_context); } @@ -4582,7 +4583,7 @@ bool StorageReplicatedMergeTree::executeMetadataAlter(const StorageReplicatedMer if (entry.alter_version < metadata_version) { /// TODO Can we replace it with LOGICAL_ERROR? - /// As for now, it may rerely happen due to reordering of ALTER_METADATA entries in the queue of + /// As for now, it may rarely happen due to reordering of ALTER_METADATA entries in the queue of /// non-initial replica and also may happen after stale replica recovery. LOG_WARNING(log, "Attempt to update metadata of version {} " "to older version {} when processing log entry {}: {}", @@ -4664,7 +4665,7 @@ PartitionBlockNumbersHolder StorageReplicatedMergeTree::allocateBlockNumbersInAf } else { - /// TODO: Implement optimal block number aqcuisition algorithm in multiple (but not all) partitions + /// TODO: Implement optimal block number acquisition algorithm in multiple (but not all) partitions EphemeralLocksInAllPartitions lock_holder( fs::path(zookeeper_path) / "block_numbers", "block-", fs::path(zookeeper_path) / "temp", *zookeeper); @@ -4841,7 +4842,7 @@ void StorageReplicatedMergeTree::alter( Coordination::Responses results; Coordination::Error rc = zookeeper->tryMulti(ops, results); - /// For the sake of constitency with mechanics of concurrent background process of assigning parts merge tasks + /// For the sake of consistency with mechanics of concurrent background process of assigning parts merge tasks /// this placeholder must be held up until the moment of committing into ZK of the mutation entry /// See ReplicatedMergeTreeMergePredicate::canMergeTwoParts() method partition_block_numbers_holder.reset(); @@ -5125,7 +5126,7 @@ PartitionCommandsResultInfo StorageReplicatedMergeTree::attachPartition( MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts); /// TODO Allow to use quorum here. 
- ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, query_context, + ReplicatedMergeTreeSink output(*this, metadata_snapshot, 0, 0, 0, false, false, false, query_context, /*is_attach*/true); for (size_t i = 0; i < loaded_parts.size(); ++i) @@ -5897,7 +5898,7 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, Conte /// partitions, saves them in the mutation entry and writes the mutation entry to a new ZK node in /// the /mutations folder. This block numbers are needed to determine which parts should be mutated and /// which shouldn't (parts inserted after the mutation will have the block number higher than the - /// block number acquired by the mutation in that partition and so will not be mutatied). + /// block number acquired by the mutation in that partition and so will not be mutated). /// This block number is called "mutation version" in that partition. /// /// Mutation versions are acquired atomically in all partitions, so the case when an insert in some @@ -7217,7 +7218,7 @@ bool StorageReplicatedMergeTree::addOpsToDropAllPartsInPartition( } void StorageReplicatedMergeTree::dropAllPartsInPartitions( - zkutil::ZooKeeper & zookeeper, const Strings partition_ids, std::vector & entries, ContextPtr query_context, bool detach) + zkutil::ZooKeeper & zookeeper, const Strings & partition_ids, std::vector & entries, ContextPtr query_context, bool detach) { entries.reserve(partition_ids.size()); @@ -7335,6 +7336,21 @@ CheckResults StorageReplicatedMergeTree::checkData(const ASTPtr & query, Context } +bool StorageReplicatedMergeTree::canUseZeroCopyReplication() const +{ + auto settings_ptr = getSettings(); + if (!settings_ptr->allow_remote_fs_zero_copy_replication) + return false; + + auto disks = getStoragePolicy()->getDisks(); + for (const auto & disk : disks) + { + if (disk->supportZeroCopyReplication()) + return true; + } + return false; +} + void StorageReplicatedMergeTree::checkBrokenDisks() { auto disks = getStoragePolicy()->getDisks(); @@ -7538,21 +7554,42 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, std::pair StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const { - if (!part.data_part_storage || !part.isStoredOnDisk()) + auto settings = getSettings(); + if (!settings->allow_remote_fs_zero_copy_replication) return std::make_pair(true, NameSet{}); - if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication()) + if (!part.data_part_storage) + LOG_WARNING(log, "Datapart storage for part {} (temp: {}) is not initialized", part.name, part.is_temp); + + if (!part.data_part_storage || !part.isStoredOnDisk()) + { + LOG_TRACE(log, "Part {} is not stored on disk, blobs can be removed", part.name); return std::make_pair(true, NameSet{}); + } + + if (!part.data_part_storage || !part.data_part_storage->supportZeroCopyReplication()) + { + LOG_TRACE(log, "Part {} is not stored on zero-copy replicated disk, blobs can be removed", part.name); + return std::make_pair(true, NameSet{}); + } /// If part is temporary refcount file may be absent if (part.data_part_storage->exists(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK)) { auto ref_count = part.data_part_storage->getRefCount(IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK); if (ref_count > 0) /// Keep part shard info for frozen backups + { + LOG_TRACE(log, "Part {} has more than zero local references ({}), blobs cannot be removed", part.name, ref_count); return std::make_pair(false, NameSet{}); + } + else + 
{ + LOG_TRACE(log, "Part {} has zero local references, will check blobs can be removed in zookeeper", part.name); + } } else { + LOG_TRACE(log, "Part {} looks temporary, because checksums file doesn't exist, blobs can be removed", part.name); /// Temporary part with some absent file cannot be locked in shared mode return std::make_pair(true, NameSet{}); } @@ -7600,10 +7637,14 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( if (!children.empty()) { - LOG_TRACE(logger, "Found {} ({}) zookeper locks for {}", zookeeper_part_uniq_node, children.size(), fmt::join(children, ", ")); + LOG_TRACE(logger, "Found {} ({}) zookeeper locks for {}", children.size(), fmt::join(children, ", "), zookeeper_part_uniq_node); part_has_no_more_locks = false; continue; } + else + { + LOG_TRACE(logger, "No more children left for {}, will try to remove the whole node", zookeeper_part_uniq_node); + } auto error_code = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node); @@ -7654,7 +7695,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( } else { - LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) were concurrently created", + LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) exist", zookeeper_part_node, part_name, children.size(), fmt::join(children, ", ")); } } @@ -7706,12 +7747,12 @@ String StorageReplicatedMergeTree::getSharedDataReplica( String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / id; Strings id_replicas; zookeeper->tryGetChildren(zookeeper_part_uniq_node, id_replicas); - LOG_TRACE(log, "Found zookeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); + LOG_TRACE(log, "Found zookeeper replicas for {}: {}", zookeeper_part_uniq_node, id_replicas.size()); replicas.insert(id_replicas.begin(), id_replicas.end()); } } - LOG_TRACE(log, "Found zookeper replicas for part {}: {}", part.name, replicas.size()); + LOG_TRACE(log, "Found zookeeper replicas for part {}: {}", part.name, replicas.size()); Strings active_replicas; @@ -7724,7 +7765,7 @@ String StorageReplicatedMergeTree::getSharedDataReplica( if ((replica != replica_name) && (zookeeper->exists(fs::path(zookeeper_path) / "replicas" / replica / "is_active"))) active_replicas.push_back(replica); - LOG_TRACE(log, "Found zookeper active replicas for part {}: {}", part.name, active_replicas.size()); + LOG_TRACE(log, "Found zookeeper active replicas for part {}: {}", part.name, active_replicas.size()); if (active_replicas.empty()) return ""; @@ -8159,7 +8200,7 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode( if (!created) { - String mode_str = mode == zkutil::CreateMode::Persistent ? "persistent" : "ephemral"; + String mode_str = mode == zkutil::CreateMode::Persistent ?
"persistent" : "ephemeral"; throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create {} zero copy lock {} because part was unlocked from zookeeper", mode_str, zookeeper_node); } } @@ -8264,7 +8305,7 @@ void StorageReplicatedMergeTree::backupData( else data_parts = getVisibleDataPartsVector(local_context); - auto backup_entries = backupParts(data_parts, ""); + auto backup_entries = backupParts(data_parts, /* data_path_in_backup */ "", local_context); auto coordination = backup_entries_collector.getBackupCoordination(); String shared_id = getTableSharedID(); @@ -8394,7 +8435,7 @@ void StorageReplicatedMergeTree::restoreDataFromBackup(RestorerFromBackup & rest void StorageReplicatedMergeTree::attachRestoredParts(MutableDataPartsVector && parts) { auto metadata_snapshot = getInMemoryMetadataPtr(); - auto sink = std::make_shared(*this, metadata_snapshot, 0, 0, 0, false, false, getContext(), /*is_attach*/true); + auto sink = std::make_shared(*this, metadata_snapshot, 0, 0, 0, false, false, false, getContext(), /*is_attach*/true); for (auto part : parts) sink->writeExistingPart(part); } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 2e2a5ca79b7..14def28309b 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -327,6 +327,7 @@ public: static bool removeSharedDetachedPart(DiskPtr disk, const String & path, const String & part_name, const String & table_uuid, const String & zookeeper_name, const String & replica_name, const String & zookeeper_path, ContextPtr local_context); + bool canUseZeroCopyReplication() const; private: std::atomic_bool are_restoring_replica {false}; @@ -754,7 +755,7 @@ private: std::vector & delimiting_block_locks, std::vector & log_entry_ops_idx); void dropAllPartsInPartitions( - zkutil::ZooKeeper & zookeeper, const Strings partition_ids, std::vector & entries, ContextPtr query_context, bool detach); + zkutil::ZooKeeper & zookeeper, const Strings & partition_ids, std::vector & entries, ContextPtr query_context, bool detach); LogEntryPtr dropAllPartsInPartition( zkutil::ZooKeeper & zookeeper, const String & partition_id, ContextPtr query_context, bool detach); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 1685de55b6e..627679d6779 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1076,7 +1076,8 @@ void StorageS3::updateS3Configuration(ContextPtr ctx, StorageS3::S3Configuration S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( settings.auth_settings.region, ctx->getRemoteHostFilter(), ctx->getGlobalContext()->getSettingsRef().s3_max_redirects, - ctx->getGlobalContext()->getSettingsRef().enable_s3_requests_logging); + ctx->getGlobalContext()->getSettingsRef().enable_s3_requests_logging, + /* for_disk_s3 = */ false); client_configuration.endpointOverride = upd.uri.endpoint; client_configuration.maxConnections = upd.rw_settings.max_connections; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index c3da392b9cd..8d0d85a39ef 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -100,20 +100,19 @@ namespace ReadWriteBufferFromHTTP::HTTPHeaderEntries getHeaders(const ReadWriteBufferFromHTTP::HTTPHeaderEntries & headers_) { ReadWriteBufferFromHTTP::HTTPHeaderEntries headers(headers_.begin(), headers_.end()); - // Propagate OpenTelemetry trace context, if any, downstream. 
- if (CurrentThread::isInitialized()) - { - const auto & thread_trace_context = CurrentThread::get().thread_trace_context; - if (thread_trace_context.trace_id != UUID()) - { - headers.emplace_back("traceparent", thread_trace_context.composeTraceparentHeader()); - if (!thread_trace_context.tracestate.empty()) - { - headers.emplace_back("tracestate", thread_trace_context.tracestate); - } + // Propagate OpenTelemetry trace context, if any, downstream. + const auto ¤t_trace_context = OpenTelemetry::CurrentContext(); + if (current_trace_context.isTraceEnabled()) + { + headers.emplace_back("traceparent", current_trace_context.composeTraceparentHeader()); + + if (!current_trace_context.tracestate.empty()) + { + headers.emplace_back("tracestate", current_trace_context.tracestate); } } + return headers; } diff --git a/src/Storages/System/StorageSystemDistributionQueue.cpp b/src/Storages/System/StorageSystemDistributionQueue.cpp index d8879c3655e..5297c4eb93c 100644 --- a/src/Storages/System/StorageSystemDistributionQueue.cpp +++ b/src/Storages/System/StorageSystemDistributionQueue.cpp @@ -57,7 +57,7 @@ std::string maskDataPath(const std::string & path) size_t user_pw_end = masked_path.find('@', node_pos); if (user_pw_end == std::string::npos) { - /// Likey new format (use_compact_format_in_distributed_parts_names=1) + /// Likely new format (use_compact_format_in_distributed_parts_names=1) return path; } diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 87a5afe2439..8837c11970d 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -242,7 +242,7 @@ void StorageSystemPartsColumns::processNextStorage( IDataType::forEachSubcolumn([&](const auto & subpath, const auto & name, const auto & data) { /// We count only final subcolumns, which are represented by files on disk - /// and skip intermediate suibcolumns of types Tuple and Nested. + /// and skip intermediate subcolumns of types Tuple and Nested. 
if (isTuple(data.type) || isNested(data.type)) return; @@ -270,7 +270,7 @@ void StorageSystemPartsColumns::processNextStorage( subcolumn_data_uncompressed_bytes.push_back(size.data_uncompressed); subcolumn_marks_bytes.push_back(size.marks); - }, { serialization, column.type, nullptr, nullptr }); + }, ISerialization::SubstreamData(serialization).withType(column.type)); if (columns_mask[src_index++]) columns[res_index++]->insert(subcolumn_names); diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index fe7aaf97970..de7e1911e44 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -1,6 +1,7 @@ #include "StorageSystemRemoteDataPaths.h" #include #include +#include #include #include #include @@ -23,6 +24,8 @@ StorageSystemRemoteDataPaths::StorageSystemRemoteDataPaths(const StorageID & tab {"cache_base_path", std::make_shared()}, {"local_path", std::make_shared()}, {"remote_path", std::make_shared()}, + {"size", std::make_shared()}, + {"common_prefix_for_blobs", std::make_shared()}, {"cache_paths", std::make_shared(std::make_shared())}, })); setInMemoryMetadata(storage_metadata); @@ -44,6 +47,8 @@ Pipe StorageSystemRemoteDataPaths::read( MutableColumnPtr col_cache_base_path = ColumnString::create(); MutableColumnPtr col_local_path = ColumnString::create(); MutableColumnPtr col_remote_path = ColumnString::create(); + MutableColumnPtr col_size = ColumnUInt64::create(); + MutableColumnPtr col_namespace = ColumnString::create(); MutableColumnPtr col_cache_paths = ColumnArray::create(ColumnString::create()); auto disks = context->getDisksMap(); @@ -61,7 +66,7 @@ Pipe StorageSystemRemoteDataPaths::read( if (!cache_base_path.empty()) cache = FileCacheFactory::instance().get(cache_base_path); - for (const auto & [local_path, storage_objects] : remote_paths_by_local_path) + for (const auto & [local_path, common_prefix_for_objects, storage_objects] : remote_paths_by_local_path) { for (const auto & object : storage_objects) { @@ -70,6 +75,8 @@ Pipe StorageSystemRemoteDataPaths::read( col_cache_base_path->insert(cache_base_path); col_local_path->insert(local_path); col_remote_path->insert(object.absolute_path); + col_size->insert(object.bytes_size); + col_namespace->insert(common_prefix_for_objects); if (cache) { @@ -91,6 +98,8 @@ Pipe StorageSystemRemoteDataPaths::read( res_columns.emplace_back(std::move(col_cache_base_path)); res_columns.emplace_back(std::move(col_local_path)); res_columns.emplace_back(std::move(col_remote_path)); + res_columns.emplace_back(std::move(col_size)); + res_columns.emplace_back(std::move(col_namespace)); res_columns.emplace_back(std::move(col_cache_paths)); UInt64 num_rows = res_columns.at(0)->size(); diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index b419c4cfeed..4b9a87b93f1 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -69,13 +69,14 @@ public: virtual ~ITableFunction() = default; +protected: + virtual AccessType getSourceAccessType() const; + private: virtual StoragePtr executeImpl( const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const = 0; virtual const char * getStorageTypeName() const = 0; - - virtual AccessType getSourceAccessType() const; }; using TableFunctionPtr = std::shared_ptr; diff --git a/src/TableFunctions/TableFunctionHDFS.cpp
b/src/TableFunctions/TableFunctionHDFS.cpp index ed3000ec152..57f692eadad 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include namespace DB { @@ -29,7 +31,10 @@ StoragePtr TableFunctionHDFS::getStorage( ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context) const { if (structure == "auto") + { + context->checkAccess(getSourceAccessType()); return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + } return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp index 4a68fec1a5e..385d280a100 100644 --- a/src/TableFunctions/TableFunctionHDFSCluster.cpp +++ b/src/TableFunctions/TableFunctionHDFSCluster.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,10 @@ void TableFunctionHDFSCluster::parseArguments(const ASTPtr & ast_function, Conte ColumnsDescription TableFunctionHDFSCluster::getActualTableStructure(ContextPtr context) const { if (structure == "auto") + { + context->checkAccess(getSourceAccessType()); return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); + } return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index e81b67d70a4..86a7e9a0eae 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -133,6 +134,7 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) { if (configuration.structure == "auto") { + context->checkAccess(getSourceAccessType()); return StorageS3::getTableStructureFromData( configuration.format, S3::URI(Poco::URI(configuration.url)), diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index d76bd954d27..5c61207b717 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ ColumnsDescription TableFunctionS3Cluster::getActualTableStructure(ContextPtr co { if (configuration.structure == "auto") { + context->checkAccess(getSourceAccessType()); return StorageS3::getTableStructureFromData( configuration.format, S3::URI(Poco::URI(configuration.url)), diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index bbae0990062..99ec87c2e8f 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include @@ -113,12 +114,15 @@ ReadWriteBufferFromHTTP::HTTPHeaderEntries TableFunctionURL::getHeaders() const ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context) const { if (structure == "auto") + { + context->checkAccess(getSourceAccessType()); return StorageURL::getTableStructureFromData(format, filename, chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), getHeaders(), std::nullopt, context); + } return parseColumnsListFromString(structure, context); } diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 9f3ddbe9932..8f94ef4a915 100644 --- 
a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -17,7 +17,7 @@ from env_helper import ( from s3_helper import S3Helper from get_robot_token import get_best_robot_token from pr_info import PRInfo -from build_download_helper import get_build_name_for_check, get_build_urls +from build_download_helper import get_build_name_for_check, read_build_urls from docker_pull_helper import get_image_with_version from commit_status_helper import post_commit_status from clickhouse_helper import ClickHouseHelper, prepare_tests_results_for_clickhouse @@ -29,7 +29,11 @@ IMAGE_NAME = "clickhouse/fuzzer" def get_run_command(pr_number, sha, download_url, workspace_path, image): return ( - f"docker run --network=host --volume={workspace_path}:/workspace " + f"docker run " + # For sysctl + "--privileged " + "--network=host " + f"--volume={workspace_path}:/workspace " "--cap-add syslog --cap-add sys_admin --cap-add=SYS_PTRACE " f'-e PR_TO_TEST={pr_number} -e SHA_TO_TEST={sha} -e BINARY_URL_TO_DOWNLOAD="{download_url}" ' f"{image}" @@ -69,7 +73,7 @@ if __name__ == "__main__": build_name = get_build_name_for_check(check_name) print(build_name) - urls = get_build_urls(build_name, reports_path) + urls = read_build_urls(build_name, reports_path) if not urls: raise Exception("No build URLs found") diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index f5eb72dddee..58997bed253 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -1,11 +1,11 @@ #!/usr/bin/env python3 -import os import json import logging +import os import sys import time -from typing import Optional +from typing import List, Optional import requests # type: ignore @@ -41,11 +41,11 @@ def get_with_retries( return response -def get_build_name_for_check(check_name): +def get_build_name_for_check(check_name) -> str: return CI_CONFIG["tests_config"][check_name]["required_build"] -def get_build_urls(build_name, reports_path): +def read_build_urls(build_name, reports_path) -> List[str]: for root, _, files in os.walk(reports_path): for f in files: if build_name in f: @@ -56,7 +56,7 @@ def get_build_urls(build_name, reports_path): return [] -def dowload_build_with_progress(url, path): +def download_build_with_progress(url, path): logging.info("Downloading from %s to temp path %s", url, path) for i in range(DOWNLOAD_RETRIES_COUNT): try: @@ -104,14 +104,14 @@ def download_builds(result_path, build_urls, filter_fn): if filter_fn(url): fname = os.path.basename(url.replace("%2B", "+").replace("%20", " ")) logging.info("Will download %s to %s", fname, result_path) - dowload_build_with_progress(url, os.path.join(result_path, fname)) + download_build_with_progress(url, os.path.join(result_path, fname)) def download_builds_filter( check_name, reports_path, result_path, filter_fn=lambda _: True ): build_name = get_build_name_for_check(check_name) - urls = get_build_urls(build_name, reports_path) + urls = read_build_urls(build_name, reports_path) print(urls) if not urls: diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index a0a77a2684e..064a0b3add1 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -206,14 +206,7 @@ Merge it only if you intend to backport changes to the target branch, otherwise ) self.cherrypick_pr.add_to_labels(Labels.LABEL_CHERRYPICK) self.cherrypick_pr.add_to_labels(Labels.LABEL_DO_NOT_TEST) - if self.pr.assignees: - logging.info( - "Assing to assignees of the original PR: %s", - ", ".join(user.login for user in self.pr.assignees), 
- ) - self.cherrypick_pr.add_to_assignees(*self.pr.assignees) - logging.info("Assign to the author of the original PR: %s", self.pr.user.login) - self.cherrypick_pr.add_to_assignees(self.pr.user) + self._assign_new_pr(self.cherrypick_pr) def create_backport(self): # Checkout the backport branch from the remote and make all changes to @@ -244,14 +237,21 @@ Merge it only if you intend to backport changes to the target branch, otherwise head=self.backport_branch, ) self.backport_pr.add_to_labels(Labels.LABEL_BACKPORT) + self._assign_new_pr(self.backport_pr) + + def _assign_new_pr(self, new_pr: PullRequest): + """Assign `new_pr` to author, merger and assignees of an original PR""" + # It looks there some race when multiple .add_to_assignees are executed, + # so we'll add all at once + assignees = [self.pr.user, self.pr.merged_by] if self.pr.assignees: - logging.info( - "Assing to assignees of the original PR: %s", - ", ".join(user.login for user in self.pr.assignees), - ) - self.cherrypick_pr.add_to_assignees(*self.pr.assignees) - logging.info("Assign to the author of the original PR: %s", self.pr.user.login) - self.backport_pr.add_to_assignees(self.pr.user) + assignees.extend(self.pr.assignees) + logging.info( + "Assing #%s to author and assignees of the original PR: %s", + new_pr.number, + ", ".join(user.login for user in assignees), + ) + new_pr.add_to_assignees(*assignees) @property def backported(self) -> bool: diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 61dfb07f762..fa68d1982d2 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -161,6 +161,16 @@ CI_CONFIG = { "tidy": "disable", "with_coverage": False, }, + "binary_amd64sse2": { + "compiler": "clang-14-amd64sse2", + "build_type": "", + "sanitizer": "", + "package_type": "binary", + "static_binary_name": "amd64sse2", + "libraries": "static", + "tidy": "disable", + "with_coverage": False, + }, }, "builds_report_config": { "ClickHouse build check": [ @@ -182,18 +192,19 @@ CI_CONFIG = { "binary_freebsd", "binary_darwin_aarch64", "binary_ppc64le", + "binary_amd64sse2", ], }, "tests_config": { # required_build - build name for artifacts # force_tests - force success status for tests - "Stateful tests (address)": { + "Stateful tests (asan)": { "required_build": "package_asan", }, - "Stateful tests (thread)": { + "Stateful tests (tsan)": { "required_build": "package_tsan", }, - "Stateful tests (memory)": { + "Stateful tests (msan)": { "required_build": "package_msan", }, "Stateful tests (ubsan)": { @@ -214,13 +225,13 @@ CI_CONFIG = { "Stateful tests (release, DatabaseReplicated)": { "required_build": "package_release", }, - "Stateless tests (address)": { + "Stateless tests (asan)": { "required_build": "package_asan", }, - "Stateless tests (thread)": { + "Stateless tests (tsan)": { "required_build": "package_tsan", }, - "Stateless tests (memory)": { + "Stateless tests (msan)": { "required_build": "package_msan", }, "Stateless tests (ubsan)": { @@ -247,16 +258,22 @@ CI_CONFIG = { "Stateless tests (release, s3 storage)": { "required_build": "package_release", }, - "Stress test (address)": { - "required_build": "package_asan", + "Stateless tests (debug, s3 storage)": { + "required_build": "package_debug", }, - "Stress test (thread)": { + "Stateless tests (tsan, s3 storage)": { "required_build": "package_tsan", }, - "Stress test (undefined)": { + "Stress test (asan)": { + "required_build": "package_asan", + }, + "Stress test (tsan)": { + "required_build": "package_tsan", + }, + "Stress test (ubsan)": { 
"required_build": "package_ubsan", }, - "Stress test (memory)": { + "Stress test (msan)": { "required_build": "package_msan", }, "Stress test (debug)": { @@ -265,13 +282,13 @@ CI_CONFIG = { "Integration tests (asan)": { "required_build": "package_asan", }, - "Integration tests (thread)": { + "Integration tests (tsan)": { "required_build": "package_tsan", }, "Integration tests (release)": { "required_build": "package_release", }, - "Integration tests (memory)": { + "Integration tests (msan)": { "required_build": "package_msan", }, "Integration tests flaky check (asan)": { @@ -301,19 +318,19 @@ CI_CONFIG = { "AST fuzzer (debug)": { "required_build": "package_debug", }, - "AST fuzzer (ASan)": { + "AST fuzzer (asan)": { "required_build": "package_asan", }, - "AST fuzzer (MSan)": { + "AST fuzzer (msan)": { "required_build": "package_msan", }, - "AST fuzzer (TSan)": { + "AST fuzzer (tsan)": { "required_build": "package_tsan", }, - "AST fuzzer (UBSan)": { + "AST fuzzer (ubsan)": { "required_build": "package_ubsan", }, - "Stateless tests flaky check (address)": { + "Stateless tests flaky check (asan)": { "required_build": "package_asan", }, "ClickHouse Keeper Jepsen": { diff --git a/tests/ci/download_binary.py b/tests/ci/download_binary.py new file mode 100755 index 00000000000..b95c86aa0bd --- /dev/null +++ b/tests/ci/download_binary.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python +""" +This file is needed to avoid cicle import build_download_helper.py <=> env_helper.py +""" + +import argparse +import logging +import os + +from build_download_helper import download_build_with_progress +from ci_config import CI_CONFIG, BuildConfig +from env_helper import RUNNER_TEMP, S3_ARTIFACT_DOWNLOAD_TEMPLATE +from git_helper import Git, commit +from version_helper import get_version_from_repo, version_arg + +TEMP_PATH = os.path.join(RUNNER_TEMP, "download_binary") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Script to download binary artifacts from S3. 
Downloaded artifacts " + "are renamed to clickhouse-{static_binary_name}", + ) + parser.add_argument( + "--version", + type=version_arg, + default=get_version_from_repo().string, + help="a version to generate a download url, get from the repo by default", + ) + parser.add_argument( + "--commit", + type=commit, + default=Git(True).sha, + help="a version to generate a download url, get from the repo by default", + ) + parser.add_argument("--rename", default=True, help=argparse.SUPPRESS) + parser.add_argument( + "--no-rename", + dest="rename", + action="store_false", + default=argparse.SUPPRESS, + help="if set, the downloaded binary won't be renamed to " + "clickhouse-{static_binary_name}, makes sense only for a single build name", + ) + parser.add_argument( + "build_names", + nargs="+", + help="the build names to download", + ) + args = parser.parse_args() + if not args.rename and len(args.build_names) > 1: + parser.error("`--no-rename` shouldn't be used with more than one build name") + return args + + +def main(): + logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") + args = parse_args() + os.makedirs(TEMP_PATH, exist_ok=True) + for build in args.build_names: + # check if it's in CI_CONFIG + config = CI_CONFIG["build_config"][build] # type: BuildConfig + if args.rename: + path = os.path.join(TEMP_PATH, f"clickhouse-{config['static_binary_name']}") + else: + path = os.path.join(TEMP_PATH, "clickhouse") + + url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format( + pr_or_release=f"{args.version.major}.{args.version.minor}", + commit=args.commit, + build_name=build, + artifact="clickhouse", + ) + download_build_with_progress(url, path) + + +if __name__ == "__main__": + main() diff --git a/tests/ci/env_helper.py b/tests/ci/env_helper.py index 12c21398781..a18f47497fd 100644 --- a/tests/ci/env_helper.py +++ b/tests/ci/env_helper.py @@ -22,10 +22,14 @@ IMAGES_PATH = os.getenv("IMAGES_PATH", TEMP_PATH) REPORTS_PATH = os.getenv("REPORTS_PATH", p.abspath(p.join(module_dir, "./reports"))) REPO_COPY = os.getenv("REPO_COPY", git_root) RUNNER_TEMP = os.getenv("RUNNER_TEMP", p.abspath(p.join(module_dir, "./tmp"))) -S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") -S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) S3_BUILDS_BUCKET = os.getenv("S3_BUILDS_BUCKET", "clickhouse-builds") S3_TEST_REPORTS_BUCKET = os.getenv("S3_TEST_REPORTS_BUCKET", "clickhouse-test-reports") +S3_URL = os.getenv("S3_URL", "https://s3.amazonaws.com") +S3_DOWNLOAD = os.getenv("S3_DOWNLOAD", S3_URL) +S3_ARTIFACT_DOWNLOAD_TEMPLATE = ( + f"{S3_DOWNLOAD}/{S3_BUILDS_BUCKET}/" + "{pr_or_release}/{commit}/{build_name}/{artifact}" +) # These parameters are set only on demand, and only once _GITHUB_JOB_ID = "" diff --git a/tests/ci/push_to_artifactory.py b/tests/ci/push_to_artifactory.py index 6b407eb5bd8..dd8081227bf 100755 --- a/tests/ci/push_to_artifactory.py +++ b/tests/ci/push_to_artifactory.py @@ -8,8 +8,8 @@ from collections import namedtuple from typing import Dict, List, Tuple from artifactory import ArtifactorySaaSPath # type: ignore -from build_download_helper import dowload_build_with_progress -from env_helper import RUNNER_TEMP, S3_BUILDS_BUCKET, S3_DOWNLOAD +from build_download_helper import download_build_with_progress +from env_helper import S3_ARTIFACT_DOWNLOAD_TEMPLATE, RUNNER_TEMP from git_helper import TAG_REGEXP, commit, removeprefix, removesuffix @@ -97,18 +97,6 @@ class Packages: class S3: - template = ( - f"{S3_DOWNLOAD}/" - # "clickhouse-builds/" - f"{S3_BUILDS_BUCKET}/" - # "33333/" or "21.11/" 
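For context on the hunks above: `S3_ARTIFACT_DOWNLOAD_TEMPLATE` in `env_helper.py` replaces the inline `S3.template` of `push_to_artifactory.py` and is the same template `download_binary.py` formats into a concrete URL. A minimal sketch of how it is expected to expand, reusing the placeholder values from the old comments (`21.11`, the sample commit hash) plus one of the build names from this patch:

```python
# Sketch of how S3_ARTIFACT_DOWNLOAD_TEMPLATE expands; all concrete values
# below are placeholders, not real artifacts.
S3_DOWNLOAD = "https://s3.amazonaws.com"
S3_BUILDS_BUCKET = "clickhouse-builds"
S3_ARTIFACT_DOWNLOAD_TEMPLATE = (
    f"{S3_DOWNLOAD}/{S3_BUILDS_BUCKET}/"
    "{pr_or_release}/{commit}/{build_name}/{artifact}"
)

url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format(
    pr_or_release="21.11",  # a release branch; a PR number like "33333" also fits here
    commit="2bef313f75e4cacc6ea2ef2133e8849ecf0385ec",
    build_name="binary_amd64sse2",
    artifact="clickhouse",
)
print(url)
# -> https://s3.amazonaws.com/clickhouse-builds/21.11/2bef313f75e4cacc6ea2ef2133e8849ecf0385ec/binary_amd64sse2/clickhouse
```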
from --release, if pull request is omitted - "{pr}/" - # "2bef313f75e4cacc6ea2ef2133e8849ecf0385ec/" - "{commit}/" - # "package_release/clickhouse-common-static_21.11.5.0_amd64.deb" - "{s3_path_suffix}" - ) - def __init__( self, pr: int, @@ -117,7 +105,7 @@ class S3: force_download: bool, ): self._common = dict( - pr=pr, + pr_or_release=pr, commit=commit, ) self.force_download = force_download @@ -133,18 +121,19 @@ class S3: self.packages.replace_with_fallback(package_file) return - url = self.template.format_map( - {**self._common, "s3_path_suffix": s3_path_suffix} + build_name, artifact = s3_path_suffix.split("/") + url = S3_ARTIFACT_DOWNLOAD_TEMPLATE.format_map( + {**self._common, "build_name": build_name, "artifact": artifact} ) try: - dowload_build_with_progress(url, path) + download_build_with_progress(url, path) except Exception as e: if "Cannot download dataset from" in e.args[0]: new_url = Packages.fallback_to_all(url) logging.warning( "Fallback downloading %s for old release", fallback_path ) - dowload_build_with_progress(new_url, fallback_path) + download_build_with_progress(new_url, fallback_path) self.packages.replace_with_fallback(package_file) def download_deb(self): diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 5e6542f6e4c..39dbc938c8f 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -20,8 +20,6 @@ from workflow_approve_rerun_lambda.app import TRUSTED_CONTRIBUTORS NAME = "Run Check" TRUSTED_ORG_IDS = { - 7409213, # yandex - 28471076, # altinity 54801242, # clickhouse } @@ -89,14 +87,19 @@ def should_run_checks_for_pr(pr_info: PRInfo) -> Tuple[bool, str, str]: # Consider the labels and whether the user is trusted. print("Got labels", pr_info.labels) if FORCE_TESTS_LABEL in pr_info.labels: + print(f"Label '{FORCE_TESTS_LABEL}' set, forcing remaining checks") return True, f"Labeled '{FORCE_TESTS_LABEL}'", "pending" if DO_NOT_TEST_LABEL in pr_info.labels: + print(f"Label '{DO_NOT_TEST_LABEL}' set, skipping remaining checks") return False, f"Labeled '{DO_NOT_TEST_LABEL}'", "success" if CAN_BE_TESTED_LABEL not in pr_info.labels and not pr_is_by_trusted_user( pr_info.user_login, pr_info.user_orgs ): + print( + f"PRs by untrusted users need the '{CAN_BE_TESTED_LABEL}' label - please contact a member of the core team" + ) return False, "Needs 'can be tested' label", "failure" if OK_SKIP_LABELS.intersection(pr_info.labels): @@ -221,7 +224,7 @@ if __name__ == "__main__": elif SUBMODULE_CHANGED_LABEL in pr_info.labels: pr_labels_to_remove.append(SUBMODULE_CHANGED_LABEL) - print(f"change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}") + print(f"Change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}") if pr_labels_to_add: post_labels(gh, pr_info, pr_labels_to_add) diff --git a/tests/ci/stress_check.py b/tests/ci/stress_check.py index e644eef3bc8..8f310eaa99d 100644 --- a/tests/ci/stress_check.py +++ b/tests/ci/stress_check.py @@ -33,7 +33,7 @@ def get_run_command( "docker run --cap-add=SYS_PTRACE " # a static link, don't use S3_URL or S3_DOWNLOAD "-e S3_URL='https://s3.amazonaws.com/clickhouse-datasets' " - # For dmesg + # For dmesg and sysctl "--privileged " f"--volume={build_path}:/package_folder " f"--volume={result_folder}:/test_output " diff --git a/tests/ci/version_helper.py b/tests/ci/version_helper.py index de98b8431de..966858c0747 100755 --- a/tests/ci/version_helper.py +++ b/tests/ci/version_helper.py @@ -20,7 +20,7 @@ const char * auto_contributors[] {{ VERSIONS = Dict[str, Union[int, str]] -VERSIONS_TEMPLATE = 
"""# This variables autochanged by release_lib.sh: +VERSIONS_TEMPLATE = """# This variables autochanged by tests/ci/version_helper.py: # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes. diff --git a/tests/ci/workflow_approve_rerun_lambda/app.py b/tests/ci/workflow_approve_rerun_lambda/app.py index 29f3271a34c..39bd9cfb283 100644 --- a/tests/ci/workflow_approve_rerun_lambda/app.py +++ b/tests/ci/workflow_approve_rerun_lambda/app.py @@ -50,8 +50,6 @@ WorkflowDescription = namedtuple( # See https://api.github.com/orgs/{name} TRUSTED_ORG_IDS = { - 7409213, # yandex - 28471076, # altinity 54801242, # clickhouse } @@ -104,8 +102,6 @@ TRUSTED_CONTRIBUTORS = { "kreuzerkrieg", "lehasm", # DOCSUP "michon470", # DOCSUP - "MyroTk", # Tester in Altinity - "myrrc", # Michael Kot, Altinity "nikvas0", "nvartolomei", "olgarev", # DOCSUP diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f40c93c6f5d..14cf4d0674a 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -533,7 +533,7 @@ class TestCase: else: # If --database is not specified, we will create temporary database with # unique name and we will recreate and drop it for each test - def random_str(length=6): + def random_str(length=8): alphabet = string.ascii_lowercase + string.digits # NOTE: it is important not to use default random generator, since it shares state. return "".join( @@ -995,9 +995,12 @@ class TestCase: args.timeout - (datetime.now() - start_time).total_seconds(), 20 ) try: + drop_database_query = "DROP DATABASE " + database + if args.replicated_database: + drop_database_query += " ON CLUSTER test_cluster_database_replicated" clickhouse_execute( args, - "DROP DATABASE " + database, + drop_database_query, timeout=seconds_left, settings={ "log_comment": args.testcase_basename, diff --git a/tests/config/config.d/zookeeper_fault_injection.xml b/tests/config/config.d/zookeeper_fault_injection.xml new file mode 100644 index 00000000000..45d3cc8193d --- /dev/null +++ b/tests/config/config.d/zookeeper_fault_injection.xml @@ -0,0 +1,19 @@ + + + + localhost + 9181 + + + + 0.00002 + 0.00002 + + diff --git a/tests/config/install.sh b/tests/config/install.sh index e7d0f8e7acf..e27675b8abb 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -15,7 +15,6 @@ mkdir -p $DEST_SERVER_PATH/config.d/ mkdir -p $DEST_SERVER_PATH/users.d/ mkdir -p $DEST_CLIENT_PATH -ln -sf $SRC_PATH/config.d/zookeeper.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/zookeeper_write.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/listen.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/text_log.xml $DEST_SERVER_PATH/config.d/ @@ -89,6 +88,12 @@ ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/ ln -sf --backup=simple --suffix=_original.xml \ $SRC_PATH/config.d/query_masking_rules.xml $DEST_SERVER_PATH/config.d/ +if [[ -n "$ZOOKEEPER_FAULT_INJECTION" ]] && [[ "$ZOOKEEPER_FAULT_INJECTION" -eq 1 ]]; then + ln -sf $SRC_PATH/config.d/zookeeper_fault_injection.xml $DEST_SERVER_PATH/config.d/ +else + ln -sf $SRC_PATH/config.d/zookeeper.xml $DEST_SERVER_PATH/config.d/ +fi + # We randomize creating the snapshot on exit for Keeper to test out using older snapshots create_snapshot_on_exit=$(($RANDOM % 2)) sed --follow-symlinks -i "s|true|$create_snapshot_on_exit|" $DEST_SERVER_PATH/config.d/keeper_port.xml diff --git a/tests/instructions/clang-tidy.txt b/tests/instructions/clang-tidy.txt deleted file mode 100644 index 84145564bf0..00000000000 
--- a/tests/instructions/clang-tidy.txt +++ /dev/null @@ -1,2 +0,0 @@ -# clang-tidy has been integrated into CMake: -# --> Build ClickHouse with -DENABLE_CLANG_TIDY=1 and see cmake/clang_tidy.cmake for details diff --git a/tests/integration/runner b/tests/integration/runner index f0d87b23a83..e1b9a55b43e 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -350,8 +350,7 @@ if __name__ == "__main__": # randomizer, we should remove it after Sep 2022 try: subprocess.check_call( - "docker volume rm $(docker volume ls -q | " - f"grep '{VOLUME_NAME}_.*_volume')", + f"docker volume ls -q | grep '{VOLUME_NAME}_.*_volume' | xargs --no-run-if-empty docker volume rm", shell=True, ) except Exception as ex: diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index 2fe3bb99e45..ca0d6a632a0 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -224,6 +224,89 @@ def test_incremental_backup_after_renaming_table(): assert instance.query("SELECT count(), sum(x) FROM test.table2") == "100\t4950\n" +def test_incremental_backup_for_log_family(): + backup_name = new_backup_name() + create_and_fill_table(engine="Log") + + assert instance.query("SELECT count(), sum(x) FROM test.table") == "100\t4950\n" + instance.query(f"BACKUP TABLE test.table TO {backup_name}") + + instance.query("INSERT INTO test.table VALUES (65, 'a'), (66, 'b')") + + assert instance.query("SELECT count(), sum(x) FROM test.table") == "102\t5081\n" + + backup_name2 = new_backup_name() + instance.query(f"BACKUP TABLE test.table TO {backup_name2}") + + backup_name_inc = new_backup_name() + instance.query( + f"BACKUP TABLE test.table TO {backup_name_inc} SETTINGS base_backup = {backup_name}" + ) + + metadata_path = os.path.join( + get_path_to_backup(backup_name), "metadata/test/table.sql" + ) + + metadata_path2 = os.path.join( + get_path_to_backup(backup_name2), "metadata/test/table.sql" + ) + + metadata_path_inc = os.path.join( + get_path_to_backup(backup_name_inc), "metadata/test/table.sql" + ) + + assert os.path.isfile(metadata_path) + assert os.path.isfile(metadata_path2) + assert not os.path.isfile(metadata_path_inc) + assert os.path.getsize(metadata_path) > 0 + assert os.path.getsize(metadata_path) == os.path.getsize(metadata_path2) + + x_bin_path = os.path.join(get_path_to_backup(backup_name), "data/test/table/x.bin") + y_bin_path = os.path.join(get_path_to_backup(backup_name), "data/test/table/y.bin") + + x_bin_path2 = os.path.join( + get_path_to_backup(backup_name2), "data/test/table/x.bin" + ) + y_bin_path2 = os.path.join( + get_path_to_backup(backup_name2), "data/test/table/y.bin" + ) + + x_bin_path_inc = os.path.join( + get_path_to_backup(backup_name_inc), "data/test/table/x.bin" + ) + + y_bin_path_inc = os.path.join( + get_path_to_backup(backup_name_inc), "data/test/table/y.bin" + ) + + assert os.path.isfile(x_bin_path) + assert os.path.isfile(y_bin_path) + assert os.path.isfile(x_bin_path2) + assert os.path.isfile(y_bin_path2) + assert os.path.isfile(x_bin_path_inc) + assert os.path.isfile(y_bin_path_inc) + + x_bin_size = os.path.getsize(x_bin_path) + y_bin_size = os.path.getsize(y_bin_path) + x_bin_size2 = os.path.getsize(x_bin_path2) + y_bin_size2 = os.path.getsize(y_bin_path2) + x_bin_size_inc = os.path.getsize(x_bin_path_inc) + y_bin_size_inc = os.path.getsize(y_bin_path_inc) + + assert x_bin_size > 0 + assert y_bin_size > 0 + assert x_bin_size2 > 0 + assert y_bin_size2 > 0 + assert 
x_bin_size_inc > 0 + assert y_bin_size_inc > 0 + assert x_bin_size2 == x_bin_size + x_bin_size_inc + assert y_bin_size2 == y_bin_size + y_bin_size_inc + + instance.query(f"RESTORE TABLE test.table AS test.table2 FROM {backup_name_inc}") + + assert instance.query("SELECT count(), sum(x) FROM test.table2") == "102\t5081\n" + + def test_backup_not_found_or_already_exists(): backup_name = new_backup_name() diff --git a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py index 2269ccda828..dd8b6aa50da 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_concurrency.py @@ -29,7 +29,6 @@ def generate_cluster_def(): main_configs = ["configs/backups_disk.xml", generate_cluster_def()] - user_configs = ["configs/allow_database_types.xml"] nodes = [] @@ -175,11 +174,21 @@ def test_concurrent_backups_on_different_nodes(): @pytest.mark.parametrize( "db_engine, table_engine", - [("Replicated", "ReplicatedMergeTree"), ("Ordinary", "MergeTree")], + [ + ("Ordinary", "MergeTree"), + ("Atomic", "MergeTree"), + ("Replicated", "ReplicatedMergeTree"), + ("Memory", "MergeTree"), + ("Lazy", "Log"), + ], ) def test_create_or_drop_tables_during_backup(db_engine, table_engine): if db_engine == "Replicated": db_engine = "Replicated('/clickhouse/path/','{shard}','{replica}')" + + if db_engine == "Lazy": + db_engine = "Lazy(20)" + if table_engine.endswith("MergeTree"): table_engine += " ORDER BY tuple()" @@ -189,7 +198,7 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): start_time = time.time() end_time = start_time + 60 - def create_table(): + def create_tables(): while time.time() < end_time: node = nodes[randint(0, num_nodes - 1)] table_name = f"mydb.tbl{randint(1, num_nodes)}" @@ -200,13 +209,13 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): f"INSERT INTO {table_name} SELECT rand32() FROM numbers(10)" ) - def drop_table(): + def drop_tables(): while time.time() < end_time: table_name = f"mydb.tbl{randint(1, num_nodes)}" node = nodes[randint(0, num_nodes - 1)] node.query(f"DROP TABLE IF EXISTS {table_name} NO DELAY") - def rename_table(): + def rename_tables(): while time.time() < end_time: table_name1 = f"mydb.tbl{randint(1, num_nodes)}" table_name2 = f"mydb.tbl{randint(1, num_nodes)}" @@ -215,7 +224,13 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): f"RENAME TABLE {table_name1} TO {table_name2}" ) - def make_backup(): + def truncate_tables(): + while time.time() < end_time: + table_name = f"mydb.tbl{randint(1, num_nodes)}" + node = nodes[randint(0, num_nodes - 1)] + node.query(f"TRUNCATE TABLE IF EXISTS {table_name} NO DELAY") + + def make_backups(): ids = [] while time.time() < end_time: time.sleep( @@ -231,11 +246,12 @@ def test_create_or_drop_tables_during_backup(db_engine, table_engine): ids = [] with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: futures = [] - ids_future = executor.submit(make_backup) + ids_future = executor.submit(make_backups) futures.append(ids_future) - futures.append(executor.submit(create_table)) - futures.append(executor.submit(drop_table)) - futures.append(executor.submit(rename_table)) + futures.append(executor.submit(create_tables)) + futures.append(executor.submit(drop_tables)) + futures.append(executor.submit(rename_tables)) + futures.append(executor.submit(truncate_tables)) for future in futures: future.result() 
ids = ids_future.result() diff --git a/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml b/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml index 7fd5b1a0478..de0fefd2f48 100644 --- a/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml +++ b/tests/integration/test_config_xml_yaml_mix/configs/config.d/path.yaml @@ -1,18 +1,6 @@ -path: - - /var/lib/clickhouse - - "@replace": replace -tmp_path: - - /var/lib/clickhouse/tmp/ - - "@replace": replace -user_files_path: - - /var/lib/clickhouse/user_files/ - - "@replace": replace -format_schema_path: - - /var/lib/clickhouse/format_schemas/ - - "@replace": replace -access_control_path: - - /var/lib/clickhouse/access/ - - "@replace": replace -top_level_domains_path: - - /var/lib/clickhouse/top_level_domains/ - - "@replace": replace +path: /var/lib/clickhouse +tmp_path: /var/lib/clickhouse/tmp/ +user_files_path: /var/lib/clickhouse/user_files/ +format_schema_path: /var/lib/clickhouse/format_schemas/ +access_control_path: /var/lib/clickhouse/access/ +top_level_domains_path: /var/lib/clickhouse/top_level_domains/ diff --git a/tests/integration/test_config_xml_yaml_mix/configs/users.yaml b/tests/integration/test_config_xml_yaml_mix/configs/users.yaml index a87a8c82819..7a28807949f 100644 --- a/tests/integration/test_config_xml_yaml_mix/configs/users.yaml +++ b/tests/integration/test_config_xml_yaml_mix/configs/users.yaml @@ -6,7 +6,6 @@ users: default: password: '' networks: - "@replace": replace ip: '::/0' profile: default diff --git a/tests/integration/test_config_yaml_full/configs/config.d/path.yaml b/tests/integration/test_config_yaml_full/configs/config.d/path.yaml index 7fd5b1a0478..de0fefd2f48 100644 --- a/tests/integration/test_config_yaml_full/configs/config.d/path.yaml +++ b/tests/integration/test_config_yaml_full/configs/config.d/path.yaml @@ -1,18 +1,6 @@ -path: - - /var/lib/clickhouse - - "@replace": replace -tmp_path: - - /var/lib/clickhouse/tmp/ - - "@replace": replace -user_files_path: - - /var/lib/clickhouse/user_files/ - - "@replace": replace -format_schema_path: - - /var/lib/clickhouse/format_schemas/ - - "@replace": replace -access_control_path: - - /var/lib/clickhouse/access/ - - "@replace": replace -top_level_domains_path: - - /var/lib/clickhouse/top_level_domains/ - - "@replace": replace +path: /var/lib/clickhouse +tmp_path: /var/lib/clickhouse/tmp/ +user_files_path: /var/lib/clickhouse/user_files/ +format_schema_path: /var/lib/clickhouse/format_schemas/ +access_control_path: /var/lib/clickhouse/access/ +top_level_domains_path: /var/lib/clickhouse/top_level_domains/ diff --git a/tests/integration/test_config_yaml_full/configs/users.yaml b/tests/integration/test_config_yaml_full/configs/users.yaml index a87a8c82819..7a28807949f 100644 --- a/tests/integration/test_config_yaml_full/configs/users.yaml +++ b/tests/integration/test_config_yaml_full/configs/users.yaml @@ -6,7 +6,6 @@ users: default: password: '' networks: - "@replace": replace ip: '::/0' profile: default diff --git a/tests/integration/test_config_yaml_main/configs/users.yaml b/tests/integration/test_config_yaml_main/configs/users.yaml index a87a8c82819..7a28807949f 100644 --- a/tests/integration/test_config_yaml_main/configs/users.yaml +++ b/tests/integration/test_config_yaml_main/configs/users.yaml @@ -6,7 +6,6 @@ users: default: password: '' networks: - "@replace": replace ip: '::/0' profile: default diff --git a/tests/integration/test_grpc_protocol/test.py b/tests/integration/test_grpc_protocol/test.py index 
469113cd68e..52c583973d0 100644 --- a/tests/integration/test_grpc_protocol/test.py +++ b/tests/integration/test_grpc_protocol/test.py @@ -744,7 +744,7 @@ def test_opentelemetry_context_propagation(): assert ( node.query( f"SELECT attribute['db.statement'], attribute['clickhouse.tracestate'] FROM system.opentelemetry_span_log " - f"WHERE trace_id='{trace_id}' AND parent_span_id={parent_span_id}" + f"WHERE trace_id='{trace_id}' AND operation_name='query'" ) == "SELECT 1\tsome custom state\n" ) diff --git a/tests/integration/test_join_set_family_s3/test.py b/tests/integration/test_join_set_family_s3/test.py index b09d5735628..38b56b7b15b 100644 --- a/tests/integration/test_join_set_family_s3/test.py +++ b/tests/integration/test_join_set_family_s3/test.py @@ -27,7 +27,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_log_family_s3/test.py b/tests/integration/test_log_family_s3/test.py index 76ff0930db3..bed379d098b 100644 --- a/tests/integration/test_log_family_s3/test.py +++ b/tests/integration/test_log_family_s3/test.py @@ -25,7 +25,7 @@ def cluster(): def assert_objects_count(cluster, objects_count, path="data/"): minio = cluster.minio_client - s3_objects = list(minio.list_objects(cluster.minio_bucket, path)) + s3_objects = list(minio.list_objects(cluster.minio_bucket, path, recursive=True)) if objects_count != len(s3_objects): for s3_object in s3_objects: object_meta = minio.stat_object(cluster.minio_bucket, s3_object.object_name) diff --git a/tests/integration/test_merge_tree_s3/test.py b/tests/integration/test_merge_tree_s3/test.py index 544f064bdff..4276125c347 100644 --- a/tests/integration/test_merge_tree_s3/test.py +++ b/tests/integration/test_merge_tree_s3/test.py @@ -120,11 +120,17 @@ def run_s3_mocks(cluster): def wait_for_delete_s3_objects(cluster, expected, timeout=30): minio = cluster.minio_client while timeout > 0: - if len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected: + if ( + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == expected + ): return timeout -= 1 time.sleep(1) - assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == expected + assert ( + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == expected + ) @pytest.fixture(autouse=True) @@ -140,7 +146,9 @@ def drop_table(cluster, node_name): wait_for_delete_s3_objects(cluster, 0) finally: # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): + for obj in list( + minio.list_objects(cluster.minio_bucket, "data/", recursive=True) + ): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -162,7 +170,7 @@ def test_simple_insert_select( node.query("INSERT INTO s3_test VALUES {}".format(values1)) assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values") == values1 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + files_per_part ) @@ -173,7 +181,7 @@ def test_simple_insert_select( == values1 + "," + values2 ) assert ( - 
len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + files_per_part * 2 ) @@ -217,7 +225,7 @@ def test_insert_same_partition_and_merge(cluster, merge_vertical, node_name): node.query("SELECT count(distinct(id)) FROM s3_test FORMAT Values") == "(8192)" ) assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD_PER_PART_WIDE * 6 + FILES_OVERHEAD ) @@ -306,28 +314,28 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DETACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test ATTACH PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test DROP PARTITION '2020-01-03'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(4096)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) @@ -338,7 +346,8 @@ def test_attach_detach_partition(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == FILES_OVERHEAD ) @@ -356,21 +365,21 @@ def test_move_partition_to_another_disk(cluster, node_name): ) assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 'hdd'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE ) node.query("ALTER TABLE s3_test MOVE PARTITION '2020-01-04' TO DISK 's3'") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -391,7 +400,7 @@ def test_table_manipulations(cluster, node_name): node.query("RENAME TABLE s3_test TO s3_renamed") assert node.query("SELECT count(*) FROM s3_renamed FORMAT 
Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("RENAME TABLE s3_renamed TO s3_test") @@ -402,14 +411,15 @@ def test_table_manipulations(cluster, node_name): node.query("ATTACH TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(8192)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) node.query("TRUNCATE TABLE s3_test") assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(0)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == FILES_OVERHEAD ) @@ -434,7 +444,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -448,7 +458,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT count(*) FROM s3_clone FORMAT Values") == "(8192)" # Number of objects in S3 should be unchanged. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -462,7 +472,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT sum(id) FROM s3_test FORMAT Values") == "(0)" assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD * 2 + FILES_OVERHEAD_PER_PART_WIDE * 6 ) @@ -483,14 +493,14 @@ def test_move_replace_partition_to_another_table(cluster, node_name): assert node.query("SELECT count(*) FROM s3_test FORMAT Values") == "(16384)" # Data should remain in S3 assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) node.query("ALTER TABLE s3_test FREEZE") # Number S3 objects should be unchanged. 
assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 4 ) @@ -499,7 +509,7 @@ def test_move_replace_partition_to_another_table(cluster, node_name): wait_for_delete_s3_objects(cluster, FILES_OVERHEAD_PER_PART_WIDE * 4) - for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -520,7 +530,7 @@ def test_freeze_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -533,7 +543,8 @@ def test_freeze_unfreeze(cluster, node_name): # Data should be removed from S3. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == FILES_OVERHEAD ) @@ -556,7 +567,7 @@ def test_freeze_system_unfreeze(cluster, node_name): node.query("TRUNCATE TABLE s3_test") node.query("DROP TABLE s3_test_removed NO DELAY") assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) == FILES_OVERHEAD + FILES_OVERHEAD_PER_PART_WIDE * 2 ) @@ -565,7 +576,8 @@ def test_freeze_system_unfreeze(cluster, node_name): # Data should be removed from S3. assert ( - len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == FILES_OVERHEAD + len(list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True))) + == FILES_OVERHEAD ) @@ -692,7 +704,7 @@ def test_lazy_seek_optimization_for_async_read(cluster, node_name): node.query("SELECT * FROM s3_test WHERE value LIKE '%abc%' ORDER BY value LIMIT 10") node.query("DROP TABLE IF EXISTS s3_test NO DELAY") minio = cluster.minio_client - for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): minio.remove_object(cluster.minio_bucket, obj.object_name) diff --git a/tests/integration/test_profile_events_s3/test.py b/tests/integration/test_profile_events_s3/test.py index aa578a1273a..18f1c5ee9ad 100644 --- a/tests/integration/test_profile_events_s3/test.py +++ b/tests/integration/test_profile_events_s3/test.py @@ -33,9 +33,10 @@ def cluster(): init_list = { "ReadBufferFromS3Bytes": 0, - "S3ReadMicroseconds": 0, "ReadBufferFromS3Microseconds": 0, "ReadBufferFromS3RequestsErrors": 0, + "WriteBufferFromS3Bytes": 0, + "S3ReadMicroseconds": 0, "S3ReadRequestsCount": 0, "S3ReadRequestsErrorsTotal": 0, "S3ReadRequestsErrors503": 0, @@ -45,14 +46,23 @@ init_list = { "S3WriteRequestsErrorsTotal": 0, "S3WriteRequestsErrors503": 0, "S3WriteRequestsRedirects": 0, - "WriteBufferFromS3Bytes": 0, + "DiskS3ReadMicroseconds": 0, + "DiskS3ReadRequestsCount": 0, + "DiskS3ReadRequestsErrorsTotal": 0, + "DiskS3ReadRequestsErrors503": 0, + "DiskS3ReadRequestsRedirects": 0, + "DiskS3WriteMicroseconds": 0, + "DiskS3WriteRequestsCount": 0, + "DiskS3WriteRequestsErrorsTotal": 0, + "DiskS3WriteRequestsErrors503": 0, + "DiskS3WriteRequestsRedirects": 0, } def get_s3_events(instance): result = init_list.copy() events = instance.query( - "SELECT event,value FROM system.events WHERE event LIKE '%S3%'" 
+ "SELECT event, value FROM system.events WHERE event LIKE '%S3%'" ).split("\n") for event in events: ev = event.split("\t") @@ -75,20 +85,20 @@ def get_minio_stat(cluster): ) ).text.split("\n") for line in stat: - x = re.search("s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search(r"s3_requests_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: y = re.search('.*api="(get|list|head|select).*', x.group(1)) if y != None: result["get_requests"] += int(x.group(2)) else: result["set_requests"] += int(x.group(2)) - x = re.search("s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) + x = re.search(r"s3_errors_total(\{.*\})?\s(\d+)(\s.*)?", line) if x != None: result["errors"] += int(x.group(2)) - x = re.search("s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search(r"s3_rx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["tx_bytes"] += float(x.group(2)) - x = re.search("s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) + x = re.search(r"s3_tx_bytes_total(\{.*\})?\s([\d\.e\+\-]+)(\s.*)?", line) if x != None: result["rx_bytes"] += float(x.group(2)) return result @@ -118,8 +128,10 @@ def get_query_stat(instance, hint): def get_minio_size(cluster): minio = cluster.minio_client size = 0 - for obj in minio.list_objects(cluster.minio_bucket, "data/"): - size += obj.size + for obj_level1 in minio.list_objects( + cluster.minio_bucket, prefix="data/", recursive=True + ): + size += obj_level1.size return size @@ -135,7 +147,7 @@ def test_profile_events(cluster): metrics0 = get_s3_events(instance) minio0 = get_minio_stat(cluster) - query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy='s3'" + query1 = "CREATE TABLE test_s3.test_s3 (key UInt32, value UInt32) ENGINE=MergeTree PRIMARY KEY key ORDER BY key SETTINGS storage_policy = 's3'" instance.query(query1) size1 = get_minio_size(cluster) @@ -157,7 +169,7 @@ def test_profile_events(cluster): metrics1["WriteBufferFromS3Bytes"] - metrics0["WriteBufferFromS3Bytes"] == size1 ) - query2 = "INSERT INTO test_s3.test_s3 FORMAT Values" + query2 = "INSERT INTO test_s3.test_s3 VALUES" instance.query(query2 + " (1,1)") size2 = get_minio_size(cluster) @@ -172,9 +184,12 @@ def test_profile_events(cluster): metrics2["S3WriteRequestsCount"] - metrics1["S3WriteRequestsCount"] == minio2["set_requests"] - minio1["set_requests"] ) + stat2 = get_query_stat(instance, query2) + for metric in stat2: assert stat2[metric] == metrics2[metric] - metrics1[metric] + assert ( metrics2["WriteBufferFromS3Bytes"] - metrics1["WriteBufferFromS3Bytes"] == size2 - size1 @@ -195,6 +210,7 @@ def test_profile_events(cluster): == minio3["set_requests"] - minio2["set_requests"] ) stat3 = get_query_stat(instance, query3) + # With async reads profile events are not updated fully because reads are done in a separate thread. 
# for metric in stat3: # print(metric) diff --git a/tests/integration/test_replicated_merge_tree_s3/test.py b/tests/integration/test_replicated_merge_tree_s3/test.py index 37027d07969..0d978bb6967 100644 --- a/tests/integration/test_replicated_merge_tree_s3/test.py +++ b/tests/integration/test_replicated_merge_tree_s3/test.py @@ -113,7 +113,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -130,9 +130,9 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par insert(cluster, node_idxs=[1, 2, 3], verify=True) minio = cluster.minio_client - assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == 3 * ( - FILES_OVERHEAD + files_per_part * 3 - ) + assert len( + list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) + ) == 3 * (FILES_OVERHEAD + files_per_part * 3) def test_drop_cache_on_cluster(cluster): diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py index 73b611ad169..60a1b9b9746 100644 --- a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -87,7 +87,7 @@ def drop_table(cluster): minio = cluster.minio_client # Remove extra objects to prevent tests cascade failing - for obj in list(minio.list_objects(cluster.minio_bucket, "data/")): + for obj in list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)): minio.remove_object(cluster.minio_bucket, obj.object_name) @@ -124,6 +124,6 @@ def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_par ) minio = cluster.minio_client - assert len(list(minio.list_objects(cluster.minio_bucket, "data/"))) == ( - 3 * FILES_OVERHEAD - ) + (files_per_part * 3) + assert len( + list(minio.list_objects(cluster.minio_bucket, "data/", recursive=True)) + ) == (3 * FILES_OVERHEAD) + (files_per_part * 3) diff --git a/tests/integration/test_rocksdb_options/test.py b/tests/integration/test_rocksdb_options/test.py index a9e12eae4fd..c746d4d0042 100644 --- a/tests/integration/test_rocksdb_options/test.py +++ b/tests/integration/test_rocksdb_options/test.py @@ -42,6 +42,18 @@ def test_valid_options(start_cluster): DROP TABLE test; """ ) + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only') PRIMARY KEY(key); + DROP TABLE test; + """ + ) + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(10, '/var/lib/clickhouse/store/test_rocksdb_read_only', 1) PRIMARY KEY(key); + DROP TABLE test; + """ + ) def test_invalid_options(start_cluster): diff --git a/tests/queries/0_stateless/01594_too_low_memory_limits.reference b/tests/integration/test_rocksdb_read_only/__init__.py similarity index 100% rename from tests/queries/0_stateless/01594_too_low_memory_limits.reference rename to tests/integration/test_rocksdb_read_only/__init__.py diff --git a/tests/integration/test_rocksdb_read_only/configs/rocksdb.xml b/tests/integration/test_rocksdb_read_only/configs/rocksdb.xml new file mode 100644 index 00000000000..c0ac49576fc --- /dev/null +++ b/tests/integration/test_rocksdb_read_only/configs/rocksdb.xml @@ -0,0 
+1,22 @@ + + + + + 8 + + + 2 + + + + test + + 10000 + + + 14 + +
+
+
+
diff --git a/tests/integration/test_rocksdb_read_only/test.py b/tests/integration/test_rocksdb_read_only/test.py new file mode 100644 index 00000000000..dcbfa417bff --- /dev/null +++ b/tests/integration/test_rocksdb_read_only/test.py @@ -0,0 +1,137 @@ +# pylint: disable=unused-argument +# pylint: disable=redefined-outer-name +# pylint: disable=line-too-long + +import pytest + +from helpers.client import QueryRuntimeException +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance( + "node", main_configs=["configs/rocksdb.xml"], stay_alive=True +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_read_only(start_cluster): + # fail if read_only = true and directory does not exist. + with pytest.raises(QueryRuntimeException): + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only', 1) PRIMARY KEY(key); + """ + ) + # create directory if read_only = false + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only') PRIMARY KEY(key); + INSERT INTO test (key, value) VALUES (0, 'a'), (1, 'b'), (2, 'c'); + """ + ) + # fail if create multiple non-read-only tables on the same directory + with pytest.raises(QueryRuntimeException): + node.query( + """ + CREATE TABLE test_fail (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only') PRIMARY KEY(key); + """ + ) + with pytest.raises(QueryRuntimeException): + node.query( + """ + CREATE TABLE test_fail (key UInt64, value String) Engine=EmbeddedRocksDB(10, '/var/lib/clickhouse/store/test_rocksdb_read_only') PRIMARY KEY(key); + """ + ) + # success if create multiple read-only tables on the same directory + node.query( + """ + CREATE TABLE test_1 (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only', 1) PRIMARY KEY(key); + DROP TABLE test_1; + """ + ) + node.query( + """ + CREATE TABLE test_2 (key UInt64, value String) Engine=EmbeddedRocksDB(10, '/var/lib/clickhouse/store/test_rocksdb_read_only', 1) PRIMARY KEY(key); + DROP TABLE test_2; + """ + ) + # success if create table on existing directory with no other tables on it + node.query( + """ + DROP TABLE test; + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(10, '/var/lib/clickhouse/store/test_rocksdb_read_only', 1) PRIMARY KEY(key); + """ + ) + result = node.query("""SELECT count() FROM test;""") + assert result.strip() == "3" + # fail if insert into table with read_only = true + with pytest.raises(QueryRuntimeException): + node.query( + """INSERT INTO test (key, value) VALUES (4, 'd'); + """ + ) + node.query( + """ + DROP TABLE test; + """ + ) + + +def test_dirctory_missing_after_stop(start_cluster): + # for read_only = false + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only_missing') PRIMARY KEY(key); + """ + ) + node.stop_clickhouse() + node.exec_in_container( + [ + "bash", + "-c", + "rm -r /var/lib/clickhouse/store/test_rocksdb_read_only_missing", + ] + ) + node.start_clickhouse() + result = node.query( + """INSERT INTO test (key, value) VALUES (0, 'a'); + SELECT * FROM test; + """ + ) + assert result.strip() == "0\ta" + node.query( + """DROP TABLE test; + """ + ) + # for 
read_only = true + node.query( + """ + CREATE TABLE test (key UInt64, value String) Engine=EmbeddedRocksDB(0, '/var/lib/clickhouse/store/test_rocksdb_read_only_missing', 1) PRIMARY KEY(key); + """ + ) + node.stop_clickhouse() + node.exec_in_container( + [ + "bash", + "-c", + "rm -r /var/lib/clickhouse/store/test_rocksdb_read_only_missing", + ] + ) + node.start_clickhouse() + with pytest.raises(QueryRuntimeException): + node.query("""INSERT INTO test (key, value) VALUES (1, 'b');""") + result = node.query("""SELECT * FROM test;""") + assert result.strip() == "" + node.query( + """DROP TABLE test; + """ + ) diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 7b7fb9d21ad..860b83d4ed1 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -39,7 +39,9 @@ def cluster(): def get_large_objects_count(cluster, size=100, folder="data"): minio = cluster.minio_client counter = 0 - for obj in minio.list_objects(cluster.minio_bucket, "{}/".format(folder)): + for obj in minio.list_objects( + cluster.minio_bucket, "{}/".format(folder), recursive=True + ): if obj.size is not None and obj.size >= size: counter = counter + 1 return counter diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 6ffb38bd8d7..5591e63400c 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -1,31 +1,26 @@ import pytest - from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV -uuids = [] +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance( + "node1", main_configs=["configs/conf.xml"], with_nginx=True +) -@pytest.fixture(scope="module") -def cluster(): +@pytest.fixture(scope="module", autouse=True) +def setup_node(): try: - cluster = ClickHouseCluster(__file__) - cluster.add_instance( - "node1", main_configs=["configs/conf.xml"], with_nginx=True - ) cluster.start() - - yield cluster - + node1.query( + "insert into table function url(url1) partition by column3 values (1, 2, 3), (3, 2, 1), (1, 3, 2)" + ) + yield finally: cluster.shutdown() -def test_partition_by(cluster): - node1 = cluster.instances["node1"] - - node1.query( - f"insert into table function url(url1) partition by column3 values (1, 2, 3), (3, 2, 1), (1, 3, 2)" - ) +def test_partition_by(): result = node1.query( f"select * from url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')" ) @@ -38,3 +33,45 @@ def test_partition_by(cluster): f"select * from url('http://nginx:80/test_3', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')" ) assert result.strip() == "1\t2\t3" + + +def test_table_function_url_access_rights(): + node1.query("CREATE USER OR REPLACE u1") + + expected_error = "necessary to have grant CREATE TEMPORARY TABLE, URL ON *.*" + assert expected_error in node1.query_and_get_error( + f"SELECT * FROM url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) + + expected_error = "necessary to have grant CREATE TEMPORARY TABLE, URL ON *.*" + assert expected_error in node1.query_and_get_error( + f"SELECT * FROM url('http://nginx:80/test_1', 'TSV')", user="u1" + ) + + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", 
"UInt32"]]) + + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + + expected_error = "necessary to have grant URL ON *.*" + assert expected_error in node1.query_and_get_error( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV')", user="u1" + ) + + node1.query("GRANT URL ON *.* TO u1") + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV')", + user="u1", + ) == TSV( + [ + ["c1", "Nullable(Int64)"], + ["c2", "Nullable(Int64)"], + ["c3", "Nullable(Int64)"], + ] + ) diff --git a/tests/integration/test_ttl_move/test.py b/tests/integration/test_ttl_move/test.py index 49d7ab4f2fc..99978cbf6dc 100644 --- a/tests/integration/test_ttl_move/test.py +++ b/tests/integration/test_ttl_move/test.py @@ -1284,19 +1284,6 @@ def test_materialize_ttl_in_partition(started_cluster, name, engine): def test_alter_multiple_ttls(started_cluster, name, engine, positive): name = unique_table_name(name) - """Copyright 2019, Altinity LTD - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.""" """Check that when multiple TTL expressions are set and before any parts are inserted the TTL expressions are changed with ALTER command then all old @@ -1664,16 +1651,6 @@ def test_double_move_while_select(started_cluster, name, positive): def test_alter_with_merge_work(started_cluster, name, engine, positive): name = unique_table_name(name) - """Copyright 2019, Altinity LTD -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License.""" """Check that TTL expressions are re-evaluated for existing parts after ALTER command changes TTL expressions and parts are merged. diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index bcdb2d25912..cacd9ef0c78 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -342,16 +342,6 @@ def optimize_with_retry(node, table_name, retry=20): ], ) def test_ttl_alter_delete(started_cluster, name, engine): - """Copyright 2019, Altinity LTD - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License.""" """Check compatibility with old TTL delete expressions to make sure that: * alter modify of column's TTL delete expression works diff --git a/tests/performance/join_set_filter.xml b/tests/performance/join_set_filter.xml new file mode 100644 index 00000000000..7f7804853fc --- /dev/null +++ b/tests/performance/join_set_filter.xml @@ -0,0 +1,45 @@ + + + + table_size + + 100000000 + + + + + + full_sorting_merge + + + + CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y + AS SELECT + sipHash64(number, 't1_x') % {table_size} AS x, + sipHash64(number, 't1_y') % {table_size} AS y + FROM numbers({table_size}) + + + + CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y + AS SELECT + sipHash64(number, 't2_x') % {table_size} AS x, + sipHash64(number, 't2_y') % {table_size} AS y + FROM numbers({table_size}) + + + SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE less(t1.y, 10000) + SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE less(t1.y, 10000) + + SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE greater(t1.y, {table_size} - 10000) + SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE greater(t1.y, {table_size} - 10000) + + SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 100 = 0 + SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE t1.y % 100 = 0 + + SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 1000 = 0 + SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE t1.y % 1000 = 0 + + DROP TABLE IF EXISTS t1 + DROP TABLE IF EXISTS t2 + diff --git a/tests/performance/lz4.xml b/tests/performance/lz4.xml new file mode 100644 index 00000000000..c5cf2772121 --- /dev/null +++ b/tests/performance/lz4.xml @@ -0,0 +1,22 @@ + + create table t_lz4(a UInt64) engine=MergeTree order by tuple() + create table t_lz4_norm(a UInt64) engine=MergeTree order by tuple() + create table t_lz4_uncomp(a UInt32) engine=MergeTree order by a + + insert into t_lz4 select number % 100 from numbers_mt(5e7) order by rand() + optimize table t_lz4 final + + insert into t_lz4_norm select number from numbers_mt(5e7) order by rand() + optimize table t_lz4_norm final + + insert into t_lz4_uncomp select number from numbers_mt(5e7) + optimize table t_lz4_uncomp final + + select a from t_lz4 format Null + select a from t_lz4_norm format Null + select a from t_lz4_uncomp format Null + + drop table t_lz4 + drop table t_lz4_norm + drop table t_lz4_uncomp + diff --git a/tests/performance/lz4_hits_columns.xml b/tests/performance/lz4_hits_columns.xml new file mode 100644 index 00000000000..0b93e4bc4e8 --- /dev/null +++ b/tests/performance/lz4_hits_columns.xml @@ -0,0 +1,39 @@ + + + + column + + ClientIP + ClientTimeZone + CookieEnable + CounterClass + CounterID + EventDate + EventTime + GoodEvent + HitColor + JavaEnable + OpenerName + PageCharset + ParamCurrency + ParamPrice + Referer + RefererCategoryID + RefererHash + RegionID + SearchPhrase + SilverlightVersion4 + Title + TraficSourceID + URLCategoryID + UserAgent + UserAgentMinor + UserID + WatchID + WindowName + + + + + select {column} from hits_100m_single format Null + diff --git a/tests/performance/queries_over_aggregation.xml b/tests/performance/queries_over_aggregation.xml index 
2a92ea26819..ceaed61c5bb 100644 --- a/tests/performance/queries_over_aggregation.xml +++ b/tests/performance/queries_over_aggregation.xml @@ -1,4 +1,8 @@ + select sipHash64(number) from numbers(1e7) group by number format Null + select * from (select * from numbers(1e7) group by number) group by number format Null + select * from (select * from numbers(1e7) group by number) order by number format Null + select * from (select * from numbers_mt(1e7) group by number) group by number format Null select * from (select * from numbers_mt(1e7) group by number) order by number format Null select * from (select * from numbers_mt(1e7) group by number) group by number format Null settings max_bytes_before_external_group_by = 1 diff --git a/tests/queries/0_stateless/00284_external_aggregation.sql b/tests/queries/0_stateless/00284_external_aggregation.sql index a42dd91b6a5..d19f9f5aee8 100644 --- a/tests/queries/0_stateless/00284_external_aggregation.sql +++ b/tests/queries/0_stateless/00284_external_aggregation.sql @@ -8,6 +8,7 @@ SET group_by_two_level_threshold_bytes = 50000000; SELECT sum(k), sum(c) FROM (SELECT number AS k, count() AS c FROM (SELECT * FROM system.numbers LIMIT 10000000) GROUP BY k); SELECT sum(k), sum(c), max(u) FROM (SELECT number AS k, count() AS c, uniqArray(range(number % 16)) AS u FROM (SELECT * FROM system.numbers LIMIT 1000000) GROUP BY k); +SET max_memory_usage = 0; SET group_by_two_level_threshold = 100000; SET max_bytes_before_external_group_by = '1Mi'; diff --git a/tests/queries/0_stateless/00488_column_name_primary.reference b/tests/queries/0_stateless/00488_column_name_primary.reference new file mode 100644 index 00000000000..a6905f8ba44 --- /dev/null +++ b/tests/queries/0_stateless/00488_column_name_primary.reference @@ -0,0 +1 @@ +999 diff --git a/tests/queries/0_stateless/00488_column_name_primary.sql b/tests/queries/0_stateless/00488_column_name_primary.sql new file mode 100644 index 00000000000..124d0e14239 --- /dev/null +++ b/tests/queries/0_stateless/00488_column_name_primary.sql @@ -0,0 +1,16 @@ +DROP TABLE IF EXISTS primary; + +CREATE TABLE primary +( + `primary` String +) +ENGINE = MergeTree +ORDER BY primary +settings min_bytes_for_wide_part=0,min_bytes_for_wide_part=0 + AS +SELECT * +FROM numbers(1000); + +select max(primary) from primary; + +DROP TABLE primary; diff --git a/tests/queries/0_stateless/00965_shard_unresolvable_addresses.sql b/tests/queries/0_stateless/00965_shard_unresolvable_addresses.sql index 555e7a98380..5b763d2d853 100644 --- a/tests/queries/0_stateless/00965_shard_unresolvable_addresses.sql +++ b/tests/queries/0_stateless/00965_shard_unresolvable_addresses.sql @@ -2,7 +2,7 @@ SET prefer_localhost_replica = 1; -SELECT count() FROM remote('127.0.0.1,localhos', system.one); -- { serverError 198 } +SELECT count() FROM remote('127.0.0.1,localhos', system.one); -- { serverError 279 } SELECT count() FROM remote('127.0.0.1|localhos', system.one); -- Clear cache to avoid future errors in the logs diff --git a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql index 0bc5fcd1db8..9a439180265 100644 --- a/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql +++ b/tests/queries/0_stateless/01064_incremental_streaming_from_2_src_with_feedback.sql @@ -1,4 +1,5 @@ SET joined_subquery_requires_alias = 0; +SET max_threads = 1; -- incremental streaming usecase -- that has sense only if data filling order has 
guarantees of chronological order diff --git a/tests/queries/0_stateless/01079_parallel_alter_add_drop_column_zookeeper.sh b/tests/queries/0_stateless/01079_parallel_alter_add_drop_column_zookeeper.sh index 06d6ef6a94b..26c2bf133ac 100755 --- a/tests/queries/0_stateless/01079_parallel_alter_add_drop_column_zookeeper.sh +++ b/tests/queries/0_stateless/01079_parallel_alter_add_drop_column_zookeeper.sh @@ -111,5 +111,9 @@ for i in $(seq $REPLICAS); do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.mutations WHERE is_done = 0 and table = 'concurrent_alter_add_drop_$i'" $CLICKHOUSE_CLIENT --query "SELECT COUNT() FROM system.replication_queue WHERE table = 'concurrent_alter_add_drop_$i'" $CLICKHOUSE_CLIENT --query "SELECT * FROM system.replication_queue WHERE table = 'concurrent_alter_add_drop_$i' and (type = 'ALTER_METADATA' or type = 'MUTATE_PART')" + + $CLICKHOUSE_CLIENT --query "DETACH TABLE concurrent_alter_add_drop_$i" + $CLICKHOUSE_CLIENT --query "ATTACH TABLE concurrent_alter_add_drop_$i" + $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS concurrent_alter_add_drop_$i" done diff --git a/tests/queries/0_stateless/01091_num_threads.sql b/tests/queries/0_stateless/01091_num_threads.sql index faeceb0e6d6..0d2a66a8c2e 100644 --- a/tests/queries/0_stateless/01091_num_threads.sql +++ b/tests/queries/0_stateless/01091_num_threads.sql @@ -28,7 +28,7 @@ WITH ORDER BY event_time DESC LIMIT 1 ) AS id -SELECT uniqExact(thread_id) +SELECT uniqExact(thread_id) > 2 FROM system.query_thread_log WHERE (event_date >= (today() - 1)) AND (query_id = id) AND (thread_id != master_thread_id); diff --git a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh index 2f4164ee0d1..b48958a18f6 100755 --- a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh +++ b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-replicated-database, no-parallel, no-fasttest, no-tsan, no-asan, no-random-settings, no-s3-storage +# Tags: no-replicated-database, no-parallel, no-fasttest, no-tsan, no-asan, no-random-settings, no-s3-storage, no-msan # Tag no-fasttest: max_memory_usage_for_user can interfere another queries running concurrently # Regression for MemoryTracker that had been incorrectly accounted @@ -32,7 +32,7 @@ function execute_group_by() # max_memory_usage_for_user is installed to 0 once there are no more # queries for user. 
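    # Descriptive note: $((200<<20)) shifts 200 left by 20 bits, i.e. 200 * 2^20 bytes = 200 MiB;
    # max_memory_usage_for_user is a single cap shared by all queries this user runs concurrently.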
local opts=( - "--max_memory_usage_for_user="$((150<<20)) + "--max_memory_usage_for_user="$((200<<20)) "--max_threads=2" ) execute_null "${opts[@]}" <<<'SELECT uniq(number) FROM numbers_mt(1e6) GROUP BY number % 5e5' diff --git a/tests/queries/0_stateless/01455_opentelemetry_distributed.reference b/tests/queries/0_stateless/01455_opentelemetry_distributed.reference index e70506599ec..d48b3738bc2 100644 --- a/tests/queries/0_stateless/01455_opentelemetry_distributed.reference +++ b/tests/queries/0_stateless/01455_opentelemetry_distributed.reference @@ -15,7 +15,7 @@ {"query":"select 1 format Null\n","query_status":"QueryFinish","tracestate":"another custom state","sorted_by_finish_time":1} {"query":"select 1 format Null\n","query_status":"QueryFinish","tracestate":"another custom state","sorted_by_finish_time":1} {"query":"select * from url('http:\/\/127.0.0.2:8123\/?query=select%201%20format%20Null', CSV, 'a int')","query_status":"QueryFinish","tracestate":"another custom state","sorted_by_finish_time":1} -{"total spans":"3","unique spans":"3","unique non-zero parent spans":"2"} +{"total spans":"3","unique spans":"3","unique non-zero parent spans":"3"} {"initial query spans with proper parent":"1"} {"unique non-empty tracestate values":"1"} ===sampled=== diff --git a/tests/queries/0_stateless/01455_opentelemetry_distributed.sh b/tests/queries/0_stateless/01455_opentelemetry_distributed.sh index 95d99449837..b2b5ae89105 100755 --- a/tests/queries/0_stateless/01455_opentelemetry_distributed.sh +++ b/tests/queries/0_stateless/01455_opentelemetry_distributed.sh @@ -48,21 +48,16 @@ select count(*) "'"'"total spans"'"'", ; -- Also check that the initial query span in ClickHouse has proper parent span. +-- the first span should be child of input trace context +-- the 2nd span should be the 'query' span select count(*) "'"'"initial query spans with proper parent"'"'" - from - (select *, attribute_name, attribute_value - from system.opentelemetry_span_log - array join mapKeys(attribute) as attribute_name, - mapValues(attribute) as attribute_value) o - join system.query_log on query_id = o.attribute_value + from system.opentelemetry_span_log where trace_id = UUIDNumToString(toFixedString(unhex('$trace_id'), 16)) - and current_database = currentDatabase() and operation_name = 'query' - and parent_span_id = reinterpretAsUInt64(unhex('73')) - and o.attribute_name = 'clickhouse.query_id' - and is_initial_query - and type = 'QueryFinish' + and parent_span_id in ( + select span_id from system.opentelemetry_span_log where trace_id = UUIDNumToString(toFixedString(unhex('$trace_id'), 16)) and parent_span_id = reinterpretAsUInt64(unhex('73')) + ) ; -- Check that the tracestate header was propagated. 
It must have exactly the @@ -136,7 +131,6 @@ ${CLICKHOUSE_CLIENT} -q " select if(2 <= count() and count() <= 18, 'OK', 'Fail') from system.opentelemetry_span_log where operation_name = 'query' - and parent_span_id = 0 -- only account for the initial queries and attribute['clickhouse.query_id'] like '$query_id-%' ; " diff --git a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.reference b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.reference index a3f2106cd5f..540137d4887 100644 --- a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.reference +++ b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.reference @@ -6,4 +6,4 @@ 2020-01-01 00:00:00 2 1 499999 -5 +18 diff --git a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql index de228c90753..dafe652d271 100644 --- a/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql +++ b/tests/queries/0_stateless/01524_do_not_merge_across_partitions_select_final.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS select_final; SET do_not_merge_across_partitions_select_final = 1; -SET max_threads = 0; +SET max_threads = 16; CREATE TABLE select_final (t DateTime, x Int32, string String) ENGINE = ReplacingMergeTree() PARTITION BY toYYYYMM(t) ORDER BY (x, t); diff --git a/tests/queries/0_stateless/01594_too_low_memory_limits.config.xml b/tests/queries/0_stateless/01594_too_low_memory_limits.config.xml deleted file mode 100644 index 0c286bfbd21..00000000000 --- a/tests/queries/0_stateless/01594_too_low_memory_limits.config.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - trace - true - - - 9000 - - ./ - - 0 - - - - - - - ::/0 - - - default - default - 1 - - - - - - - - - - - diff --git a/tests/queries/0_stateless/01594_too_low_memory_limits.sh b/tests/queries/0_stateless/01594_too_low_memory_limits.sh deleted file mode 100755 index b513a947bd9..00000000000 --- a/tests/queries/0_stateless/01594_too_low_memory_limits.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env bash -# Tags: no-tsan, no-asan, no-ubsan, no-msan, no-parallel, no-fasttest -# Tag no-tsan: requires jemalloc to track small allocations -# Tag no-asan: requires jemalloc to track small allocations -# Tag no-ubsan: requires jemalloc to track small allocations -# Tag no-msan: requires jemalloc to track small allocations - -# -# Regression for INSERT SELECT, that abnormally terminates the server -# in case of too small memory limits. -# -# NOTE: After #24483 had been merged the only place where the allocation may -# fail is the insert into PODArray in DB::OwnSplitChannel::log, but after -# #24069 those errors will be ignored, so to check new behaviour separate -# server is required. -# - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -server_opts=( - "--config-file=$CURDIR/$(basename "${BASH_SOURCE[0]}" .sh).config.xml" - "--" - # to avoid multiple listen sockets (complexity for port discovering) - "--listen_host=127.1" - # we will discover the real port later. - "--tcp_port=0" - "--shutdown_wait_unfinished=0" -) -CLICKHOUSE_WATCHDOG_ENABLE=0 $CLICKHOUSE_SERVER_BINARY "${server_opts[@]}" >clickhouse-server.log 2>clickhouse-server.stderr & -server_pid=$! - -trap cleanup EXIT -function cleanup() -{ - kill -9 $server_pid - - echo "Test failed. 
Server log:" - cat clickhouse-server.log - cat clickhouse-server.stderr - rm -f clickhouse-server.log - rm -f clickhouse-server.stderr - - exit 1 -} - -server_port= -i=0 retries=300 -# wait until server will start to listen (max 30 seconds) -while [[ -z $server_port ]] && [[ $i -lt $retries ]]; do - server_port=$(lsof -n -a -P -i tcp -s tcp:LISTEN -p $server_pid 2>/dev/null | awk -F'[ :]' '/LISTEN/ { print $(NF-1) }') - ((++i)) - sleep 0.1 - if ! kill -0 $server_pid >& /dev/null; then - echo "No server (pid $server_pid)" - break - fi -done -if [[ -z $server_port ]]; then - echo "Cannot wait for LISTEN socket" >&2 - exit 1 -fi - -# wait for the server to start accepting tcp connections (max 30 seconds) -i=0 retries=300 -while ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1' 2>/dev/null && [[ $i -lt $retries ]]; do - sleep 0.1 - if ! kill -0 $server_pid >& /dev/null; then - echo "No server (pid $server_pid)" - break - fi -done -if ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1'; then - echo "Cannot wait until server will start accepting connections on " >&2 - exit 1 -fi - -# it is not mandatory to use existing table since it fails earlier, hence just a placeholder. -# this is format of INSERT SELECT, that pass these settings exactly for INSERT query not the SELECT -if $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null --send_logs_level=warning --max_memory_usage=1 --max_untracked_memory=1 -q 'insert into placeholder_table_name select * from numbers_mt(65535)' >& /dev/null; then - echo "INSERT SELECT should fail" >&2 - exit 1 -fi - -# no sleep, since flushing to stderr should not be buffered. -if ! grep -E -q 'Cannot add message to the log: Code: 60.*placeholder_table_name' clickhouse-server.stderr; then - echo "Adding message to the log should fail" >&2 - exit 1 -fi - -# check that server is still alive -$CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'SELECT 1' - -# send TERM and save the error code to ensure that it is 0 (EXIT_SUCCESS) -kill $server_pid -wait $server_pid -return_code=$? 
- -trap '' EXIT -if [ $return_code != 0 ]; then - cat clickhouse-server.log - cat clickhouse-server.stderr -fi -rm -f clickhouse-server.log -rm -f clickhouse-server.stderr - -exit $return_code diff --git a/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql b/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql index 4357aa199dc..1dae8e7b383 100644 --- a/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql +++ b/tests/queries/0_stateless/01650_fetch_patition_with_macro_in_zk_path_long.sql @@ -5,13 +5,15 @@ DROP TABLE IF EXISTS restore_01640; CREATE TABLE test_01640(i Int64, d Date, s String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/{shard}/tables/test_01640','{replica}') -PARTITION BY toYYYYMM(d) ORDER BY i; +PARTITION BY toYYYYMM(d) ORDER BY i +SETTINGS allow_remote_fs_zero_copy_replication=0; insert into test_01640 values (1, '2021-01-01','some'); CREATE TABLE restore_01640(i Int64, d Date, s String) ENGINE = ReplicatedMergeTree('/clickhouse/{database}/{shard}/tables/restore_01640','{replica}') -PARTITION BY toYYYYMM(d) ORDER BY i; +PARTITION BY toYYYYMM(d) ORDER BY i +SETTINGS allow_remote_fs_zero_copy_replication=0; ALTER TABLE restore_01640 FETCH PARTITION tuple(toYYYYMM(toDate('2021-01-01'))) FROM '/clickhouse/{database}/{shard}/tables/test_01640'; diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference index 7fcd29b5faf..00eb03bd5f0 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.reference @@ -10,3 +10,12 @@ No sorting plan optimize_read_in_window_order=1 Prefix sort description: n ASC, x ASC Result sort description: n ASC, x ASC +Complex ORDER BY + optimize_read_in_window_order=0 +3 3 1 +4 5 2 +5 7 3 + optimize_read_in_window_order=1 +3 3 1 +4 5 2 +5 7 3 diff --git a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh index 418baea8113..328d181fadd 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations_optimize_read_in_window_order.sh @@ -31,6 +31,15 @@ $CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OV echo ' optimize_read_in_window_order=1' $CLICKHOUSE_CLIENT -q "explain plan actions=1, description=1 select n, sum(x) OVER (ORDER BY n, x ROWS BETWEEN 100 PRECEDING AND CURRENT ROW) from ${name}_n_x SETTINGS optimize_read_in_window_order=1" | grep -i "sort description" +echo 'Complex ORDER BY' +$CLICKHOUSE_CLIENT -q "CREATE TABLE ${name}_complex (unique1 Int32, unique2 Int32, ten Int32) ENGINE=MergeTree ORDER BY tuple() SETTINGS index_granularity = 8192" +$CLICKHOUSE_CLIENT -q "INSERT INTO ${name}_complex VALUES (1, 2, 3), (2, 3, 4), (3, 4, 5)" +echo ' optimize_read_in_window_order=0' +$CLICKHOUSE_CLIENT -q "SELECT ten, sum(unique1) + sum(unique2) AS res, rank() OVER (ORDER BY sum(unique1) + sum(unique2) ASC) AS rank FROM ${name}_complex GROUP BY ten ORDER BY ten ASC SETTINGS optimize_read_in_window_order=0" +echo ' optimize_read_in_window_order=1' +$CLICKHOUSE_CLIENT -q "SELECT ten, sum(unique1) + sum(unique2) AS res, rank() OVER (ORDER BY sum(unique1) + sum(unique2) ASC) AS 
rank FROM ${name}_complex GROUP BY ten ORDER BY ten ASC SETTINGS optimize_read_in_window_order=1" + $CLICKHOUSE_CLIENT -q "drop table ${name}" $CLICKHOUSE_CLIENT -q "drop table ${name}_n" $CLICKHOUSE_CLIENT -q "drop table ${name}_n_x" +$CLICKHOUSE_CLIENT -q "drop table ${name}_complex" diff --git a/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.reference b/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.reference index 161f4a6372f..8c3288df670 100644 --- a/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.reference +++ b/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.reference @@ -1,27 +1,12 @@ 1 50 50 1 0 49 1 50 50 1 0 49 1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 50 1 0 49 1 50 50 1 0 49 1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 51 0 1 51 1 50 50 1 0 49 1 50 50 1 0 49 1 50 50 1 0 49 -1 50 51 0 1 51 1 50 50 1 0 49 -1 50 51 0 1 51 1 50 50 1 0 49 1 50 50 1 0 49 1 50 50 1 0 49 @@ -29,32 +14,47 @@ 1 50 50 1 0 49 1 50 50 1 0 49 1 50 50 1 0 49 -1 50 51 0 1 51 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 +1 50 50 1 0 49 1 50 50 1 0 49 1 50 51 0 1 51 -1 50 50 1 0 49 1 50 51 0 1 51 1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 1 50 51 0 1 51 1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 50 1 0 49 1 50 51 0 1 51 1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 1 50 51 0 1 51 -1 50 50 1 0 49 -1 50 50 1 0 49 -1 50 50 1 0 49 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 +1 50 51 0 1 51 1 50 51 0 1 51 diff --git a/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.sql b/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.sql index d70665655ca..a04f40058fd 100644 --- a/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.sql +++ b/tests/queries/0_stateless/01671_aggregate_function_group_bitmap_data.sql @@ -52,6 +52,7 @@ ALL LEFT JOIN FROM group_bitmap_data_test WHERE pickup_date = '2019-01-01' GROUP BY city_id -) AS js2 USING (city_id); +) AS js2 USING (city_id) +ORDER BY today_users, before_users, ll_users, old_users, new_users, diff_users; DROP TABLE IF EXISTS group_bitmap_data_test; diff --git a/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.reference b/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.reference new file mode 100644 index 00000000000..b233507ce6d --- /dev/null +++ b/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.reference @@ -0,0 +1,28 @@ +a 2 +a x 1 +a y 1 +b 2 +b x 1 +b y 1 + 4 +a 2 +a x 1 +a y 1 +b 2 +b x 1 +b y 1 + 4 + x 2 + y 2 +a 2 +a x 1 +a y 1 +b 2 +b x 1 +b y 1 +a x 1 +a y 1 +b x 1 +b y 1 + + 4 diff --git a/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.sql b/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.sql new file mode 100644 index 00000000000..652ce786b5d --- 
/dev/null +++ b/tests/queries/0_stateless/01710_aggregate_projection_with_grouping_set.sql @@ -0,0 +1,15 @@ +drop table if exists test; + +create table test(dim1 String, dim2 String, projection p1 (select dim1, dim2, count() group by dim1, dim2)) engine MergeTree order by dim1; + +insert into test values ('a', 'x') ('a', 'y') ('b', 'x') ('b', 'y'); + +select dim1, dim2, count() from test group by grouping sets ((dim1, dim2), dim1) order by dim1, dim2, count(); + +select dim1, dim2, count() from test group by dim1, dim2 with rollup order by dim1, dim2, count(); + +select dim1, dim2, count() from test group by dim1, dim2 with cube order by dim1, dim2, count(); + +select dim1, dim2, count() from test group by dim1, dim2 with totals order by dim1, dim2, count(); + +drop table test; diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql index 62b578c21d6..3d6a25fe799 100644 --- a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql +++ b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql @@ -12,7 +12,7 @@ select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by n -- and the query with GROUP BY on remote servers will first do GROUP BY and then send the block, -- so the initiator will first receive all blocks from remotes and only after start merging, -- and will hit the memory limit. -select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi'; -- { serverError 241 } +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi', max_block_size=1e12; -- { serverError 241 } -- with optimize_aggregation_in_order=1 remote servers will produce blocks more frequently, -- since they don't need to wait until the aggregation will be finished, diff --git a/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.reference b/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.reference new file mode 100644 index 00000000000..ff12af23d43 --- /dev/null +++ b/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.reference @@ -0,0 +1,18 @@ +uniqTheta union test +0 0 0 +4 2 3 +4 3 2 +uniqTheta intersect test +0 0 0 +1 2 3 +1 3 2 +uniqTheta union test +0 0 0 +1 2 3 +2 3 2 +uniqTheta retention test +4 9 4 +uniqTheta retention with AggregatingMergeTree test +0.5 2 4 +uniqTheta retention with MergeTree test +0.5 2 4 diff --git a/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.sql b/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.sql new file mode 100644 index 00000000000..ccaf6ca6c6c --- /dev/null +++ b/tests/queries/0_stateless/01798_uniq_theta_union_intersect_not.sql @@ -0,0 +1,90 @@ +-- Tags: no-fasttest + +SELECT 'uniqTheta union test'; + +select finalizeAggregation(uniqThetaUnion(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[]) as a, arrayReduce('uniqThetaState',[]) as b ); + +select finalizeAggregation(uniqThetaUnion(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[1,2]) as a, arrayReduce('uniqThetaState',[2,3,4]) as b ); + +select finalizeAggregation(uniqThetaUnion(a, b)), finalizeAggregation(a), finalizeAggregation(b) from 
(select arrayReduce('uniqThetaState',[2,3,4]) as a, arrayReduce('uniqThetaState',[1,2]) as b ); + +SELECT 'uniqTheta intersect test'; + +select finalizeAggregation(uniqThetaIntersect(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[]) as a, arrayReduce('uniqThetaState',[]) as b ); + +select finalizeAggregation(uniqThetaIntersect(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[1,2]) as a, arrayReduce('uniqThetaState',[2,3,4]) as b ); + +select finalizeAggregation(uniqThetaIntersect(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[2,3,4]) as a, arrayReduce('uniqThetaState',[1,2]) as b ); + +SELECT 'uniqTheta union test'; + +select finalizeAggregation(uniqThetaNot(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[]) as a, arrayReduce('uniqThetaState',[]) as b ); + +select finalizeAggregation(uniqThetaNot(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[1,2]) as a, arrayReduce('uniqThetaState',[2,3,4]) as b ); + +select finalizeAggregation(uniqThetaNot(a, b)), finalizeAggregation(a), finalizeAggregation(b) from (select arrayReduce('uniqThetaState',[2,3,4]) as a, arrayReduce('uniqThetaState',[1,2]) as b ); + +SELECT 'uniqTheta retention test'; + +select finalizeAggregation(uniqThetaIntersect(a,b)), finalizeAggregation(a),finalizeAggregation(b) from +( +select (uniqThetaStateIf(number, number>0)) as a, (uniqThetaStateIf(number, number>5)) as b +from +(select number FROM system.numbers LIMIT 10) +); + +SELECT 'uniqTheta retention with AggregatingMergeTree test'; +DROP TABLE IF EXISTS test1; + +CREATE TABLE test1 +( + `year` String , + `uv` AggregateFunction(uniqTheta, Int64) +) +ENGINE = AggregatingMergeTree() +ORDER BY (year); + +INSERT INTO TABLE test1(year, uv) select '2021',uniqThetaState(toInt64(1)); +INSERT INTO TABLE test1(year, uv) select '2021',uniqThetaState(toInt64(2)); +INSERT INTO TABLE test1(year, uv) select '2021',uniqThetaState(toInt64(3)); +INSERT INTO TABLE test1(year, uv) select '2021',uniqThetaState(toInt64(4)); +INSERT INTO TABLE test1(year, uv) select '2022',uniqThetaState(toInt64(1)); +INSERT INTO TABLE test1(year, uv) select '2022',uniqThetaState(toInt64(3)); + +select finalizeAggregation(uniqThetaIntersect(uv2021,uv2022))/finalizeAggregation(uv2021),finalizeAggregation(uniqThetaIntersect(uv2021,uv2022)),finalizeAggregation(uv2021) +from +( +select uniqThetaMergeStateIf(uv,year='2021') as uv2021, uniqThetaMergeStateIf(uv,year='2022') as uv2022 +from test1 +); + +DROP TABLE IF EXISTS test1; + +SELECT 'uniqTheta retention with MergeTree test'; +DROP TABLE IF EXISTS test2; + +CREATE TABLE test2 +( + `year` String , + `uv` Int64 +) +ENGINE = MergeTree() +ORDER BY (year); + +INSERT INTO TABLE test2(year, uv) select '2021',1; +INSERT INTO TABLE test2(year, uv) select '2021',2; +INSERT INTO TABLE test2(year, uv) select '2021',3; +INSERT INTO TABLE test2(year, uv) select '2021',4; +INSERT INTO TABLE test2(year, uv) select '2022',1; +INSERT INTO TABLE test2(year, uv) select '2022',3; + +select finalizeAggregation(uniqThetaIntersect(uv2021,uv2022))/finalizeAggregation(uv2021),finalizeAggregation(uniqThetaIntersect(uv2021,uv2022)),finalizeAggregation(uv2021) +from +( +select uniqThetaStateIf(uv,year='2021') as uv2021, uniqThetaStateIf(uv,year='2022') as uv2022 +from test2 +); + + + +DROP TABLE IF EXISTS test2; diff --git 
a/tests/queries/0_stateless/01825_type_json_17.reference b/tests/queries/0_stateless/01825_type_json_17.reference new file mode 100644 index 00000000000..0f97bfed5bc --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.reference @@ -0,0 +1,27 @@ +Tuple(arr Nested(k1 Nested(k2 String, k3 String, k4 Int8), k5 Tuple(k6 String)), id Int8) +{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":"bbb","k4":0},{"k2":"ccc","k3":"","k4":0}],"k5":{"k6":""}}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"","k3":"ddd","k4":10},{"k2":"","k3":"","k4":20}],"k5":{"k6":"foo"}}],"id":2}} +[['bbb','']] [['aaa','ccc']] +[['ddd','']] [['','']] +1 +[[0,0]] +[[10,20]] +Tuple(arr Nested(k1 Nested(k2 String, k3 Nested(k4 Int8))), id Int8) +{"obj":{"arr":[{"k1":[{"k2":"aaa","k3":[]}]}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"bbb","k3":[{"k4":10}]},{"k2":"ccc","k3":[{"k4":20}]}]}],"id":2}} +[['aaa']] [[[]]] +[['bbb','ccc']] [[[10],[20]]] +1 +[[[]]] +[[[10],[20]]] +Tuple(arr Nested(k1 Nested(k2 String, k4 Nested(k5 Int8)), k3 String), id Int8) +{"obj":{"arr":[{"k1":[],"k3":"qqq"},{"k1":[],"k3":"www"}],"id":1}} +{"obj":{"arr":[{"k1":[{"k2":"aaa","k4":[]}],"k3":"eee"}],"id":2}} +{"obj":{"arr":[{"k1":[{"k2":"bbb","k4":[{"k5":10}]},{"k2":"ccc","k4":[{"k5":20}]}],"k3":"rrr"}],"id":3}} +['qqq','www'] [[],[]] [[],[]] +['eee'] [['aaa']] [[[]]] +['rrr'] [['bbb','ccc']] [[[10],[20]]] +1 +[[],[]] +[[[]]] +[[[10],[20]]] diff --git a/tests/queries/0_stateless/01825_type_json_17.sql b/tests/queries/0_stateless/01825_type_json_17.sql new file mode 100644 index 00000000000..e3c0c83322b --- /dev/null +++ b/tests/queries/0_stateless/01825_type_json_17.sql @@ -0,0 +1,48 @@ +-- Tags: no-fasttest + +DROP TABLE IF EXISTS t_json_17; +SET allow_experimental_object_type = 1; +SET output_format_json_named_tuples_as_objects = 1; + +CREATE TABLE t_json_17(obj JSON) +ENGINE = MergeTree ORDER BY tuple(); + +DROP FUNCTION IF EXISTS hasValidSizes17; +CREATE FUNCTION hasValidSizes17 AS (arr1, arr2) -> length(arr1) = length(arr2) AND arrayAll((x, y) -> length(x) = length(y), arr1, arr2); + +SYSTEM STOP MERGES t_json_17; + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa", "k3": "bbb"}, {"k2": "ccc"}]}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k3": "ddd", "k4": 10}, {"k4": 20}], "k5": {"k6": "foo"}}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k1.k3, obj.arr.k1.k2 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k3, obj.arr.k1.k2)) == count() FROM t_json_17; +SELECT obj.arr.k1.k4 FROM t_json_17 ORDER BY obj.id; + +TRUNCATE TABLE t_json_17; + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k1": [{"k2": "aaa"}]}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k2": "bbb", "k3": [{"k4": 10}]}, {"k2": "ccc", "k3": [{"k4": 20}]}]}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k1.k2, obj.arr.k1.k3.k4 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k2, obj.arr.k1.k3.k4)) == count() FROM t_json_17; +SELECT obj.arr.k1.k3.k4 FROM t_json_17 ORDER BY obj.id; + +TRUNCATE TABLE t_json_17; + +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 1, "arr": [{"k3": "qqq"}, {"k3": "www"}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 2, "arr": [{"k1": [{"k2": "aaa"}], "k3": "eee"}]} +INSERT INTO t_json_17 FORMAT JSONAsObject {"id": 3, "arr": [{"k1": [{"k2": "bbb", 
"k4": [{"k5": 10}]}, {"k2": "ccc", "k4": [{"k5": 20}]}], "k3": "rrr"}]} + +SELECT toTypeName(obj) FROM t_json_17 LIMIT 1; +SELECT obj FROM t_json_17 ORDER BY obj.id FORMAT JSONEachRow; +SELECT obj.arr.k3, obj.arr.k1.k2, obj.arr.k1.k4.k5 FROM t_json_17 ORDER BY obj.id; +SELECT sum(hasValidSizes17(obj.arr.k1.k2, obj.arr.k1.k4.k5)) == count() FROM t_json_17; +SELECT obj.arr.k1.k4.k5 FROM t_json_17 ORDER BY obj.id; + +DROP FUNCTION hasValidSizes17; +DROP TABLE t_json_17; diff --git a/tests/queries/0_stateless/02004_intersect_except_const_column.reference b/tests/queries/0_stateless/02004_intersect_except_const_column.reference new file mode 100644 index 00000000000..6e3081b017d --- /dev/null +++ b/tests/queries/0_stateless/02004_intersect_except_const_column.reference @@ -0,0 +1,85 @@ +-- { echo } +-- Test: crash the server +SELECT 'fooooo' INTERSECT DISTINCT SELECT 'fooooo'; +fooooo +SELECT 'fooooo' EXCEPT ALL SELECT 'fooooo'; +-- Test: intersect return incorrect result for const column +SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +SELECT 1 FROM numbers(10) INTERSECT DISTINCT SELECT 1 FROM numbers(10); +1 +SELECT toString(1) FROM numbers(10) INTERSECT DISTINCT SELECT toString(1) FROM numbers(10); +1 +SELECT '1' FROM numbers(10) INTERSECT DISTINCT SELECT '1' FROM numbers(10); +1 +-- Test: except return incorrect result for const column +SELECT 2 FROM numbers(10) EXCEPT SELECT 1 FROM numbers(5); +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +SELECT toString(2) FROM numbers(10) EXCEPT SELECT toString(1) FROM numbers(5); +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +SELECT '2' FROM numbers(10) EXCEPT SELECT '1' FROM numbers(5); +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +SELECT 2 FROM numbers(10) EXCEPT DISTINCT SELECT 1 FROM numbers(5); +2 +SELECT toString(2) FROM numbers(10) EXCEPT DISTINCT SELECT toString(1) FROM numbers(5); +2 +SELECT '2' FROM numbers(10) EXCEPT DISTINCT SELECT '1' FROM numbers(5); +2 diff --git a/tests/queries/0_stateless/02004_intersect_except_const_column.sql b/tests/queries/0_stateless/02004_intersect_except_const_column.sql new file mode 100644 index 00000000000..6fabf34d38a --- /dev/null +++ b/tests/queries/0_stateless/02004_intersect_except_const_column.sql @@ -0,0 +1,20 @@ +-- { echo } +-- Test: crash the server +SELECT 'fooooo' INTERSECT DISTINCT SELECT 'fooooo'; +SELECT 'fooooo' EXCEPT ALL SELECT 'fooooo'; + +-- Test: intersect return incorrect result for const column +SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); +SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); +SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); +SELECT 1 FROM numbers(10) INTERSECT DISTINCT SELECT 1 FROM numbers(10); +SELECT toString(1) FROM numbers(10) INTERSECT DISTINCT SELECT toString(1) FROM numbers(10); +SELECT '1' FROM numbers(10) INTERSECT DISTINCT SELECT '1' FROM numbers(10); + +-- Test: except return incorrect result for const column +SELECT 2 FROM numbers(10) EXCEPT SELECT 1 FROM numbers(5); +SELECT toString(2) FROM numbers(10) EXCEPT SELECT toString(1) FROM numbers(5); +SELECT '2' FROM numbers(10) EXCEPT SELECT '1' FROM numbers(5); +SELECT 2 FROM numbers(10) EXCEPT DISTINCT SELECT 1 FROM numbers(5); +SELECT toString(2) FROM numbers(10) EXCEPT DISTINCT SELECT toString(1) FROM numbers(5); +SELECT '2' FROM 
numbers(10) EXCEPT DISTINCT SELECT '1' FROM numbers(5); \ No newline at end of file diff --git a/tests/queries/0_stateless/02004_intersect_except_distinct_operators.reference b/tests/queries/0_stateless/02004_intersect_except_distinct_operators.reference new file mode 100644 index 00000000000..49529aa7683 --- /dev/null +++ b/tests/queries/0_stateless/02004_intersect_except_distinct_operators.reference @@ -0,0 +1,133 @@ +-- { echo } + +set intersect_default_mode = 'DISTINCT'; +set except_default_mode = 'DISTINCT'; +select 1 intersect select 1; +1 +select 2 intersect select 1; +select 1 except select 1; +select 2 except select 1; +2 +select 5 from numbers(20) intersect select number from numbers(5, 5); +5 +select number from numbers(10) except select number from numbers(5); +5 +6 +7 +8 +9 +select number, number+10 from numbers(12) except select number+5, number+15 from numbers(10); +0 10 +1 11 +2 12 +3 13 +4 14 +select 1 except select 2 intersect select 1; +1 +select 1 except select 2 intersect select 2; +1 +select 1 intersect select 1 except select 2; +1 +select 1 intersect select 1 except select 1; +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 1; +1 +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 2; +1 +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 2 except select 1; +select number%3 from numbers(10) except select 1; +0 +2 +select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20); +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +select * from (select 1 intersect select 1); +1 +with (select number from numbers(10) intersect select 5) as a select a * 10; +50 +with (select 5 except select 1) as a select a except select 5; +with (select number from numbers(10) intersect select 5) as a select a intersect select 1; +with (select number from numbers(10) intersect select 5) as a select a except select 1; +5 +select count() from (select number from numbers(10) except select 5); +9 +select count() from (select number from numbers(1000000) intersect select number from numbers(200000, 600000)); +600000 +select count() from (select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20)); +20 +select count() from (select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20) union all select number from numbers(100, 10)); +30 +select count() from (select number from numbers(1000000) intersect select number from numbers(200000, 600000) except select number from numbers(300000, 200000) except select number from numbers(600000, 200000)); +200000 +select count() from (select 1 intersect select 1) limit 100; +1 +select count() from (select 1 except select 2) limit 100; +1 +with (select count() from (select 1 union distinct select 2 except select 1)) as max +select count() from (select 1 union all select max) limit 100; +2 +select 1 union all select 1 intersect select 1; +1 +1 +select 1 union all select 1 intersect select 2; +1 +select * from (select 1 union all select 2 union all select 3 union all select 4 except select 3 union all select 5) order by 1; +1 +2 +4 +5 +select * from (select 1 union all select 2 union all select 3 union 
all select 4 intersect select 3 union all select 5) order by 1; +1 +2 +3 +5 +select * from (select 1 union all select 2 union all select 3 union all select 4 intersect select 3 union all select 5 except select 1) order by 1; +2 +3 +5 +select 1 intersect (select 1 except select 2); +1 +select 1 union all select 2 except (select 2 except select 1 union all select 1) except select 4; +select 1 intersect select count() from (select 1 except select 2 intersect select 2 union all select 1); +explain syntax select 1 intersect select 1; +SELECT 1 +INTERSECT DISTINCT +SELECT 1 +explain syntax select 1 except select 1; +SELECT 1 +EXCEPT DISTINCT +SELECT 1 +explain syntax select 1 union all select 2 except (select 2 except select 1 union all select 1) except select 4; +SELECT 1 +UNION ALL +SELECT 2 +EXCEPT DISTINCT +SELECT 2 +EXCEPT DISTINCT +SELECT 1 +UNION ALL +SELECT 1 +EXCEPT DISTINCT +SELECT 4 +set limit=1; +select 1 intersect select 1; +1 +(((select 1) intersect select 1)); +1 diff --git a/tests/queries/0_stateless/02004_intersect_except_distinct_operators.sql b/tests/queries/0_stateless/02004_intersect_except_distinct_operators.sql new file mode 100644 index 00000000000..efb48e59f3d --- /dev/null +++ b/tests/queries/0_stateless/02004_intersect_except_distinct_operators.sql @@ -0,0 +1,58 @@ +-- { echo } + +set intersect_default_mode = 'DISTINCT'; +set except_default_mode = 'DISTINCT'; + +select 1 intersect select 1; +select 2 intersect select 1; +select 1 except select 1; +select 2 except select 1; + +select 5 from numbers(20) intersect select number from numbers(5, 5); +select number from numbers(10) except select number from numbers(5); +select number, number+10 from numbers(12) except select number+5, number+15 from numbers(10); + +select 1 except select 2 intersect select 1; +select 1 except select 2 intersect select 2; +select 1 intersect select 1 except select 2; +select 1 intersect select 1 except select 1; +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 1; +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 2; +select 1 intersect select 1 except select 2 intersect select 1 except select 3 intersect select 2 except select 1; + +select number%3 from numbers(10) except select 1; +select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20); + +select * from (select 1 intersect select 1); +with (select number from numbers(10) intersect select 5) as a select a * 10; +with (select 5 except select 1) as a select a except select 5; +with (select number from numbers(10) intersect select 5) as a select a intersect select 1; +with (select number from numbers(10) intersect select 5) as a select a except select 1; +select count() from (select number from numbers(10) except select 5); +select count() from (select number from numbers(1000000) intersect select number from numbers(200000, 600000)); +select count() from (select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20)); +select count() from (select number from numbers(100) intersect select number from numbers(20, 60) except select number from numbers(30, 20) except select number from numbers(60, 20) union all select number from numbers(100, 10)); +select count() from (select number from numbers(1000000) intersect select number from numbers(200000, 
600000) except select number from numbers(300000, 200000) except select number from numbers(600000, 200000)); + +select count() from (select 1 intersect select 1) limit 100; +select count() from (select 1 except select 2) limit 100; +with (select count() from (select 1 union distinct select 2 except select 1)) as max +select count() from (select 1 union all select max) limit 100; + +select 1 union all select 1 intersect select 1; +select 1 union all select 1 intersect select 2; +select * from (select 1 union all select 2 union all select 3 union all select 4 except select 3 union all select 5) order by 1; +select * from (select 1 union all select 2 union all select 3 union all select 4 intersect select 3 union all select 5) order by 1; +select * from (select 1 union all select 2 union all select 3 union all select 4 intersect select 3 union all select 5 except select 1) order by 1; + +select 1 intersect (select 1 except select 2); +select 1 union all select 2 except (select 2 except select 1 union all select 1) except select 4; +select 1 intersect select count() from (select 1 except select 2 intersect select 2 union all select 1); + +explain syntax select 1 intersect select 1; +explain syntax select 1 except select 1; +explain syntax select 1 union all select 2 except (select 2 except select 1 union all select 1) except select 4; + +set limit=1; +select 1 intersect select 1; +(((select 1) intersect select 1)); diff --git a/tests/queries/0_stateless/02004_intersect_except_operators.reference b/tests/queries/0_stateless/02004_intersect_except_operators.reference index a96a6bc7264..0acb550c1e8 100644 --- a/tests/queries/0_stateless/02004_intersect_except_operators.reference +++ b/tests/queries/0_stateless/02004_intersect_except_operators.reference @@ -116,23 +116,23 @@ select 1 union all select 2 except (select 2 except select 1 union all select 1 select 1 intersect select count() from (select 1 except select 2 intersect select 2 union all select 1); explain syntax select 1 intersect select 1; SELECT 1 -INTERSECT +INTERSECT ALL SELECT 1 explain syntax select 1 except select 1; SELECT 1 -EXCEPT +EXCEPT ALL SELECT 1 explain syntax select 1 union all select 2 except (select 2 except select 1 union all select 1) except select 4; SELECT 1 UNION ALL SELECT 2 -EXCEPT +EXCEPT ALL SELECT 2 -EXCEPT +EXCEPT ALL SELECT 1 UNION ALL SELECT 1 -EXCEPT +EXCEPT ALL SELECT 4 set limit=1; select 1 intersect select 1; diff --git a/tests/integration/02044_exists_operator.reference b/tests/queries/0_stateless/02044_exists_operator.reference similarity index 100% rename from tests/integration/02044_exists_operator.reference rename to tests/queries/0_stateless/02044_exists_operator.reference diff --git a/tests/integration/02044_exists_operator.sql b/tests/queries/0_stateless/02044_exists_operator.sql similarity index 100% rename from tests/integration/02044_exists_operator.sql rename to tests/queries/0_stateless/02044_exists_operator.sql diff --git a/tests/queries/0_stateless/02131_remove_columns_in_subquery.reference b/tests/queries/0_stateless/02131_remove_columns_in_subquery.reference index d00491fd7e5..6ed281c757a 100644 --- a/tests/queries/0_stateless/02131_remove_columns_in_subquery.reference +++ b/tests/queries/0_stateless/02131_remove_columns_in_subquery.reference @@ -1 +1,2 @@ 1 +1 diff --git a/tests/queries/0_stateless/02131_remove_columns_in_subquery.sql b/tests/queries/0_stateless/02131_remove_columns_in_subquery.sql index f9ca2269aad..c765d989d1d 100644 --- 
a/tests/queries/0_stateless/02131_remove_columns_in_subquery.sql +++ b/tests/queries/0_stateless/02131_remove_columns_in_subquery.sql @@ -1 +1,2 @@ -select count(1) from (SELECT 1 AS a, count(1) FROM numbers(5)) +select count(1) from (SELECT 1 AS a, count(1) FROM numbers(5)); +select count(1) from (SELECT 1 AS a, count(1) + 1 FROM numbers(5)); \ No newline at end of file diff --git a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql index b6b497b4b55..e4624fffd48 100644 --- a/tests/queries/0_stateless/02161_addressToLineWithInlines.sql +++ b/tests/queries/0_stateless/02161_addressToLineWithInlines.sql @@ -1,4 +1,5 @@ --- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64 +-- Tags: no-tsan, no-asan, no-ubsan, no-msan, no-debug, no-cpu-aarch64, disabled +-- Tag disabled: Parsing inlines may lead to "could not find abbreviation code" (FIXME) SET allow_introspection_functions = 0; SELECT addressToLineWithInlines(1); -- { serverError 446 } diff --git a/tests/queries/0_stateless/02277_full_sort_join_misc.sql b/tests/queries/0_stateless/02277_full_sort_join_misc.sql index b4e3882edaf..4297f532b98 100644 --- a/tests/queries/0_stateless/02277_full_sort_join_misc.sql +++ b/tests/queries/0_stateless/02277_full_sort_join_misc.sql @@ -1,24 +1,24 @@ SET join_algorithm = 'full_sorting_merge'; -SELECT * FROM (SELECT 1 as key) AS t1 JOIN (SELECT 1 as key) t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 1 as key) AS t1 JOIN (SELECT 1 as key) t2 ON t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT 1 as key) AS t1 JOIN (SELECT 1 as key) t2 USING key; +SELECT * FROM (SELECT 1 as key) AS t1 JOIN (SELECT 1 as key) t2 USING key ORDER BY key; -SELECT * FROM (SELECT 1 :: UInt32 as key) AS t1 FULL JOIN (SELECT 1 :: Nullable(UInt32) as key) t2 USING (key); +SELECT * FROM (SELECT 1 :: UInt32 as key) AS t1 FULL JOIN (SELECT 1 :: Nullable(UInt32) as key) t2 USING (key) ORDER BY key; -SELECT * FROM (SELECT 1 :: UInt32 as key) AS t1 FULL JOIN (SELECT NULL :: Nullable(UInt32) as key) t2 USING (key); +SELECT * FROM (SELECT 1 :: UInt32 as key) AS t1 FULL JOIN (SELECT NULL :: Nullable(UInt32) as key) t2 USING (key) ORDER BY key; -SELECT * FROM (SELECT 1 :: Int32 as key) AS t1 JOIN (SELECT 1 :: UInt32 as key) t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 1 :: Int32 as key) AS t1 JOIN (SELECT 1 :: UInt32 as key) t2 ON t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT -1 :: Nullable(Int32) as key) AS t1 FULL JOIN (SELECT 4294967295 :: UInt32 as key) t2 ON t1.key = t2.key; +SELECT * FROM (SELECT -1 :: Nullable(Int32) as key) AS t1 FULL JOIN (SELECT 4294967295 :: UInt32 as key) t2 ON t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT 'a' :: LowCardinality(String) AS key) AS t1 JOIN (SELECT 'a' :: String AS key) AS t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 'a' :: LowCardinality(String) AS key) AS t1 JOIN (SELECT 'a' :: String AS key) AS t2 ON t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT 'a' :: LowCardinality(Nullable(String)) AS key) AS t1 JOIN (SELECT 'a' :: String AS key) AS t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 'a' :: LowCardinality(Nullable(String)) AS key) AS t1 JOIN (SELECT 'a' :: String AS key) AS t2 ON t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT 'a' :: LowCardinality(Nullable(String)) AS key) AS t1 JOIN (SELECT 'a' :: Nullable(String) AS key) AS t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 'a' :: LowCardinality(Nullable(String)) AS key) AS t1 JOIN (SELECT 'a' :: Nullable(String) AS key) AS t2 ON 
t1.key = t2.key ORDER BY key; -SELECT * FROM (SELECT 'a' :: LowCardinality(String) AS key) AS t1 JOIN (SELECT 'a' :: LowCardinality(String) AS key) AS t2 ON t1.key = t2.key; +SELECT * FROM (SELECT 'a' :: LowCardinality(String) AS key) AS t1 JOIN (SELECT 'a' :: LowCardinality(String) AS key) AS t2 ON t1.key = t2.key ORDER BY key; -SELECT 5 == count() FROM (SELECT number as a from numbers(5)) as t1 LEFT JOIN (SELECT number as b from numbers(5) WHERE number > 100) as t2 ON t1.a = t2.b; -SELECT 5 == count() FROM (SELECT number as a from numbers(5) WHERE number > 100) as t1 RIGHT JOIN (SELECT number as b from numbers(5)) as t2 ON t1.a = t2.b; +SELECT 5 == count() FROM (SELECT number as a from numbers(5)) as t1 LEFT JOIN (SELECT number as b from numbers(5) WHERE number > 100) as t2 ON t1.a = t2.b ORDER BY 1; +SELECT 5 == count() FROM (SELECT number as a from numbers(5) WHERE number > 100) as t1 RIGHT JOIN (SELECT number as b from numbers(5)) as t2 ON t1.a = t2.b ORDER BY 1; diff --git a/tests/queries/0_stateless/02337_join_analyze_stuck.reference b/tests/queries/0_stateless/02337_join_analyze_stuck.reference new file mode 100644 index 00000000000..4c8ac5b69c5 --- /dev/null +++ b/tests/queries/0_stateless/02337_join_analyze_stuck.reference @@ -0,0 +1,26 @@ +WITH + x AS + ( + SELECT number + FROM numbers(10) + ), + cross_sales AS + ( + SELECT 1 AS xx + FROM + x, + x AS d1, + x AS d2, + x AS d3, + x AS d4, + x AS d5, + x AS d6, + x AS d7, + x AS d8, + x AS d9 + WHERE x.number = d9.number + ) +SELECT xx +FROM +cross_sales +WHERE xx = 2000 diff --git a/tests/queries/0_stateless/02337_join_analyze_stuck.sql b/tests/queries/0_stateless/02337_join_analyze_stuck.sql new file mode 100644 index 00000000000..62dd0888673 --- /dev/null +++ b/tests/queries/0_stateless/02337_join_analyze_stuck.sql @@ -0,0 +1,13 @@ +-- Tags: long + +-- https://github.com/ClickHouse/ClickHouse/issues/21557 + +EXPLAIN SYNTAX +WITH + x AS ( SELECT number FROM numbers(10) ), + cross_sales AS ( + SELECT 1 AS xx + FROM x, x AS d1, x AS d2, x AS d3, x AS d4, x AS d5, x AS d6, x AS d7, x AS d8, x AS d9 + WHERE x.number = d9.number + ) +SELECT xx FROM cross_sales WHERE xx = 2000; diff --git a/tests/queries/0_stateless/02343_aggregation_pipeline.reference b/tests/queries/0_stateless/02343_aggregation_pipeline.reference index 67bd9c414ba..ec9a394d05d 100644 --- a/tests/queries/0_stateless/02343_aggregation_pipeline.reference +++ b/tests/queries/0_stateless/02343_aggregation_pipeline.reference @@ -1,5 +1,22 @@ -- { echoOn } +explain pipeline select * from (select * from numbers(1e8) group by number) group by number; +(Expression) +ExpressionTransform × 16 + (Aggregating) + Resize 16 → 16 + AggregatingTransform × 16 + StrictResize 16 → 16 + (Expression) + ExpressionTransform × 16 + (Aggregating) + Resize 1 → 16 + AggregatingTransform + (Expression) + ExpressionTransform + (ReadFromStorage) + Limit + Numbers 0 → 1 explain pipeline select * from (select * from numbers_mt(1e8) group by number) group by number; (Expression) ExpressionTransform × 16 diff --git a/tests/queries/0_stateless/02343_aggregation_pipeline.sql b/tests/queries/0_stateless/02343_aggregation_pipeline.sql index d259889b042..85e9fd1be1e 100644 --- a/tests/queries/0_stateless/02343_aggregation_pipeline.sql +++ b/tests/queries/0_stateless/02343_aggregation_pipeline.sql @@ -1,9 +1,12 @@ set max_threads = 16; set prefer_localhost_replica = 1; set optimize_aggregation_in_order = 0; +set max_block_size = 65505; -- { echoOn } +explain pipeline select * from (select * from 
numbers(1e8) group by number) group by number; + explain pipeline select * from (select * from numbers_mt(1e8) group by number) group by number; explain pipeline select * from (select * from numbers_mt(1e8) group by number) order by number; diff --git a/tests/queries/0_stateless/02343_group_by_use_nulls.reference b/tests/queries/0_stateless/02343_group_by_use_nulls.reference index 24b7bb5277c..c694b7abfa1 100644 --- a/tests/queries/0_stateless/02343_group_by_use_nulls.reference +++ b/tests/queries/0_stateless/02343_group_by_use_nulls.reference @@ -213,3 +213,27 @@ SETTINGS group_by_use_nulls=1; \N \N 45 0 0 45 +SELECT + number, + number % 2, + sum(number) AS val +FROM numbers(10) +GROUP BY + GROUPING SETS ( + (number), + (number % 2) + ) +ORDER BY 1, tuple(val) +SETTINGS group_by_use_nulls = 1, max_bytes_before_external_sort=10; +0 \N 0 +1 \N 1 +2 \N 2 +3 \N 3 +4 \N 4 +5 \N 5 +6 \N 6 +7 \N 7 +8 \N 8 +9 \N 9 +\N 0 20 +\N 1 25 diff --git a/tests/queries/0_stateless/02343_group_by_use_nulls.sql b/tests/queries/0_stateless/02343_group_by_use_nulls.sql index a14db824013..a979a78be0d 100644 --- a/tests/queries/0_stateless/02343_group_by_use_nulls.sql +++ b/tests/queries/0_stateless/02343_group_by_use_nulls.sql @@ -60,3 +60,16 @@ FROM numbers(10) GROUP BY CUBE(number, number % 2) WITH TOTALS ORDER BY (number, number % 2, val) SETTINGS group_by_use_nulls=1; + +SELECT + number, + number % 2, + sum(number) AS val +FROM numbers(10) +GROUP BY + GROUPING SETS ( + (number), + (number % 2) + ) +ORDER BY 1, tuple(val) +SETTINGS group_by_use_nulls = 1, max_bytes_before_external_sort=10; diff --git a/tests/queries/0_stateless/02355_control_block_size_in_aggregator.reference b/tests/queries/0_stateless/02355_control_block_size_in_aggregator.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02355_control_block_size_in_aggregator.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02355_control_block_size_in_aggregator.sql b/tests/queries/0_stateless/02355_control_block_size_in_aggregator.sql new file mode 100644 index 00000000000..b4754c6d6fe --- /dev/null +++ b/tests/queries/0_stateless/02355_control_block_size_in_aggregator.sql @@ -0,0 +1,9 @@ +SET max_block_size = 4213; + +SELECT DISTINCT (blockSize() <= 4213) +FROM +( + SELECT number + FROM numbers(100000) + GROUP BY number +); diff --git a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference index f46cdb6e5e3..1da5cd0b7b3 100644 --- a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference +++ b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.reference @@ -7,3 +7,10 @@ UInt64 String DateTime Map(UUID, Array(Float32)) 13 str 2022-08-04 18:30:53 {'10':[11,12],'13':[14,15]} 1 1 +_CAST(42, \'Int64\') Int64 +_CAST([1, 2, 3], \'Array(UInt8)\') Array(UInt8) +_CAST(((\'abc\', 22), (\'def\', 33)), \'Map(String, UInt8)\') Map(String, UInt8) +_CAST([[4, 5, 6], [7], [8, 9]], \'Array(Array(UInt8))\') Array(Array(UInt8)) +_CAST(((10, [11, 12]), (13, [14, 15])), \'Map(UInt8, Array(UInt8))\') Map(UInt8, Array(UInt8)) +_CAST(((\'ghj\', ((\'klm\', [16, 17]))), (\'nop\', ((\'rst\', [18])))), \'Map(String, Map(String, Array(UInt8)))\') Map(String, Map(String, Array(UInt8))) +a Int8 diff --git a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh index 
335af1bb6e6..e61dc337d2a 100755 --- a/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh +++ b/tests/queries/0_stateless/02377_extend_protocol_with_query_parameters.sh @@ -68,13 +68,27 @@ $CLICKHOUSE_CLIENT -n -q "select {n: UInt8} -- { serverError 456 }" $CLICKHOUSE_CLIENT -n -q "set param_n = 12; set param_n = 13; select {n: UInt8}" -# but multiple different parameters could be defined within each session +# multiple different parameters could be defined within each session $CLICKHOUSE_CLIENT -n -q " set param_a = 13, param_b = 'str'; set param_c = '2022-08-04 18:30:53'; set param_d = '{\'10\': [11, 12], \'13\': [14, 15]}'; select {a: UInt32}, {b: String}, {c: DateTime}, {d: Map(String, Array(UInt8))}" + # empty parameter name is not allowed $CLICKHOUSE_CLIENT --param_="" -q "select 1" 2>&1 | grep -c 'Code: 36' $CLICKHOUSE_CLIENT -q "set param_ = ''" 2>&1 | grep -c 'Code: 36' + + +# parameters are also supported for DESCRIBE TABLE queries +$CLICKHOUSE_CLIENT \ + --param_id="42" \ + --param_arr="[1, 2, 3]" \ + --param_map="{'abc': 22, 'def': 33}" \ + --param_mul_arr="[[4, 5, 6], [7], [8, 9]]" \ + --param_map_arr="{10: [11, 12], 13: [14, 15]}" \ + --param_map_map_arr="{'ghj': {'klm': [16, 17]}, 'nop': {'rst': [18]}}" \ + -q "describe table(select {id: Int64}, {arr: Array(UInt8)}, {map: Map(String, UInt8)}, {mul_arr: Array(Array(UInt8))}, {map_arr: Map(UInt8, Array(UInt8))}, {map_map_arr: Map(String, Map(String, Array(UInt8)))})" + +$CLICKHOUSE_CLIENT --param_p=42 -q "describe table (select * from (select {p:Int8} as a group by a) order by a)" diff --git a/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.reference b/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.reference new file mode 100644 index 00000000000..579caf88d46 --- /dev/null +++ b/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.reference @@ -0,0 +1,21 @@ +1 +2 +3 +1 +2 +3 +1 +1 +1 +1 +1 +1 +1 +2 +3 +1 +2 +3 +1 +2 +3 diff --git a/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.sql b/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.sql new file mode 100644 index 00000000000..803607f526e --- /dev/null +++ b/tests/queries/0_stateless/02377_majority_insert_quorum_zookeeper_long.sql @@ -0,0 +1,72 @@ +-- Tags: long, zookeeper, no-replicated-database + +-- no-replicated-database: +-- The number of replicas is doubled, so `SYSTEM STOP FETCHES` does not stop enough replicas. 
+ +SET insert_quorum_parallel = false; + +SET select_sequential_consistency = 1; + +DROP TABLE IF EXISTS quorum1; +DROP TABLE IF EXISTS quorum2; +DROP TABLE IF EXISTS quorum3; + +CREATE TABLE quorum1(x UInt32, y Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02377/quorum', '1') ORDER BY x PARTITION BY y; +CREATE TABLE quorum2(x UInt32, y Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02377/quorum', '2') ORDER BY x PARTITION BY y; + +-- insert_quorum = n/2 + 1, so the insert will be written to both replicas +SET insert_quorum = 'auto'; + +INSERT INTO quorum1 VALUES (1, '2018-11-15'); +INSERT INTO quorum1 VALUES (2, '2018-11-15'); +INSERT INTO quorum1 VALUES (3, '2018-12-16'); + +SELECT x FROM quorum1 ORDER BY x; +SELECT x FROM quorum2 ORDER BY x; + +DROP TABLE quorum1; +DROP TABLE quorum2; + +-- Create 3 replicas and stop syncing 2 replicas +CREATE TABLE quorum1(x UInt32, y Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02377/quorum1', '1') ORDER BY x PARTITION BY y; +CREATE TABLE quorum2(x UInt32, y Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02377/quorum1', '2') ORDER BY x PARTITION BY y; +CREATE TABLE quorum3(x UInt32, y Date) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test_02377/quorum1', '3') ORDER BY x PARTITION BY y; + +-- Insert should be successful +-- stop replica 3 +SYSTEM STOP FETCHES quorum3; +INSERT INTO quorum1 VALUES (1, '2018-11-15'); +SELECT x FROM quorum1 ORDER BY x; +SELECT x FROM quorum2 ORDER BY x; +SELECT x FROM quorum3 ORDER BY x; -- {serverError 289} + +-- Sync replica 3 +SYSTEM START FETCHES quorum3; +SYSTEM SYNC REPLICA quorum3; +SELECT x FROM quorum3 ORDER BY x; + +-- Stop 2 replicas, so the insert won't be successful +SYSTEM STOP FETCHES quorum2; +SYSTEM STOP FETCHES quorum3; +SET insert_quorum_timeout = 5000; +INSERT INTO quorum1 VALUES (2, '2018-11-15'); -- { serverError 319 } +SELECT x FROM quorum1 ORDER BY x; +SELECT x FROM quorum2 ORDER BY x; +SELECT x FROM quorum3 ORDER BY x; + +-- Sync replicas 2 and 3 +SYSTEM START FETCHES quorum2; +SYSTEM SYNC REPLICA quorum2; +SYSTEM START FETCHES quorum3; +SYSTEM SYNC REPLICA quorum3; + +INSERT INTO quorum1 VALUES (3, '2018-11-15'); +SELECT x FROM quorum1 ORDER BY x; +SYSTEM SYNC REPLICA quorum2; +SYSTEM SYNC REPLICA quorum3; +SELECT x FROM quorum2 ORDER BY x; +SELECT x FROM quorum3 ORDER BY x; + +DROP TABLE quorum1; +DROP TABLE quorum2; +DROP TABLE quorum3; diff --git a/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference b/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference index c32f227006c..1ad64150049 100644 --- a/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference +++ b/tests/queries/0_stateless/02377_optimize_sorting_by_input_stream_properties_explain.reference @@ -1,8 +1,8 @@ -- EXPLAIN PLAN sorting for MergeTree w/o sorting key -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a -Sorting (Stream): a ASC +Sorting (Global): a ASC Sorting (Sorting for ORDER BY) -Sorting (Stream): a ASC +Sorting (Global): a ASC Sorting (None) Sorting (None) -- disable optimization -> sorting order is NOT propagated from subquery -> full sort @@ -20,22 +20,22 @@ LimitsCheckingTransform PartialSortingTransform -- ExpressionStep preserves sort mode -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM 
optimize_sorting ORDER BY a -Sorting (Stream): a ASC +Sorting (Global): a ASC Sorting +Sorting (Global): a ASC +Sorting (Stream): a ASC Sorting (Stream): a ASC -Sorting (Port): a ASC -Sorting (Port): a ASC -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM optimize_sorting ORDER BY a+1 Sorting (None) Sorting (Sorting for ORDER BY) -Sorting (Stream): plus(a, 1) ASC +Sorting (Global): plus(a, 1) ASC Sorting (Chunk): a ASC Sorting (Chunk): a ASC -- ExpressionStep breaks sort mode -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a+1 FROM optimize_sorting ORDER BY a+1 -Sorting (Stream): plus(a, 1) ASC +Sorting (Global): plus(a, 1) ASC Sorting (Sorting for ORDER BY) -Sorting (Stream): plus(a, 1) ASC +Sorting (Global): plus(a, 1) ASC Sorting (None) Sorting (Chunk): a ASC -- FilterStep preserves sort mode @@ -62,28 +62,28 @@ Sorting (None) Sorting (Chunk): a ASC -- aliases break sorting order -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a FROM (SELECT sipHash64(a) AS a FROM (SELECT a FROM optimize_sorting ORDER BY a)) ORDER BY a -Sorting (Stream): a ASC +Sorting (Global): a ASC Sorting (Sorting for ORDER BY) -Sorting (Stream): a ASC +Sorting (Global): a ASC Sorting (None) Sorting +Sorting (Global): a ASC +Sorting (Stream): a ASC Sorting (Stream): a ASC -Sorting (Port): a ASC -Sorting (Port): a ASC -- aliases DONT break sorting order -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a, b FROM (SELECT x AS a, y AS b FROM (SELECT a AS x, b AS y FROM optimize_sorting) ORDER BY x, y) -Sorting (Stream): x ASC, y ASC +Sorting (Global): x ASC, y ASC Sorting (Sorting for ORDER BY) -Sorting (Stream): x ASC, y ASC +Sorting (Global): x ASC, y ASC Sorting (Chunk): a ASC, b ASC Sorting (Chunk): a ASC, b ASC -- actions chain breaks sorting order: input(column a)->sipHash64(column a)->alias(sipHash64(column a), a)->plus(alias a, 1) -- QUERY: set optimize_read_in_order=1;EXPLAIN PLAN actions=1, header=1, sorting=1 SELECT a, z FROM (SELECT sipHash64(a) AS a, a + 1 AS z FROM (SELECT a FROM optimize_sorting ORDER BY a + 1)) ORDER BY a + 1 Sorting (None) Sorting (Sorting for ORDER BY) -Sorting (Stream): plus(a, 1) ASC +Sorting (Global): plus(a, 1) ASC Sorting (None) Sorting (Sorting for ORDER BY) -Sorting (Stream): plus(a, 1) ASC +Sorting (Global): plus(a, 1) ASC Sorting (Chunk): a ASC Sorting (Chunk): a ASC diff --git a/tests/queries/0_stateless/02381_intersect_except_const_column.reference b/tests/queries/0_stateless/02381_intersect_except_const_column.reference deleted file mode 100644 index 290835b412e..00000000000 --- a/tests/queries/0_stateless/02381_intersect_except_const_column.reference +++ /dev/null @@ -1,61 +0,0 @@ -fooooo -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 diff --git a/tests/queries/0_stateless/02381_intersect_except_const_column.sql b/tests/queries/0_stateless/02381_intersect_except_const_column.sql deleted file mode 100644 index b10f913dd1e..00000000000 --- a/tests/queries/0_stateless/02381_intersect_except_const_column.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Test: crash the server -SELECT 'fooooo' INTERSECT SELECT 'fooooo'; -SELECT 'fooooo' EXCEPT SELECT 'fooooo'; - --- Test: intersect return incorrect result for const column -SELECT 1 FROM numbers(10) INTERSECT SELECT 1 FROM numbers(10); 
-SELECT toString(1) FROM numbers(10) INTERSECT SELECT toString(1) FROM numbers(10); -SELECT '1' FROM numbers(10) INTERSECT SELECT '1' FROM numbers(10); - --- Test: except return incorrect result for const column -SELECT 2 FROM numbers(10) EXCEPT SELECT 1 FROM numbers(5); -SELECT toString(2) FROM numbers(10) EXCEPT SELECT toString(1) FROM numbers(5); -SELECT '2' FROM numbers(10) EXCEPT SELECT '1' FROM numbers(5); \ No newline at end of file diff --git a/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql b/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql index a418a06803f..4ed6d965292 100644 --- a/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql +++ b/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql @@ -7,6 +7,8 @@ USING (key); SET join_algorithm = 'full_sorting_merge'; +SET max_rows_in_set_to_optimize_join = 0; + EXPLAIN actions=0, description=0, header=1 SELECT * FROM ( SELECT 'key2' AS key ) AS s1 JOIN ( SELECT 'key1' AS key, '1' AS value UNION ALL SELECT 'key2' AS key, '1' AS value ) AS s2 diff --git a/tests/queries/0_stateless/02382_join_and_filtering_set.reference b/tests/queries/0_stateless/02382_join_and_filtering_set.reference new file mode 100644 index 00000000000..58c8ccca6a2 --- /dev/null +++ b/tests/queries/0_stateless/02382_join_and_filtering_set.reference @@ -0,0 +1,7 @@ +106 +46 +42 +51 +42 +24 +10 diff --git a/tests/queries/0_stateless/02382_join_and_filtering_set.sql b/tests/queries/0_stateless/02382_join_and_filtering_set.sql new file mode 100644 index 00000000000..4b425f22c87 --- /dev/null +++ b/tests/queries/0_stateless/02382_join_and_filtering_set.sql @@ -0,0 +1,20 @@ +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y +AS SELECT sipHash64(number, 't1_x') % 100 AS x, sipHash64(number, 't1_y') % 100 AS y FROM numbers(100); + +CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y +AS SELECT sipHash64(number, 't2_x') % 100 AS x, sipHash64(number, 't2_y') % 100 AS y FROM numbers(100); + +SET max_rows_in_set_to_optimize_join = 1000; +SET join_algorithm = 'full_sorting_merge'; + +-- different combinations of conditions on key/attribute columns for the left/right tables +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 2 == 0; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.x % 2 == 0; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t2.y % 2 == 0; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t2.x % 2 == 0; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 2 == 0 AND t2.y % 2 == 0; +SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.x % 2 == 0 AND t2.x % 2 == 0 AND t1.y % 2 == 0 AND t2.y % 2 == 0; diff --git a/tests/queries/0_stateless/02383_join_and_filtering_set.reference b/tests/queries/0_stateless/02383_join_and_filtering_set.reference new file mode 100644 index 00000000000..2ad282ca07f --- /dev/null +++ b/tests/queries/0_stateless/02383_join_and_filtering_set.reference @@ -0,0 +1,10 @@ +Ok +Ok +Ok +Ok +Ok +Ok +Ok +Ok +Ok +Ok diff --git a/tests/queries/0_stateless/02383_join_and_filtering_set.sh b/tests/queries/0_stateless/02383_join_and_filtering_set.sh new file mode 100755 index 00000000000..3356be58ff7 --- /dev/null +++ b/tests/queries/0_stateless/02383_join_and_filtering_set.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash +# Tags: no-asan,no-msan,no-tsan,no-ubsan +# +# Test doesn't run complex queries, just test the logic of setting, so no need to run with 
different builds. +# Also, we run similar queries in 02382_join_and_filtering_set.sql which is enabled for these builds. +# + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -mn -q """ +CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y +AS SELECT sipHash64(number, 't1_x') % 100 AS x, sipHash64(number, 't1_y') % 100 AS y FROM numbers(100); + +CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y +AS SELECT sipHash64(number, 't2_x') % 100 AS x, sipHash64(number, 't2_y') % 100 AS y FROM numbers(100); +""" + +# Arguments: +# - value of max_rows_in_set_to_optimize_join +# - join kind +# - expected number of steps in plan +# - expected number of steps in pipeline +function test() { + +PARAM_VALUE=$1 +JOIN_KIND=${2:-} + +EXPECTED_PLAN_STEPS=$3 +RES=$( + $CLICKHOUSE_CLIENT --max_rows_in_set_to_optimize_join=${PARAM_VALUE} --join_algorithm='full_sorting_merge' \ + -q "EXPLAIN PLAN SELECT count() FROM t1 ${JOIN_KIND} JOIN t2 ON t1.x = t2.x" | grep -o 'CreateSetAndFilterOnTheFlyStep' | wc -l +) +[ "$RES" -eq "$EXPECTED_PLAN_STEPS" ] && echo "Ok" || echo "Fail: $RES != $EXPECTED_PLAN_STEPS" + +EXPECTED_PIPELINE_STEPS=$4 +RES=$( + $CLICKHOUSE_CLIENT --max_rows_in_set_to_optimize_join=${PARAM_VALUE} --join_algorithm='full_sorting_merge' \ + -q "EXPLAIN PIPELINE SELECT count() FROM t1 ${JOIN_KIND} JOIN t2 ON t1.x = t2.x" \ + | grep -o -e ReadHeadBalancedProcessor -e FilterBySetOnTheFlyTransform -e CreatingSetsOnTheFlyTransform | wc -l +) +[ "$RES" -eq "$EXPECTED_PIPELINE_STEPS" ] && echo "Ok" || echo "Fail: $RES != $EXPECTED_PIPELINE_STEPS" + +} + +test 1000 '' 2 6 + +# no filtering for left/right side +test 1000 'LEFT' 2 5 +test 1000 'RIGHT' 2 5 + +# when disabled no extra steps should be created +test 1000 'FULL' 0 0 +test 0 '' 0 0 diff --git a/tests/queries/0_stateless/02400_memory_accounting_on_error.sql b/tests/queries/0_stateless/02400_memory_accounting_on_error.sql index 32046e854dd..f80c372b81c 100644 --- a/tests/queries/0_stateless/02400_memory_accounting_on_error.sql +++ b/tests/queries/0_stateless/02400_memory_accounting_on_error.sql @@ -1,4 +1,4 @@ -- max_block_size to avoid randomization --- SELECT * FROM generateRandom('i Array(Int8)', 1, 1, 1048577) LIMIT 65536 SETTINGS max_memory_usage='1Gi', max_block_size=65505, log_queries=1; -- { serverError MEMORY_LIMIT_EXCEEDED } --- SYSTEM FLUSH LOGS; --- SELECT * FROM system.query_log WHERE event_date >= yesterday() AND current_database = currentDatabase() AND memory_usage > 100e6; +SELECT * FROM generateRandom('i Array(Int8)', 1, 1, 1048577) LIMIT 65536 SETTINGS max_memory_usage='1Gi', max_block_size=65505, log_queries=1; -- { serverError MEMORY_LIMIT_EXCEEDED } +SYSTEM FLUSH LOGS; +SELECT * FROM system.query_log WHERE event_date >= yesterday() AND current_database = currentDatabase() AND memory_usage > 100e6 FORMAT JSONEachRow; diff --git a/tests/queries/0_stateless/02410_inmemory_wal_cleanup.reference b/tests/queries/0_stateless/02410_inmemory_wal_cleanup.reference new file mode 100644 index 00000000000..6727d83a6f4 --- /dev/null +++ b/tests/queries/0_stateless/02410_inmemory_wal_cleanup.reference @@ -0,0 +1,35 @@ +-- { echo } + +DROP TABLE IF EXISTS in_memory; +CREATE TABLE in_memory (a UInt32) ENGINE = MergeTree ORDER BY a SETTINGS min_rows_for_compact_part = 1000, min_bytes_for_wide_part = 10485760; +INSERT INTO in_memory VALUES (1); +INSERT INTO in_memory VALUES (2); +SELECT name, active, part_type FROM 
system.parts WHERE database = currentDatabase() AND table = 'in_memory'; +all_1_1_0 1 InMemory +all_2_2_0 1 InMemory +SELECT * FROM in_memory ORDER BY a; +1 +2 +-- no WAL remove since parts are still in use +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT name, active, part_type FROM system.parts WHERE database = currentDatabase() AND table = 'in_memory'; +all_1_1_0 1 InMemory +all_2_2_0 1 InMemory +SELECT * FROM in_memory ORDER BY a; +1 +2 +-- WAL should be removed, since on disk part covers all parts in WAL +OPTIMIZE TABLE in_memory; +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT name, active, part_type FROM system.parts WHERE database = currentDatabase() AND table = 'in_memory'; +all_1_2_1 1 Compact +-- check that the WAL will be reinitialized after remove +INSERT INTO in_memory VALUES (3); +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT * FROM in_memory ORDER BY a; +1 +2 +3 diff --git a/tests/queries/0_stateless/02410_inmemory_wal_cleanup.sql b/tests/queries/0_stateless/02410_inmemory_wal_cleanup.sql new file mode 100644 index 00000000000..0228852a115 --- /dev/null +++ b/tests/queries/0_stateless/02410_inmemory_wal_cleanup.sql @@ -0,0 +1,27 @@ +-- { echo } + +DROP TABLE IF EXISTS in_memory; + +CREATE TABLE in_memory (a UInt32) ENGINE = MergeTree ORDER BY a SETTINGS min_rows_for_compact_part = 1000, min_bytes_for_wide_part = 10485760; +INSERT INTO in_memory VALUES (1); +INSERT INTO in_memory VALUES (2); +SELECT name, active, part_type FROM system.parts WHERE database = currentDatabase() AND table = 'in_memory'; +SELECT * FROM in_memory ORDER BY a; + +-- no WAL remove since parts are still in use +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT name, active, part_type FROM system.parts WHERE database = currentDatabase() AND table = 'in_memory'; +SELECT * FROM in_memory ORDER BY a; + +-- WAL should be removed, since on disk part covers all parts in WAL +OPTIMIZE TABLE in_memory; +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT name, active, part_type FROM system.parts WHERE database = currentDatabase() AND table = 'in_memory'; + +-- check that the WAL will be reinitialized after remove +INSERT INTO in_memory VALUES (3); +DETACH TABLE in_memory; +ATTACH TABLE in_memory; +SELECT * FROM in_memory ORDER BY a; diff --git a/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.reference b/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.sql b/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.sql new file mode 100644 index 00000000000..c3475f37e7e --- /dev/null +++ b/tests/queries/0_stateless/02416_in_set_same_ast_diff_columns.sql @@ -0,0 +1,3 @@ +CREATE TABLE set_crash (key1 Int32, id1 Int64, c1 Int64) ENGINE = MergeTree PARTITION BY id1 ORDER BY key1; +INSERT INTO set_crash VALUES (-1, 1, 0); +SELECT 1 in (-1, 1) FROM set_crash WHERE (key1, id1) in (-1, 1); diff --git a/tests/queries/0_stateless/02416_json_object_inference.reference b/tests/queries/0_stateless/02416_json_object_inference.reference new file mode 100644 index 00000000000..01537523906 --- /dev/null +++ b/tests/queries/0_stateless/02416_json_object_inference.reference @@ -0,0 +1 @@ +a Object(Nullable(\'json\')) diff --git a/tests/queries/0_stateless/02416_json_object_inference.sql 
b/tests/queries/0_stateless/02416_json_object_inference.sql new file mode 100644 index 00000000000..b861468a08a --- /dev/null +++ b/tests/queries/0_stateless/02416_json_object_inference.sql @@ -0,0 +1,2 @@ +-- Tags: no-fasttest +desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); diff --git a/tests/queries/0_stateless/02416_rename_database_rbac.reference b/tests/queries/0_stateless/02416_rename_database_rbac.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02416_rename_database_rbac.sh b/tests/queries/0_stateless/02416_rename_database_rbac.sh new file mode 100755 index 00000000000..c319136d29c --- /dev/null +++ b/tests/queries/0_stateless/02416_rename_database_rbac.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Tags: no-parallel + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ +DROP DATABASE IF EXISTS database_02416; +CREATE DATABASE database_02416; + +DROP USER IF EXISTS user_test_02416; +CREATE USER user_test_02416 IDENTIFIED WITH plaintext_password BY 'user_test_02416'; + +GRANT CREATE DATABASE ON *.* TO 'user_test_02416' WITH GRANT OPTION; +GRANT DROP DATABASE ON *.* TO 'user_test_02416' WITH GRANT OPTION; +REVOKE DROP DATABASE ON database_02416.* FROM 'user_test_02416'; +GRANT CREATE TABLE ON *.* TO 'user_test_02416' WITH GRANT OPTION; +GRANT DROP TABLE ON *.* TO 'user_test_02416' WITH GRANT OPTION; +""" +${CLICKHOUSE_CLIENT} --multiline --multiquery --user user_test_02416 --password user_test_02416 -q """ +RENAME DATABASE user_test_02416 to aaaaaaaaa; -- { serverError 497 } +""" diff --git a/tests/queries/0_stateless/02416_rocksdb_delete_update.reference b/tests/queries/0_stateless/02416_rocksdb_delete_update.reference new file mode 100644 index 00000000000..8ca8c0ca5a2 --- /dev/null +++ b/tests/queries/0_stateless/02416_rocksdb_delete_update.reference @@ -0,0 +1,32 @@ +1 Some string 0 +2 Some other string 0 +3 random 0 +4 random2 0 +----------- +3 random 0 +4 random2 0 +----------- +3 random 0 +----------- +0 +----------- +1 String 10 +2 String 20 +3 String 30 +4 String 40 +----------- +1 String 10 +2 String 20 +3 Another 30 +4 Another 40 +----------- +1 String 10 +2 String 20 +3 Another 30 +4 Another 40 +----------- +1 String 102 +2 String 202 +3 Another 302 +4 Another 402 +----------- diff --git a/tests/queries/0_stateless/02416_rocksdb_delete_update.sql b/tests/queries/0_stateless/02416_rocksdb_delete_update.sql new file mode 100644 index 00000000000..28953a108d7 --- /dev/null +++ b/tests/queries/0_stateless/02416_rocksdb_delete_update.sql @@ -0,0 +1,42 @@ +-- Tags: no-ordinary-database, no-fasttest + +DROP TABLE IF EXISTS 02416_rocksdb; + +CREATE TABLE 02416_rocksdb (key UInt64, value String, value2 UInt64) Engine=EmbeddedRocksDB PRIMARY KEY(key); + +INSERT INTO 02416_rocksdb VALUES (1, 'Some string', 0), (2, 'Some other string', 0), (3, 'random', 0), (4, 'random2', 0); + +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +DELETE FROM 02416_rocksdb WHERE value LIKE 'Some%string'; + +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +ALTER TABLE 02416_rocksdb DELETE WHERE key >= 4; + +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +DELETE FROM 02416_rocksdb WHERE 1 = 1; +SELECT count() FROM 02416_rocksdb; +SELECT '-----------'; + +INSERT INTO 02416_rocksdb VALUES (1, 'String', 10), (2, 'String', 20), (3, 'String', 30), (4, 
'String', 40); +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +ALTER TABLE 02416_rocksdb UPDATE value = 'Another' WHERE key > 2; +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +ALTER TABLE 02416_rocksdb UPDATE key = key * 10 WHERE 1 = 1; -- { serverError 36 } +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +ALTER TABLE 02416_rocksdb UPDATE value2 = value2 * 10 + 2 WHERE 1 = 1; +SELECT * FROM 02416_rocksdb ORDER BY key; +SELECT '-----------'; + +DROP TABLE IF EXISTS 02416_rocksdb; diff --git a/tests/queries/0_stateless/02417_repeat_input_commands.expect b/tests/queries/0_stateless/02417_repeat_input_commands.expect new file mode 100755 index 00000000000..119aac68645 --- /dev/null +++ b/tests/queries/0_stateless/02417_repeat_input_commands.expect @@ -0,0 +1,81 @@ +#!/usr/bin/expect -f + +set basedir [file dirname $argv0] +set basename [file tail $argv0] +exp_internal -f $env(CLICKHOUSE_TMP)/$basename.debuglog 0 + +log_user 0 +set timeout 10 +match_max 100000 + +expect_after { + # Do not ignore eof from expect + eof { exp_continue } + # A default timeout action is to do nothing, change it to fail + timeout { exit 1 } +} + +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT --disable_suggestion" +expect ":) " + +# ----------------------------------------- +# test . and / commands prior to the first query + +send -- ".\r" +expect "Empty query" +expect ":) " + +send -- "/\r" +expect "Empty query" +expect ":) " + +# ----------------------------------------- +# test . and / commands after first query + +send -- "SELECT 123\r" +expect "│ 123 │" +expect "1 row in set." +expect ":) " + +send -- ".\r" +expect "│ 123 │" +expect "1 row in set." +expect ":) " + +# test input of . more than once in a row +send -- ".\r" +expect "│ 123 │" +expect "1 row in set." +expect ":) " + +send -- "/\r" +expect "│ 123 │" +expect "1 row in set." +expect ":) " + +# test input of / more than once in a row +send -- "/\r" +expect "│ 123 │" +expect "1 row in set." +expect ":) " + +# ----------------------------------------- +# test . and / commands after another query + +send -- "SELECT 321\r" +expect "│ 321 │" +expect "1 row in set." +expect ":) " + +send -- ".\r" +expect "│ 321 │" +expect "1 row in set." +expect ":) " + +send -- "/\r" +expect "│ 321 │" +expect "1 row in set." +expect ":) " + +send -- "quit\r" +expect eof diff --git a/tests/queries/0_stateless/02417_repeat_input_commands.reference b/tests/queries/0_stateless/02417_repeat_input_commands.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh new file mode 100755 index 00000000000..9b647ec984b --- /dev/null +++ b/tests/queries/0_stateless/02420_stracktrace_debug_symbols.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# NOTE: that this test uses stacktrace instead of addressToLineWithInlines() or +# similar, since that code (use / might use) different code path in Dwarf +# parser. 
+# +# Also note, that to rely on this test one should assume that CI packages uses +# ThinLTO builds. + +$CLICKHOUSE_LOCAL --stacktrace -q 'select throwIf(1)' |& grep -c 'Common/Exception.cpp:[0-9]*: DB::Exception::Exception' diff --git a/tests/queries/1_stateful/00175_obfuscator_schema_inference.reference b/tests/queries/1_stateful/00175_obfuscator_schema_inference.reference new file mode 100644 index 00000000000..bd7f726bffd --- /dev/null +++ b/tests/queries/1_stateful/00175_obfuscator_schema_inference.reference @@ -0,0 +1,4 @@ +403489 +1000 320 171 23 +2500 597 332 14 +2500 597 332 14 diff --git a/tests/queries/1_stateful/00175_obfuscator_schema_inference.sh b/tests/queries/1_stateful/00175_obfuscator_schema_inference.sh new file mode 100755 index 00000000000..8ff0d2fa648 --- /dev/null +++ b/tests/queries/1_stateful/00175_obfuscator_schema_inference.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Compared to explicitly specifying the structure of the input, +# schema inference adds Nullable(T) to all types, so the model and the results +# are a bit different from test '00175_obfuscator_schema_inference.sh' + +$CLICKHOUSE_CLIENT --max_threads 1 --query="SELECT URL, Title, SearchPhrase FROM test.hits LIMIT 1000" > "${CLICKHOUSE_TMP}"/data.tsv + +# Test obfuscator without saving the model +$CLICKHOUSE_OBFUSCATOR --input-format TSV --output-format TSV --seed hello --limit 2500 < "${CLICKHOUSE_TMP}"/data.tsv > "${CLICKHOUSE_TMP}"/data2500.tsv 2>/dev/null + +# Test obfuscator with saving the model +$CLICKHOUSE_OBFUSCATOR --input-format TSV --output-format TSV --seed hello --limit 0 --save "${CLICKHOUSE_TMP}"/model.bin < "${CLICKHOUSE_TMP}"/data.tsv 2>/dev/null +wc -c < "${CLICKHOUSE_TMP}"/model.bin +$CLICKHOUSE_OBFUSCATOR --input-format TSV --output-format TSV --seed hello --limit 2500 --load "${CLICKHOUSE_TMP}"/model.bin < "${CLICKHOUSE_TMP}"/data.tsv > "${CLICKHOUSE_TMP}"/data2500_load_from_model.tsv 2>/dev/null +rm "${CLICKHOUSE_TMP}"/model.bin + +$CLICKHOUSE_LOCAL --structure "URL String, Title String, SearchPhrase String" --input-format TSV --output-format TSV --query "SELECT count(), uniq(URL), uniq(Title), uniq(SearchPhrase) FROM table" < "${CLICKHOUSE_TMP}"/data.tsv +$CLICKHOUSE_LOCAL --structure "URL String, Title String, SearchPhrase String" --input-format TSV --output-format TSV --query "SELECT count(), uniq(URL), uniq(Title), uniq(SearchPhrase) FROM table" < "${CLICKHOUSE_TMP}"/data2500.tsv +$CLICKHOUSE_LOCAL --structure "URL String, Title String, SearchPhrase String" --input-format TSV --output-format TSV --query "SELECT count(), uniq(URL), uniq(Title), uniq(SearchPhrase) FROM table" < "${CLICKHOUSE_TMP}"/data2500_load_from_model.tsv + +rm "${CLICKHOUSE_TMP}"/data.tsv +rm "${CLICKHOUSE_TMP}"/data2500.tsv +rm "${CLICKHOUSE_TMP}"/data2500_load_from_model.tsv diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index fa178e764da..0a4aecad50e 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -251,6 +251,7 @@ dragonbox durations endian enum +exFAT fastops fcoverage filesystem diff --git a/utils/check-style/check-typos b/utils/check-style/check-typos index 3819b6785d7..9194a9464a7 100755 --- a/utils/check-style/check-typos +++ b/utils/check-style/check-typos @@ -5,7 +5,7 @@ ROOT_PATH=$(git rev-parse --show-toplevel) codespell 
\ - --skip "*generated*,*gperf*,*.bin,*.mrk*,*.idx,checksums.txt,*.dat,*.pyc,*.kate-swp,*obfuscateQueries.cpp,d3-*.js,*.min.js,${ROOT_PATH}/utils/check-style/aspell-ignore" \ + --skip "*generated*,*gperf*,*.bin,*.mrk*,*.idx,checksums.txt,*.dat,*.pyc,*.kate-swp,*obfuscateQueries.cpp,d3-*.js,*.min.js,*.sum,${ROOT_PATH}/utils/check-style/aspell-ignore" \ --ignore-words "${ROOT_PATH}/utils/check-style/codespell-ignore-words.list" \ --exclude-file "${ROOT_PATH}/utils/check-style/codespell-ignore-lines.list" \ --quiet-level 2 \ diff --git a/utils/check-style/codespell-ignore-words.list b/utils/check-style/codespell-ignore-words.list index 062e8a1622b..fc021920041 100644 --- a/utils/check-style/codespell-ignore-words.list +++ b/utils/check-style/codespell-ignore-words.list @@ -16,3 +16,8 @@ ot te fo ba +ro +rightt +iiterator +hastable +nam diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index b43ad62bc5b..2f3cf4b0620 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -203,16 +203,11 @@ std::vector> Runner::getConnections() Coordination::ZooKeeper::Node node{Poco::Net::SocketAddress{host_string}, false}; std::vector nodes; nodes.push_back(node); - zookeepers.emplace_back(std::make_shared( - nodes, - "", /*chroot*/ - "", /*identity type*/ - "", /*identity*/ - Poco::Timespan(0, 30000 * 1000), - Poco::Timespan(0, 1000 * 1000), - Poco::Timespan(0, 10000 * 1000), - nullptr)); - + zkutil::ZooKeeperArgs args; + args.session_timeout_ms = 30000; + args.connection_timeout_ms = 1000; + args.operation_timeout_ms = 10000; + zookeepers.emplace_back(std::make_shared(nodes, args, nullptr)); } diff --git a/utils/self-extracting-executable/decompressor.cpp b/utils/self-extracting-executable/decompressor.cpp index 8d8d137a2ac..c997526d38d 100644 --- a/utils/self-extracting-executable/decompressor.cpp +++ b/utils/self-extracting-executable/decompressor.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include #if (defined(OS_DARWIN) || defined(OS_FREEBSD)) && defined(__GNUC__) # include @@ -359,6 +361,35 @@ int decompressFiles(int input_fd, char * path, char * name, bool & have_compress #endif +#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) + +uint32_t getInode(const char * self) +{ + std::ifstream maps("/proc/self/maps"); + if (maps.fail()) + { + perror("open maps"); + return 0; + } + + /// Record example for /proc/self/maps: + /// address perms offset device inode pathname + /// 561a247de000-561a247e0000 r--p 00000000 103:01 1564 /usr/bin/cat + /// see "man 5 proc" + for (std::string line; std::getline(maps, line);) + { + std::stringstream ss(line); // STYLE_CHECK_ALLOW_STD_STRING_STREAM + std::string addr, mode, offset, id, path; + uint32_t inode = 0; + if (ss >> addr >> mode >> offset >> id >> inode >> path && path == self) + return inode; + } + + return 0; +} + +#endif + int main(int/* argc*/, char* argv[]) { char self[4096] = {0}; @@ -382,6 +413,60 @@ int main(int/* argc*/, char* argv[]) else name = file_path; +#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) + /// get inode of this executable + uint32_t inode = getInode(self); + if (inode == 0) + { + std::cerr << "Unable to obtain inode." << std::endl; + return 1; + } + + std::stringstream lock_path; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + lock_path << "/tmp/" << name << ".decompression." 
<< inode << ".lock"; + int lock = open(lock_path.str().c_str(), O_CREAT | O_RDWR, 0666); + if (lock < 0) + { + perror("lock open"); + return 1; + } + + /// lock file should be closed on exec call + fcntl(lock, F_SETFD, FD_CLOEXEC); + + if (lockf(lock, F_LOCK, 0)) + { + perror("lockf"); + return 1; + } + + struct stat input_info; + if (0 != stat(self, &input_info)) + { + perror("stat"); + return 1; + } + + /// if decompression was performed by another process since this copy was started, + /// then the file referred to by path "self" already points to a different inode + if (input_info.st_ino != inode) + { + struct stat lock_info; + if (0 != fstat(lock, &lock_info)) + { + perror("fstat lock"); + return 1; + } + + /// a lock file size of 1 indicates that another decompressor has found an active executable + if (lock_info.st_size == 1) + execv(self, argv); + + printf("No target executable - decompression only was performed.\n"); + return 0; + } +#endif + int input_fd = open(self, O_RDONLY); if (input_fd == -1) { @@ -443,12 +528,21 @@ int main(int/* argc*/, char* argv[]) if (has_exec) { +#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) + /// write one byte to the lock in case other copies of the compressed executable are running, to indicate that + /// execution should be performed + write(lock, "1", 1); +#endif execv(self, argv); /// This part of code will be reached only if error happened perror("execv"); return 1; } +#if !defined(OS_DARWIN) && !defined(OS_FREEBSD) + /// since inodes can be reused - it's a precaution in case the lock file already exists and has size 1 + ftruncate(lock, 0); +#endif printf("No target executable - decompression only was performed.\n"); } diff --git a/utils/zookeeper-cli/zookeeper-cli.cpp b/utils/zookeeper-cli/zookeeper-cli.cpp index 94fec5a8855..bfcdb0a90de 100644 --- a/utils/zookeeper-cli/zookeeper-cli.cpp +++ b/utils/zookeeper-cli/zookeeper-cli.cpp @@ -69,7 +69,7 @@ int main(int argc, char ** argv) Poco::Logger::root().setChannel(channel); Poco::Logger::root().setLevel("trace"); - zkutil::ZooKeeper zk(argv[1]); + zkutil::ZooKeeper zk{zkutil::ZooKeeperArgs(argv[1])}; LineReader lr({}, false, {"\\"}, {}); do