Merge branch 'master' into keeper-prometheus

2024-11-22 15:42:02 +00:00 · 2022-11-28 09:48:57 +01:00 · 2022-11-28 09:48:57 +01:00 · 678958603b
commit 678958603b
parent a6f38cb1cc 143637404e
211 changed files with 5553 additions and 1363 deletions
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@ -145,8 +145,8 @@ jobs:
          fetch-depth: 0 # For a proper version and performance artifacts
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -190,8 +190,8 @@ jobs:
          fetch-depth: 0 # For a proper version and performance artifacts
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -233,8 +233,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -276,8 +276,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -319,8 +319,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -364,8 +364,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -409,8 +409,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@ -209,8 +209,8 @@ jobs:
          fetch-depth: 0 # For a proper version and performance artifacts
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -251,8 +251,8 @@ jobs:
          fetch-depth: 0 # For a proper version and performance artifacts
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -295,8 +295,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -338,8 +338,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -381,8 +381,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -424,8 +424,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -467,8 +467,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -510,8 +510,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -556,8 +556,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -599,8 +599,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -644,8 +644,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -689,8 +689,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -734,8 +734,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -779,8 +779,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -824,8 +824,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -869,8 +869,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -914,8 +914,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@ -105,7 +105,7 @@ jobs:
      - name: Build
        run: |
          git -C "$GITHUB_WORKSPACE" submodule sync
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -272,8 +272,8 @@ jobs:
          fetch-depth: 0  # for performance artifact
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -315,8 +315,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -360,8 +360,8 @@ jobs:
          fetch-depth: 0  # for performance artifact
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -403,8 +403,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -446,8 +446,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -489,8 +489,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -532,8 +532,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -575,8 +575,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -621,8 +621,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -664,8 +664,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -707,8 +707,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -750,8 +750,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -793,8 +793,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -836,8 +836,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -879,8 +879,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -922,8 +922,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -965,8 +965,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@ -136,8 +136,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -178,8 +178,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -220,8 +220,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -263,8 +263,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -306,8 +306,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -349,8 +349,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -392,8 +392,8 @@ jobs:
        uses: actions/checkout@v2
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -437,8 +437,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
@ -482,8 +482,8 @@ jobs:
          fetch-depth: 0 # otherwise we will have no info about contributors
      - name: Build
        run: |
-          git -C "$GITHUB_WORKSPACE" submodule sync --recursive
-          git -C "$GITHUB_WORKSPACE" submodule update --depth=1 --recursive --init --jobs=10
+          git -C "$GITHUB_WORKSPACE" submodule sync
+          git -C "$GITHUB_WORKSPACE" submodule update --single-branch --depth=1 --init --jobs=10
          sudo rm -fr "$TEMP_PATH"
          mkdir -p "$TEMP_PATH"
          cp -r "$GITHUB_WORKSPACE" "$TEMP_PATH"
--- a/.gitmodules
+++ b/.gitmodules
@ -290,3 +290,6 @@
 [submodule "contrib/morton-nd"]
 	path = contrib/morton-nd
 	url = https://github.com/morton-nd/morton-nd
+[submodule "contrib/xxHash"]
+	path = contrib/xxHash
+	url = https://github.com/Cyan4973/xxHash.git
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@ -167,7 +167,9 @@ add_contrib (c-ares-cmake c-ares)
 add_contrib (qpl-cmake qpl)
 add_contrib (morton-nd-cmake morton-nd)

-add_contrib(annoy-cmake annoy)
+add_contrib (annoy-cmake annoy)
+
+add_contrib (xxHash-cmake xxHash)

 # Put all targets defined here and in subdirectories under "contrib/<immediate-subdir>" folders in GUI-based IDEs.
 # Some of third-party projects may override CMAKE_FOLDER or FOLDER property of their targets, so they would not appear
--- a/contrib/xxHash
+++ b/contrib/xxHash
@ -0,0 +1 @@
+Subproject commit 3078dc6039f8c0bffcb1904f81cfe6b2c3209435
--- a/contrib/xxHash-cmake/CMakeLists.txt
+++ b/contrib/xxHash-cmake/CMakeLists.txt
@ -0,0 +1,13 @@
+set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/xxHash")  # xxHash sources, i.e. the contrib/xxHash submodule path
+set (SRCS
+    "${LIBRARY_DIR}/xxhash.c"
+)
+# Library target built from the single source in SRCS. INTERFACE publishes the include directory to targets that link xxHash (not to xxHash's own compilation); SYSTEM marks it as a system include path and BEFORE prepends it.
+add_library(xxHash ${SRCS})
+target_include_directories(xxHash SYSTEM BEFORE INTERFACE "${LIBRARY_DIR}")
+
+# XXH_INLINE_ALL makes all xxHash functions inline, with their implementations included directly from xxhash.h; inlining is beneficial for speed on small keys. PUBLIC applies the definition to this target and to every target that links against it.
+# Build modifiers reference: https://github.com/Cyan4973/xxHash/tree/v0.8.1#build-modifiers
+target_compile_definitions(xxHash PUBLIC XXH_INLINE_ALL)
+# Alias so the same target can be referenced as ch_contrib::xxHash.
+add_library(ch_contrib::xxHash ALIAS xxHash)
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@ -6,29 +6,24 @@ FROM clickhouse/test-util:$FROM_TAG
 # Rust toolchain and libraries
 ENV RUSTUP_HOME=/rust/rustup
 ENV CARGO_HOME=/rust/cargo
-RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
-RUN chmod 777 -R /rust
 ENV PATH="/rust/cargo/env:${PATH}"
 ENV PATH="/rust/cargo/bin:${PATH}"
-RUN rustup target add aarch64-unknown-linux-gnu && \
-        rustup target add x86_64-apple-darwin && \
-        rustup target add x86_64-unknown-freebsd && \
-        rustup target add aarch64-apple-darwin && \
-        rustup target add powerpc64le-unknown-linux-gnu
-RUN apt-get install \
+RUN curl https://sh.rustup.rs -sSf | bash -s -- -y && \
+    chmod 777 -R /rust && \
+    rustup target add aarch64-unknown-linux-gnu && \
+    rustup target add x86_64-apple-darwin && \
+    rustup target add x86_64-unknown-freebsd && \
+    rustup target add aarch64-apple-darwin && \
+    rustup target add powerpc64le-unknown-linux-gnu
+
+RUN apt-get update && \
+    apt-get install --yes \
        gcc-aarch64-linux-gnu \
        build-essential \
        libc6 \
        libc6-dev \
-        libc6-dev-arm64-cross \
-        --yes
-
-# Install CMake 3.20+ for Rust compilation
-# Used https://askubuntu.com/a/1157132 as reference
-RUN apt purge cmake --yes
-RUN wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null
-RUN apt-add-repository 'deb https://apt.kitware.com/ubuntu/ focal main'
-RUN apt update && apt install cmake --yes
+        libc6-dev-arm64-cross && \
+    apt-get clean

 ENV CC=clang-${LLVM_VERSION}
 ENV CXX=clang++-${LLVM_VERSION}
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -137,6 +137,7 @@ function clone_submodules
            contrib/hashidsxx
            contrib/c-ares
            contrib/morton-nd
+            contrib/xxHash
        )

        git submodule sync
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -131,7 +131,14 @@ function stop()
    # Preserve the pid, since the server can hung after the PID will be deleted.
    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"

-    clickhouse stop --do-not-kill && return
+    # --max-tries is supported only since 22.12
+    if dpkg --compare-versions "$(clickhouse local -q 'select version()')" ge "22.12"; then
+        # Increase default waiting timeout for sanitizers and debug builds
+        clickhouse stop --max-tries 180 --do-not-kill && return
+    else
+        clickhouse stop --do-not-kill && return
+    fi
+
    # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
    kill -TERM "$(pidof gdb)" ||:
    sleep 5
@ -388,6 +395,8 @@ else
    rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||:
    rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||:

+    # Turn on after 22.12
+    rm -f /etc/clickhouse-server/config.d/compressed_marks_and_index.xml ||:
    # it uses recently introduced settings which previous versions may not have
    rm -f /etc/clickhouse-server/users.d/insert_keeper_retries.xml ||:

@ -456,7 +465,7 @@ else
    zgrep -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
               -e "Code: 236. DB::Exception: Cancelled mutating parts" \
               -e "REPLICA_IS_ALREADY_ACTIVE" \
-               -e "REPLICA_IS_ALREADY_EXIST" \
+               -e "REPLICA_ALREADY_EXISTS" \
               -e "ALL_REPLICAS_LOST" \
               -e "DDLWorker: Cannot parse DDL task query" \
               -e "RaftInstance: failed to accept a rpc connection due to error 125" \
--- a/docker/test/util/Dockerfile
+++ b/docker/test/util/Dockerfile
@ -13,6 +13,7 @@ RUN apt-get update \
        apt-transport-https \
        apt-utils \
        ca-certificates \
+        curl \
        dnsutils \
        gnupg \
        iputils-ping \
@ -24,10 +25,16 @@ RUN apt-get update \
    && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \
    && apt-key add /tmp/llvm-snapshot.gpg.key \
    && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \
-    && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
+    && echo "deb https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \
        /etc/apt/sources.list \
    && apt-get clean

+# Install cmake 3.20+ for rust support
+# Used https://askubuntu.com/a/1157132 as reference
+RUN curl -s https://apt.kitware.com/keys/kitware-archive-latest.asc | \
+        gpg --dearmor - > /etc/apt/trusted.gpg.d/kitware.gpg && \
+    echo "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" >> /etc/apt/sources.list
+
 # initial packages
 RUN apt-get update \
    && apt-get install \
@ -37,7 +44,6 @@ RUN apt-get update \
        clang-${LLVM_VERSION} \
        clang-tidy-${LLVM_VERSION} \
        cmake \
-        curl \
        fakeroot \
        gdb \
        git \
--- a/docs/en/engines/table-engines/mergetree-family/replication.md
+++ b/docs/en/engines/table-engines/mergetree-family/replication.md
@ -85,7 +85,7 @@ Example of setting the addresses of the auxiliary ZooKeeper cluster:
 </auxiliary_zookeepers>
 ```

-To store table datameta in a auxiliary ZooKeeper cluster instead of default ZooKeeper cluster, we can use the SQL to create table with
+To store table metadata in an auxiliary ZooKeeper cluster instead of default ZooKeeper cluster, we can use the SQL to create table with
 ReplicatedMergeTree engine as follow:

 ```
--- a/docs/en/sql-reference/functions/arithmetic-functions.md
+++ b/docs/en/sql-reference/functions/arithmetic-functions.md
@ -161,3 +161,140 @@ Result:
 │          -1 │
 └─────────────┘
 ```
+
+## multiplyDecimal(a, b[, result_scale])
+
+Performs multiplication on two decimals. Result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+The result scale can be explicitly specified by the `result_scale` argument (a constant integer in the range `[0, 76]`). If not specified, the result scale is the maximum scale of the given arguments.
+
+:::note    
+This function works significantly slower than the usual `multiply`.
+If you don't really need controlled precision and/or need fast computation, consider using [multiply](#multiply).
+:::
+
+**Syntax**
+
+```sql
+multiplyDecimal(a, b[, result_scale])
+```
+
+**Arguments**
+
+-   `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-   The result of multiplication with given scale.
+
+Type: [Decimal256](../../sql-reference/data-types/decimal.md).
+
+**Example**
+
+```text
+┌─multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1)─┐
+│                                                           25.2 │
+└────────────────────────────────────────────────────────────────┘
+```
+
+**Difference from regular multiplication:**
+```sql
+SELECT toDecimal64(-12.647, 3) * toDecimal32(2.1239, 4);
+SELECT toDecimal64(-12.647, 3) as a, toDecimal32(2.1239, 4) as b, multiplyDecimal(a, b);
+```
+
+```text
+┌─multiply(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│                                               -26.8609633 │
+└───────────────────────────────────────────────────────────┘
+┌─multiplyDecimal(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│                                                         -26.8609 │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+```sql
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    multiplyDecimal(a, b);
+
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    a * b;
+```
+
+```text
+┌─────────────a─┬─────────────b─┬─multiplyDecimal(toDecimal64(-12.647987876, 9), toDecimal64(123.967645643, 9))─┐
+│ -12.647987876 │ 123.967645643 │                                                               -1567.941279108 │
+└───────────────┴───────────────┴───────────────────────────────────────────────────────────────────────────────┘
+
+Received exception from server (version 22.11.1):
+Code: 407. DB::Exception: Received from localhost:9000. DB::Exception: Decimal math overflow: While processing toDecimal64(-12.647987876, 9) AS a, toDecimal64(123.967645643, 9) AS b, a * b. (DECIMAL_OVERFLOW)
+```
+
+## divideDecimal(a, b[, result_scale])
+
+Performs division on two decimals. Result value will be of type [Decimal256](../../sql-reference/data-types/decimal.md).
+The result scale can be explicitly specified by the `result_scale` argument (a constant integer in the range `[0, 76]`). If not specified, the result scale is the maximum scale of the given arguments.
+
+:::note    
+This function works significantly slower than the usual `divide`.
+In case you don't really need controlled precision and/or need fast computation, consider using [divide](#divide).
+:::
+
+**Syntax**
+
+```sql
+divideDecimal(a, b[, result_scale])
+```
+
+**Arguments**
+
+-   `a` — First value: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `b` — Second value: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `result_scale` — Scale of result: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+
+**Returned value**
+
+-   The result of division with given scale.
+
+Type: [Decimal256](../../sql-reference/data-types/decimal.md).
+
+**Example**
+
+```text
+┌─divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10)─┐
+│                                                -5.7142857142 │
+└──────────────────────────────────────────────────────────────┘
+```
+
+**Difference from regular division:**
+```sql
+SELECT toDecimal64(-12, 1) / toDecimal32(2.1, 1);
+SELECT toDecimal64(-12, 1) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5);
+```
+
+```text
+┌─divide(toDecimal64(-12, 1), toDecimal32(2.1, 1))─┐
+│                                             -5.7 │
+└──────────────────────────────────────────────────┘
+
+┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 5)─┐
+│ -12 │ 2.1 │                                                       -5.7 │                                                   -5.71428 │
+└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
+```
+
+```sql
+SELECT toDecimal64(-12, 0) / toDecimal32(2.1, 1);
+SELECT toDecimal64(-12, 0) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5);
+```
+
+```text
+DB::Exception: Decimal result's scale is less than argument's one: While processing toDecimal64(-12, 0) / toDecimal32(2.1, 1). (ARGUMENT_OUT_OF_BOUND)
+
+┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 5)─┐
+│ -12 │ 2.1 │                                                       -5.7 │                                                   -5.71428 │
+└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
+```
--- a/docs/en/sql-reference/statements/explain.md
+++ b/docs/en/sql-reference/statements/explain.md
@ -10,7 +10,7 @@ Shows the execution plan of a statement.
 Syntax:

 ```sql
-EXPLAIN [AST | SYNTAX | PLAN | PIPELINE | ESTIMATE | TABLE OVERRIDE] [setting = value, ...]
+EXPLAIN [AST | SYNTAX | QUERY TREE | PLAN | PIPELINE | ESTIMATE | TABLE OVERRIDE] [setting = value, ...]
    [
      SELECT ... |
      tableFunction(...) [COLUMNS (...)] [ORDER BY ...] [PARTITION BY ...] [PRIMARY KEY] [SAMPLE BY ...] [TTL ...]
--- a/docs/ru/sql-reference/functions/arithmetic-functions.md
+++ b/docs/ru/sql-reference/functions/arithmetic-functions.md
@ -159,3 +159,150 @@ SELECT min2(-1, 2);
 └─────────────┘
 ```

+## multiplyDecimal(a, b[, result_scale])
+
+Совершает умножение двух Decimal. Результат будет иметь тип [Decimal256](../../sql-reference/data-types/decimal.md).
+Scale (размер дробной части) результата можно явно задать аргументом `result_scale` (целочисленная константа из интервала `[0, 76]`).
+Если этот аргумент не задан, то scale результата будет равен наибольшему из scale обоих аргументов.
+
+**Синтаксис**
+
+```sql
+multiplyDecimal(a, b[, result_scale])
+```
+
+:::note    
+Эта функция работает гораздо медленнее обычной `multiply`.
+В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [multiply](#multiply).
+:::
+
+**Аргументы**
+
+-   `a` — Первый сомножитель: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `b` — Второй сомножитель: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `result_scale` — Scale результата: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемое значение**
+
+-   Результат умножения с заданным scale.
+
+Тип: [Decimal256](../../sql-reference/data-types/decimal.md).
+
+**Примеры**
+
+```sql
+SELECT multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1);
+```
+
+```text
+┌─multiplyDecimal(toDecimal256(-12, 0), toDecimal32(-2.1, 1), 1)─┐
+│                                                           25.2 │
+└────────────────────────────────────────────────────────────────┘
+```
+
+**Отличие от стандартных функций**
+```sql
+SELECT toDecimal64(-12.647, 3) * toDecimal32(2.1239, 4);
+SELECT toDecimal64(-12.647, 3) as a, toDecimal32(2.1239, 4) as b, multiplyDecimal(a, b);
+```
+
+```text
+┌─multiply(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│                                               -26.8609633 │
+└───────────────────────────────────────────────────────────┘
+┌─multiplyDecimal(toDecimal64(-12.647, 3), toDecimal32(2.1239, 4))─┐
+│                                                         -26.8609 │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+```sql
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    multiplyDecimal(a, b);
+
+SELECT
+    toDecimal64(-12.647987876, 9) AS a,
+    toDecimal64(123.967645643, 9) AS b,
+    a * b;
+```
+
+```text
+┌─────────────a─┬─────────────b─┬─multiplyDecimal(toDecimal64(-12.647987876, 9), toDecimal64(123.967645643, 9))─┐
+│ -12.647987876 │ 123.967645643 │                                                               -1567.941279108 │
+└───────────────┴───────────────┴───────────────────────────────────────────────────────────────────────────────┘
+
+Received exception from server (version 22.11.1):
+Code: 407. DB::Exception: Received from localhost:9000. DB::Exception: Decimal math overflow: While processing toDecimal64(-12.647987876, 9) AS a, toDecimal64(123.967645643, 9) AS b, a * b. (DECIMAL_OVERFLOW)
+```
+
+## divideDecimal(a, b[, result_scale])
+
+Совершает деление двух Decimal. Результат будет иметь тип [Decimal256](../../sql-reference/data-types/decimal.md).
+Scale (размер дробной части) результата можно явно задать аргументом `result_scale` (целочисленная константа из интервала `[0, 76]`).
+Если этот аргумент не задан, то scale результата будет равен наибольшему из scale обоих аргументов.
+
+**Синтаксис**
+
+```sql
+divideDecimal(a, b[, result_scale])
+```
+
+:::note    
+Эта функция работает гораздо медленнее обычной `divide`.
+В случае, если нет необходимости иметь фиксированную точность и/или нужны быстрые вычисления, следует использовать [divide](#divide).
+:::
+
+**Аргументы**
+
+-   `a` — Делимое: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `b` — Делитель: [Decimal](../../sql-reference/data-types/decimal.md).
+-   `result_scale` — Scale результата: [Int/UInt](../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемое значение**
+
+-   Результат деления с заданным scale.
+
+Тип: [Decimal256](../../sql-reference/data-types/decimal.md).
+
+**Примеры**
+
+```sql
+SELECT divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10);
+```
+
+```text
+┌─divideDecimal(toDecimal256(-12, 0), toDecimal32(2.1, 1), 10)─┐
+│                                                -5.7142857142 │
+└──────────────────────────────────────────────────────────────┘
+```
+
+**Отличие от стандартных функций**
+```sql
+SELECT toDecimal64(-12, 1) / toDecimal32(2.1, 1);
+SELECT toDecimal64(-12, 1) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5);
+```
+
+```text
+┌─divide(toDecimal64(-12, 1), toDecimal32(2.1, 1))─┐
+│                                             -5.7 │
+└──────────────────────────────────────────────────┘
+
+┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 1), toDecimal32(2.1, 1), 5)─┐
+│ -12 │ 2.1 │                                                       -5.7 │                                                   -5.71428 │
+└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
+```
+
+```sql
+SELECT toDecimal64(-12, 0) / toDecimal32(2.1, 1);
+SELECT toDecimal64(-12, 0) as a, toDecimal32(2.1, 1) as b, divideDecimal(a, b, 1), divideDecimal(a, b, 5);
+```
+
+```text
+DB::Exception: Decimal result's scale is less than argument's one: While processing toDecimal64(-12, 0) / toDecimal32(2.1, 1). (ARGUMENT_OUT_OF_BOUND)
+
+┌───a─┬───b─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 1)─┬─divideDecimal(toDecimal64(-12, 0), toDecimal32(2.1, 1), 5)─┐
+│ -12 │ 2.1 │                                                       -5.7 │                                                   -5.71428 │
+└─────┴─────┴────────────────────────────────────────────────────────────┴────────────────────────────────────────────────────────────┘
+```
+
--- a/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md
+++ b/docs/zh/guides/improving-query-performance/sparse-primary-indexes.md
@ -777,7 +777,7 @@ ClickHouse现在创建了一个额外的索引来存储—每组4个连续的颗
 如果我们想显著加快我们的两个示例查询——一个过滤具有特定UserID的行，一个过滤具有特定URL的行——那么我们需要使用多个主索引，通过使用这三个方法中的一个：

 - 新建一个不同主键的新表。
- 创建一个雾化视图。
+- 创建一个物化视图。
 - 增加projection。

 这三个方法都会有效地将示例数据复制到另一个表中，以便重新组织表的主索引和行排序顺序。
@ -992,7 +992,7 @@ Ok.

 :::note
 - 我们在视图的主键中切换键列的顺序(与原始表相比)
- 雾化视图由一个隐藏表支持，该表的行顺序和主索引基于给定的主键定义
+- 物化视图由一个隐藏表支持，该表的行顺序和主索引基于给定的主键定义
 - 我们使用POPULATE关键字，以便用源表hits_UserID_URL中的所有887万行立即导入新的物化视图 
 - 如果在源表hits_UserID_URL中插入了新行，那么这些行也会自动插入到隐藏表中
 - 实际上，隐式创建的隐藏表的行顺序和主索引与我们上面显式创建的辅助表相同:
@ -1082,7 +1082,7 @@ ALTER TABLE hits_UserID_URL
    );
 ```

-雾化projection：
+物化projection：
 ```sql
 ALTER TABLE hits_UserID_URL
    MATERIALIZE PROJECTION prj_url_userid;
--- a/programs/copier/ClusterCopier.cpp
+++ b/programs/copier/ClusterCopier.cpp
@ -1142,7 +1142,7 @@ TaskStatus ClusterCopier::tryCreateDestinationTable(const ConnectionTimeouts & t
        InterpreterCreateQuery::prepareOnClusterQuery(create, getContext(), task_table.cluster_push_name);
        String query = queryToString(create_query_push_ast);

-        LOG_INFO(log, "Create destination tables. Query: \n {}", query);
+        LOG_INFO(log, "Create destination tables. Query: {}", query);
        UInt64 shards = executeQueryOnCluster(task_table.cluster_push, query, task_cluster->settings_push, ClusterExecutionMode::ON_EACH_NODE);
        LOG_INFO(
            log,
@ -1413,7 +1413,7 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl(
        auto create_query_push_ast = rewriteCreateQueryStorage(create_query_ast, database_and_table_for_current_piece, new_engine_push_ast);
        String query = queryToString(create_query_push_ast);

-        LOG_INFO(log, "Create destination tables. Query: \n {}", query);
+        LOG_INFO(log, "Create destination tables. Query: {}", query);
        UInt64 shards = executeQueryOnCluster(task_table.cluster_push, query, task_cluster->settings_push, ClusterExecutionMode::ON_EACH_NODE);
        LOG_INFO(
            log,
@ -1517,7 +1517,7 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl(
        // Select all fields
        ASTPtr query_select_ast = get_select_query(task_shard.table_read_shard, "*", /*enable_splitting*/ true, inject_fault ? "1" : "");

-        LOG_INFO(log, "Executing SELECT query and pull from {} : {}", task_shard.getDescription(), queryToString(query_select_ast));
+        LOG_INFO(log, "Executing SELECT query and pull from {}: {}", task_shard.getDescription(), queryToString(query_select_ast));

        ASTPtr query_insert_ast;
        {
@ -1871,7 +1871,7 @@ std::set<String> ClusterCopier::getShardPartitions(const ConnectionTimeouts & ti
    const auto & settings = getContext()->getSettingsRef();
    ASTPtr query_ast = parseQuery(parser_query, query, settings.max_query_size, settings.max_parser_depth);

-    LOG_INFO(log, "Computing destination partition set, executing query: \n {}", query);
+    LOG_INFO(log, "Computing destination partition set, executing query: {}", query);

    auto local_context = Context::createCopy(context);
    local_context->setSettings(task_cluster->settings_pull);
@ -1922,7 +1922,7 @@ bool ClusterCopier::checkShardHasPartition(const ConnectionTimeouts & timeouts,
    const auto & settings = getContext()->getSettingsRef();
    ASTPtr query_ast = parseQuery(parser_query, query, settings.max_query_size, settings.max_parser_depth);

-    LOG_INFO(log, "Checking shard {} for partition {} existence, executing query: \n {}",
+    LOG_INFO(log, "Checking shard {} for partition {} existence, executing query: {}",
        task_shard.getDescription(), partition_quoted_name, query_ast->formatForErrorMessage());

    auto local_context = Context::createCopy(context);
@ -1964,7 +1964,7 @@ bool ClusterCopier::checkPresentPartitionPiecesOnCurrentShard(const ConnectionTi

    query += " LIMIT 1";

-    LOG_INFO(log, "Checking shard {} for partition {} piece {} existence, executing query: \n \u001b[36m {}", task_shard.getDescription(), partition_quoted_name, std::to_string(current_piece_number), query);
+    LOG_INFO(log, "Checking shard {} for partition {} piece {} existence, executing query: {}", task_shard.getDescription(), partition_quoted_name, std::to_string(current_piece_number), query);

    ParserQuery parser_query(query.data() + query.size());
    const auto & settings = getContext()->getSettingsRef();
@ -2046,7 +2046,7 @@ UInt64 ClusterCopier::executeQueryOnCluster(
            }
            catch (...)
            {
-                LOG_WARNING(log, "An error occurred while processing query : \n {}", query);
+                LOG_WARNING(log, "An error occurred while processing query: {}", query);
                tryLogCurrentException(log);
                continue;
            }
--- a/programs/install/Install.cpp
+++ b/programs/install/Install.cpp
@ -888,7 +888,7 @@ int mainEntryClickHouseInstall(int argc, char ** argv)

 namespace
 {
-    int start(const std::string & user, const fs::path & executable, const fs::path & config, const fs::path & pid_file)
+    int start(const std::string & user, const fs::path & executable, const fs::path & config, const fs::path & pid_file, unsigned max_tries)
    {
        if (fs::exists(pid_file))
        {
@ -939,8 +939,7 @@ namespace
        /// Wait to start.

        size_t try_num = 0;
-        constexpr size_t num_tries = 60;
-        for (; try_num < num_tries; ++try_num)
+        for (; try_num < max_tries; ++try_num)
        {
            fmt::print("Waiting for server to start\n");
            if (fs::exists(pid_file))
@ -951,7 +950,7 @@ namespace
            sleepForSeconds(1);
        }

-        if (try_num == num_tries)
+        if (try_num == max_tries)
        {
            fmt::print("Cannot start server. You can execute {} without --daemon option to run manually.\n", command);

@ -1052,7 +1051,7 @@ namespace
        return pid;
    }

-    int stop(const fs::path & pid_file, bool force, bool do_not_kill)
+    int stop(const fs::path & pid_file, bool force, bool do_not_kill, unsigned max_tries)
    {
        if (force && do_not_kill)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Specified flags are incompatible");
@ -1071,8 +1070,7 @@ namespace
            throwFromErrno(fmt::format("Cannot send {} signal", signal_name), ErrorCodes::SYSTEM_ERROR);

        size_t try_num = 0;
-        constexpr size_t num_tries = 60;
-        for (; try_num < num_tries; ++try_num)
+        for (; try_num < max_tries; ++try_num)
        {
            fmt::print("Waiting for server to stop\n");
            if (!isRunning(pid_file))
@ -1083,7 +1081,7 @@ namespace
            sleepForSeconds(1);
        }

-        if (try_num == num_tries)
+        if (try_num == max_tries)
        {
            if (do_not_kill)
            {
@ -1136,6 +1134,7 @@ int mainEntryClickHouseStart(int argc, char ** argv)
            ("config-path", po::value<std::string>()->default_value("etc/clickhouse-server"), "directory with configs")
            ("pid-path", po::value<std::string>()->default_value("var/run/clickhouse-server"), "directory for pid file")
            ("user", po::value<std::string>()->default_value(DEFAULT_CLICKHOUSE_SERVER_USER), "clickhouse user")
+            ("max-tries", po::value<unsigned>()->default_value(60), "Max number of tries for waiting the server (with 1 second delay)")
        ;

        po::variables_map options;
@ -1153,8 +1152,9 @@ int mainEntryClickHouseStart(int argc, char ** argv)
        fs::path executable = prefix / options["binary-path"].as<std::string>() / "clickhouse-server";
        fs::path config = prefix / options["config-path"].as<std::string>() / "config.xml";
        fs::path pid_file = prefix / options["pid-path"].as<std::string>() / "clickhouse-server.pid";
+        unsigned max_tries = options["max-tries"].as<unsigned>();

-        return start(user, executable, config, pid_file);
+        return start(user, executable, config, pid_file, max_tries);
    }
    catch (...)
    {
@ -1175,6 +1175,7 @@ int mainEntryClickHouseStop(int argc, char ** argv)
            ("pid-path", po::value<std::string>()->default_value("var/run/clickhouse-server"), "directory for pid file")
            ("force", po::bool_switch(), "Stop with KILL signal instead of TERM")
            ("do-not-kill", po::bool_switch(), "Do not send KILL even if TERM did not help")
+            ("max-tries", po::value<unsigned>()->default_value(60), "Max number of tries for waiting the server to finish after sending TERM (with 1 second delay)")
        ;

        po::variables_map options;
@ -1191,7 +1192,8 @@ int mainEntryClickHouseStop(int argc, char ** argv)

        bool force = options["force"].as<bool>();
        bool do_not_kill = options["do-not-kill"].as<bool>();
-        return stop(pid_file, force, do_not_kill);
+        unsigned max_tries = options["max-tries"].as<unsigned>();
+        return stop(pid_file, force, do_not_kill, max_tries);
    }
    catch (...)
    {
@ -1250,6 +1252,7 @@ int mainEntryClickHouseRestart(int argc, char ** argv)
            ("user", po::value<std::string>()->default_value(DEFAULT_CLICKHOUSE_SERVER_USER), "clickhouse user")
            ("force", po::value<bool>()->default_value(false), "Stop with KILL signal instead of TERM")
            ("do-not-kill", po::bool_switch(), "Do not send KILL even if TERM did not help")
+            ("max-tries", po::value<unsigned>()->default_value(60), "Max number of tries for waiting the server (with 1 second delay)")
        ;

        po::variables_map options;
@ -1270,10 +1273,11 @@ int mainEntryClickHouseRestart(int argc, char ** argv)

        bool force = options["force"].as<bool>();
        bool do_not_kill = options["do-not-kill"].as<bool>();
-        if (int res = stop(pid_file, force, do_not_kill))
-            return res;
+        unsigned max_tries = options["max-tries"].as<unsigned>();

-        return start(user, executable, config, pid_file);
+        if (int res = stop(pid_file, force, do_not_kill, max_tries))
+            return res;
+        return start(user, executable, config, pid_file, max_tries);
    }
    catch (...)
    {
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -1475,8 +1475,7 @@ try
    if (settings.async_insert_threads)
        global_context->setAsynchronousInsertQueue(std::make_shared<AsynchronousInsertQueue>(
            global_context,
-            settings.async_insert_threads,
-            settings.async_insert_cleanup_timeout_ms));
+            settings.async_insert_threads));

    /// Size of cache for marks (index of MergeTree family of tables).
    size_t mark_cache_size = config().getUInt64("mark_cache_size", 5368709120);
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@ -4063,6 +4063,7 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
        in_subquery->getJoinTree() = exists_subquery_argument;
        in_subquery->getLimit() = std::make_shared<ConstantNode>(1UL, constant_data_type);
        in_subquery->resolveProjectionColumns({NameAndTypePair("1", constant_data_type)});
+        in_subquery->setIsSubquery(true);

        function_node_ptr = std::make_shared<FunctionNode>("in");
        function_node_ptr->getArguments().getNodes() = {std::make_shared<ConstantNode>(1UL, constant_data_type), in_subquery};
--- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp
+++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp
@ -61,7 +61,7 @@ public:
            function_node_arguments_nodes[0] = std::move(function_node_arguments_nodes[1]);
            function_node_arguments_nodes.resize(1);

-            resolveAggregateFunctionNode(*function_node, "countIf");
+            resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
            return;
        }

@ -102,15 +102,16 @@ public:
            function_node_arguments_nodes[0] = std::move(nested_if_function_arguments_nodes[0]);
            function_node_arguments_nodes.resize(1);

-            resolveAggregateFunctionNode(*function_node, "countIf");
+            resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
            return;
        }

        /// Rewrite `sum(if(cond, 0, 1))` into `countIf(not(cond))`.
        if (if_true_condition_value == 0 && if_false_condition_value == 1)
        {
-            auto condition_result_type = nested_if_function_arguments_nodes[0]->getResultType();
            DataTypePtr not_function_result_type = std::make_shared<DataTypeUInt8>();
+
+            const auto & condition_result_type = nested_if_function_arguments_nodes[0]->getResultType();
            if (condition_result_type->isNullable())
                not_function_result_type = makeNullable(not_function_result_type);

@ -123,23 +124,21 @@ public:
            function_node_arguments_nodes[0] = std::move(not_function);
            function_node_arguments_nodes.resize(1);

-            resolveAggregateFunctionNode(*function_node, "countIf");
+            resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
            return;
        }
    }

 private:
-    static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const String & aggregate_function_name)
+    static inline void resolveAsCountIfAggregateFunction(FunctionNode & function_node, const DataTypePtr & argument_type)
    {
-        auto function_result_type = function_node.getResultType();
-        auto function_aggregate_function = function_node.getAggregateFunction();
-
        AggregateFunctionProperties properties;
-        auto aggregate_function = AggregateFunctionFactory::instance().get(aggregate_function_name,
-            function_aggregate_function->getArgumentTypes(),
-            function_aggregate_function->getParameters(),
+        auto aggregate_function = AggregateFunctionFactory::instance().get("countIf",
+            {argument_type},
+            function_node.getAggregateFunction()->getParameters(),
            properties);

+        auto function_result_type = function_node.getResultType();
        function_node.resolveAsAggregateFunction(std::move(aggregate_function), std::move(function_result_type));
    }

--- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp
+++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp
@ -2,9 +2,13 @@

 #include <Functions/IFunction.h>

+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/IAggregateFunction.h>
+
 #include <Analyzer/InDepthQueryTreeVisitor.h>
 #include <Analyzer/FunctionNode.h>

+
 namespace DB
 {

@ -30,7 +34,9 @@ public:
        if (!function_node || !function_node->isAggregateFunction() || !isUniqFunction(function_node->getFunctionName()))
            return;

+        bool replaced_argument = false;
        auto & uniq_function_arguments_nodes = function_node->getArguments().getNodes();
+
        for (auto & uniq_function_argument_node : uniq_function_arguments_nodes)
        {
            auto * uniq_function_argument_node_typed = uniq_function_argument_node->as<FunctionNode>();
@ -49,7 +55,28 @@ public:

            /// Replace injective function with its single argument
            uniq_function_argument_node = uniq_function_argument_node_argument_nodes[0];
+            replaced_argument = true;
        }
+
+        if (!replaced_argument)
+            return;
+
+        const auto & function_node_argument_nodes = function_node->getArguments().getNodes();
+
+        DataTypes argument_types;
+        argument_types.reserve(function_node_argument_nodes.size());
+
+        for (const auto & function_node_argument : function_node_argument_nodes)
+            argument_types.emplace_back(function_node_argument->getResultType());
+
+        AggregateFunctionProperties properties;
+        auto aggregate_function = AggregateFunctionFactory::instance().get(function_node->getFunctionName(),
+            argument_types,
+            function_node->getAggregateFunction()->getParameters(),
+            properties);
+
+        auto function_result_type = function_node->getResultType();
+        function_node->resolveAsAggregateFunction(std::move(aggregate_function), std::move(function_result_type));
    }
 };

--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -258,7 +258,7 @@
    M(250, NOT_ENOUGH_BLOCK_NUMBERS) \
    M(251, NO_SUCH_REPLICA) \
    M(252, TOO_MANY_PARTS) \
-    M(253, REPLICA_IS_ALREADY_EXIST) \
+    M(253, REPLICA_ALREADY_EXISTS) \
    M(254, NO_ACTIVE_REPLICAS) \
    M(255, TOO_MANY_RETRIES_TO_FETCH_PARTS) \
    M(256, PARTITION_ALREADY_EXISTS) \
--- a/src/Common/ProgressIndication.cpp
+++ b/src/Common/ProgressIndication.cpp
@ -123,13 +123,16 @@ void ProgressIndication::writeFinalProgress()
    if (progress.read_rows < 1000)
        return;

-    std::cout << "Processed " << formatReadableQuantity(progress.read_rows) << " rows, "
-                << formatReadableSizeWithDecimalSuffix(progress.read_bytes);
+    UInt64 processed_rows = progress.read_rows + progress.written_rows;
+    UInt64 processed_bytes = progress.read_bytes + progress.written_bytes;
+
+    std::cout << "Processed " << formatReadableQuantity(processed_rows) << " rows, "
+                << formatReadableSizeWithDecimalSuffix(processed_bytes);

    UInt64 elapsed_ns = getElapsedNanoseconds();
    if (elapsed_ns)
-        std::cout << " (" << formatReadableQuantity(progress.read_rows * 1000000000.0 / elapsed_ns) << " rows/s., "
-                    << formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.)";
+        std::cout << " (" << formatReadableQuantity(processed_rows * 1000000000.0 / elapsed_ns) << " rows/s., "
+                    << formatReadableSizeWithDecimalSuffix(processed_bytes * 1000000000.0 / elapsed_ns) << "/s.)";
    else
        std::cout << ". ";
 }
@ -164,16 +167,18 @@ void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)

    size_t prefix_size = message.count();

+    UInt64 processed_rows = progress.read_rows + progress.written_rows;
+    UInt64 processed_bytes = progress.read_bytes + progress.written_bytes;
    message << indicator << " Progress: ";
    message
-        << formatReadableQuantity(progress.read_rows) << " rows, "
-        << formatReadableSizeWithDecimalSuffix(progress.read_bytes);
+        << formatReadableQuantity(processed_rows) << " rows, "
+        << formatReadableSizeWithDecimalSuffix(processed_bytes);

    UInt64 elapsed_ns = getElapsedNanoseconds();
    if (elapsed_ns)
        message << " ("
-                << formatReadableQuantity(progress.read_rows * 1000000000.0 / elapsed_ns) << " rows/s., "
-                << formatReadableSizeWithDecimalSuffix(progress.read_bytes * 1000000000.0 / elapsed_ns) << "/s.) ";
+                << formatReadableQuantity(processed_rows * 1000000000.0 / elapsed_ns) << " rows/s., "
+                << formatReadableSizeWithDecimalSuffix(processed_bytes * 1000000000.0 / elapsed_ns) << "/s.) ";
    else
        message << ". ";

--- a/src/Common/formatIPv6.cpp
+++ b/src/Common/formatIPv6.cpp
@ -146,7 +146,8 @@ void formatIPv6(const unsigned char * src, char *& dst, uint8_t zeroed_tail_byte
            uint8_t ipv4_buffer[IPV4_BINARY_LENGTH] = {0};
            memcpy(ipv4_buffer, src + 12, IPV4_BINARY_LENGTH);
            // Due to historical reasons formatIPv4() takes ipv4 in BE format, but inside ipv6 we store it in LE-format.
-            std::reverse(std::begin(ipv4_buffer), std::end(ipv4_buffer));
+            if constexpr (std::endian::native == std::endian::little)
+                std::reverse(std::begin(ipv4_buffer), std::end(ipv4_buffer));

            formatIPv4(ipv4_buffer, dst, std::min(zeroed_tail_bytes_count, static_cast<uint8_t>(IPV4_BINARY_LENGTH)), "0");
            // formatIPv4 has already added a null-terminator for us.
--- a/src/Common/formatIPv6.h
+++ b/src/Common/formatIPv6.h
@ -56,11 +56,8 @@ inline bool parseIPv4(const char * src, unsigned char * dst)
    }
    if (*(src - 1) != '\0')
        return false;
-#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-    reverseMemcpy(dst, &result, sizeof(result));
-#else
+
    memcpy(dst, &result, sizeof(result));
-#endif
    return true;
 }

@ -138,7 +135,9 @@ inline bool parseIPv6(const char * src, unsigned char * dst)
        {
            if (!parseIPv4(curtok, tp))
                return clear_dst();
-            std::reverse(tp, tp + IPV4_BINARY_LENGTH);
+
+            if constexpr (std::endian::native == std::endian::little)
+                std::reverse(tp, tp + IPV4_BINARY_LENGTH);

            tp += IPV4_BINARY_LENGTH;
            saw_xdigit = false;
@ -207,7 +206,11 @@ inline void formatIPv4(const unsigned char * src, char *& dst, uint8_t mask_tail
    const size_t limit = std::min(IPV4_BINARY_LENGTH, IPV4_BINARY_LENGTH - mask_tail_octets);
    for (size_t octet = 0; octet < limit; ++octet)
    {
-        const uint8_t value = static_cast<uint8_t>(src[IPV4_BINARY_LENGTH - octet - 1]);
+        uint8_t value = 0;
+        if constexpr (std::endian::native == std::endian::little)
+            value = static_cast<uint8_t>(src[IPV4_BINARY_LENGTH - octet - 1]);
+        else
+            value = static_cast<uint8_t>(src[octet]);
        const auto * rep = one_byte_to_string_lookup_table[value];
        const uint8_t len = rep[0];
        const char* str = rep + 1;
--- a/src/Common/hex.h
+++ b/src/Common/hex.h
@ -1,6 +1,6 @@
 #pragma once
 #include <string>
-#include <Core/Types.h>
+

 /// Maps 0..15 to 0..9A..F or 0..9a..f correspondingly.

@ -50,32 +50,17 @@ inline void writeBinByte(UInt8 byte, void * out)
 template <typename TUInt>
 inline void writeHexUIntImpl(TUInt uint_, char * out, const char * const table)
 {
-    if constexpr (is_integer<TUInt>)
+    union
    {
-        /// For integer types, use endian indepentant way for conversion
-        TUInt value = uint_;
+        TUInt value;
+        UInt8 uint8[sizeof(TUInt)];
+    };

-        for (size_t i = 0; i < sizeof(TUInt); ++i)
-        {
-            memcpy(out + (sizeof(TUInt) - 1 - i) * 2, &table[static_cast<size_t>(value % 256) * 2], 2);
-            value /= 256;
-        }
-    }
-    else
-    {
-        /// For non-integer types, access memory directly for conversion to keep back-compatibility
-        union
-        {
-            TUInt value;
-            UInt8 uint8[sizeof(TUInt)];
-        };
+    value = uint_;

-        value = uint_;
-
-        /// Use little endian
-        for (size_t i = 0; i < sizeof(TUInt); ++i)
-            memcpy(out + i * 2, &table[static_cast<size_t>(uint8[sizeof(TUInt) - 1 - i]) * 2], 2);
-    }
+    /// Use little endian
+    for (size_t i = 0; i < sizeof(TUInt); ++i)
+        memcpy(out + i * 2, &table[static_cast<size_t>(uint8[sizeof(TUInt) - 1 - i]) * 2], 2);
 }

 template <typename TUInt>
--- a/src/Compression/CachedCompressedReadBuffer.cpp
+++ b/src/Compression/CachedCompressedReadBuffer.cpp
@ -47,8 +47,8 @@ bool CachedCompressedReadBuffer::nextImpl()

        auto cell = std::make_shared<UncompressedCacheCell>();

-        size_t size_decompressed;
-        size_t size_compressed_without_checksum;
+        size_t size_decompressed = 0;
+        size_t size_compressed_without_checksum = 0;
        cell->compressed_size = readCompressedData(size_decompressed, size_compressed_without_checksum, false);

        if (cell->compressed_size)
--- a/src/Coordination/KeeperSnapshotManager.cpp
+++ b/src/Coordination/KeeperSnapshotManager.cpp
@ -194,6 +194,9 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
        // write only the root system path because of digest
        if (Coordination::matchPath(path.toView(), keeper_system_path) == Coordination::PathMatchResult::IS_CHILD)
        {
+            if (counter == snapshot.snapshot_container_size - 1)
+                break;
+
            ++it;
            continue;
        }
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -143,6 +143,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, group_by_two_level_threshold_bytes, 50000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \
    M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \
    M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \
+    M(Bool, enable_memory_bound_merging_of_aggregation_results, false, "Enable memory bound merging strategy for aggregation. Set it to true only if all nodes of your clusters have versions >= 22.12.", 0) \
    M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \
    M(Bool, enable_extended_results_for_datetime_functions, false, "Enable date functions like toLastDayOfMonth return Date32 results (instead of Date results) for Date32/DateTime64 arguments.", 0) \
    \
@ -604,7 +605,6 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Seconds, wait_for_async_insert_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "Timeout for waiting for processing asynchronous insertion", 0) \
    M(UInt64, async_insert_max_data_size, 1000000, "Maximum size in bytes of unparsed data collected per query before being inserted", 0) \
    M(Milliseconds, async_insert_busy_timeout_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared", 0) \
-    M(Milliseconds, async_insert_cleanup_timeout_ms, 1000, "Time to wait before each iteration of cleaning up buffers for INSERT queries which don't appear anymore. Only has meaning at server startup.", 0) \
    \
    M(UInt64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \
    M(UInt64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \
@ -668,6 +668,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \
    M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f]", 0) \
    M(UInt64, insert_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \
+    M(Bool, force_aggregation_in_order, false, "Force use of aggregation in order on remote nodes during distributed aggregation. PLEASE, NEVER CHANGE THIS SETTING VALUE MANUALLY!", IMPORTANT) \
    // End of COMMON_SETTINGS
    // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

@ -705,6 +706,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    MAKE_OBSOLETE(M, DefaultDatabaseEngine, default_database_engine, DefaultDatabaseEngine::Atomic) \
    MAKE_OBSOLETE(M, UInt64, max_pipeline_depth, 0)                                                                                 \
    MAKE_OBSOLETE(M, Seconds, temporary_live_view_timeout, 1) \
+    MAKE_OBSOLETE(M, Milliseconds, async_insert_cleanup_timeout_ms, 1000) \

    /** The section above is for obsolete settings. Do not add anything there. */

--- a/src/Core/SortDescription.cpp
+++ b/src/Core/SortDescription.cpp
@ -3,6 +3,7 @@
 #include <IO/Operators.h>
 #include <Common/JSONBuilder.h>
 #include <Common/SipHash.h>
+#include <Common/typeid_cast.h>

 #if USE_EMBEDDED_COMPILER
 #include <DataTypes/Native.h>
@ -58,6 +59,20 @@ bool SortDescription::hasPrefix(const SortDescription & prefix) const
    return true;
 }

+/// Builds the longest leading run of lhs whose elements coincide position-by-position with rhs.
+SortDescription commonPrefix(const SortDescription & lhs, const SortDescription & rhs)
+{
+    const size_t max_prefix_size = std::min(lhs.size(), rhs.size());
+    size_t prefix_size = 0;
+    while (prefix_size < max_prefix_size && !(lhs[prefix_size] != rhs[prefix_size]))
+        ++prefix_size;
+
+    /// Start from a full copy of lhs and drop everything past the matching prefix.
+    SortDescription common = lhs;
+    common.erase(common.begin() + prefix_size, common.end());
+    return common;
+}
+
 #if USE_EMBEDDED_COMPILER

 static CHJIT & getJITInstance()
--- a/src/Core/SortDescription.h
+++ b/src/Core/SortDescription.h
@ -125,6 +125,9 @@ public:
    bool hasPrefix(const SortDescription & prefix) const;
 };

+/// Returns a copy of lhs containing only the prefix of columns matching rhs's columns.
+SortDescription commonPrefix(const SortDescription & lhs, const SortDescription & rhs);
+
 /** Compile sort description for header_types.
  * Description is compiled only if compilation attempts to compile identical description is more than min_count_to_compile_sort_description.
  */
--- a/src/DataTypes/ObjectUtils.cpp
+++ b/src/DataTypes/ObjectUtils.cpp
@ -981,4 +981,11 @@ Field FieldVisitorFoldDimension::operator()(const Array & x) const
    return res;
 }

+/// Replaces the type of every column that reports dynamic subcolumns
+/// with the type returned by createConcreteEmptyDynamicColumn(). Other columns are left untouched.
+void setAllObjectsToDummyTupleType(NamesAndTypesList & columns)
+{
+    for (auto & name_and_type : columns)
+    {
+        if (!name_and_type.type->hasDynamicSubcolumns())
+            continue;
+
+        name_and_type.type = createConcreteEmptyDynamicColumn(name_and_type.type);
+    }
+}
+
 }
--- a/src/DataTypes/ObjectUtils.h
+++ b/src/DataTypes/ObjectUtils.h
@ -162,6 +162,8 @@ private:
    size_t num_dimensions_to_fold;
 };

+void setAllObjectsToDummyTupleType(NamesAndTypesList & columns);
+
 /// Receives range of objects, which contains collections
 /// of columns-like objects (e.g. ColumnsDescription or NamesAndTypesList)
 /// and deduces the common types of object columns for all entries.
--- a/src/Databases/DatabaseReplicated.cpp
+++ b/src/Databases/DatabaseReplicated.cpp
@ -39,7 +39,7 @@ namespace ErrorCodes
    extern const int NO_ZOOKEEPER;
    extern const int LOGICAL_ERROR;
    extern const int BAD_ARGUMENTS;
-    extern const int REPLICA_IS_ALREADY_EXIST;
+    extern const int REPLICA_ALREADY_EXISTS;
    extern const int DATABASE_REPLICATION_FAILED;
    extern const int UNKNOWN_DATABASE;
    extern const int UNKNOWN_TABLE;
@ -297,7 +297,7 @@ void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(LoadingStrictnessL
            if (is_create_query || replica_host_id != host_id)
            {
                throw Exception(
-                    ErrorCodes::REPLICA_IS_ALREADY_EXIST,
+                    ErrorCodes::REPLICA_ALREADY_EXISTS,
                    "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
                    replica_name, shard_name, zookeeper_path, replica_host_id, host_id);
            }
--- a/src/Functions/CMakeLists.txt
+++ b/src/Functions/CMakeLists.txt
@ -29,9 +29,9 @@ list (APPEND PRIVATE_LIBS
        ch_contrib::zlib
        boost::filesystem
        divide_impl
+        ch_contrib::xxHash
 )

-
 if (TARGET ch_rust::blake3)
    list (APPEND PUBLIC_LIBS
        ch_rust::blake3
@ -66,8 +66,6 @@ if (TARGET ch_contrib::base64)
    list (APPEND PRIVATE_LIBS ch_contrib::base64)
 endif()

-list (APPEND PRIVATE_LIBS ch_contrib::lz4)
-
 if (ENABLE_NLP)
    list (APPEND PRIVATE_LIBS ch_contrib::cld2)
 endif()
--- a/src/Functions/FunctionsCodingIP.cpp
+++ b/src/Functions/FunctionsCodingIP.cpp
@ -232,8 +232,8 @@ public:
 private:
    static bool isIPv4Mapped(const UInt8 * address)
    {
-        return (unalignedLoad<UInt64>(address) == 0) &&
-               ((unalignedLoad<UInt64>(address + 8) & 0x00000000FFFFFFFFull) == 0x00000000FFFF0000ull);
+        return (unalignedLoadLE<UInt64>(address) == 0) &&
+               ((unalignedLoadLE<UInt64>(address + 8) & 0x00000000FFFFFFFFull) == 0x00000000FFFF0000ull);
    }

    static void cutAddress(const unsigned char * address, char *& dst, UInt8 zeroed_tail_bytes_count)
@ -514,7 +514,11 @@ private:
    static void mapIPv4ToIPv6(UInt32 in, UInt8 * buf)
    {
        unalignedStore<UInt64>(buf, 0);
-        unalignedStore<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(ntohl(in)) << 32));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+            unalignedStoreLE<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(ntohl(in)) << 32));
+#else
+            unalignedStoreLE<UInt64>(buf + 8, 0x00000000FFFF0000ull | (static_cast<UInt64>(__builtin_bswap32(ntohl(in))) << 32));
+#endif
    }
 };

--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@ -2297,6 +2297,10 @@ struct ToStringMonotonicity
        if (const auto * low_cardinality_type = checkAndGetDataType<DataTypeLowCardinality>(type_ptr))
            type_ptr = low_cardinality_type->getDictionaryType().get();

+        /// Order on enum values (which is the order on integers) is completely arbitrary with respect to the order on strings.
+        if (WhichDataType(type).isEnum())
+            return not_monotonic;
+
        /// `toString` function is monotonous if the argument is Date or Date32 or DateTime or String, or non-negative numbers with the same number of symbols.
        if (checkDataTypes<DataTypeDate, DataTypeDate32, DataTypeDateTime, DataTypeString>(type_ptr))
            return positive;
--- a/src/Functions/FunctionsDecimalArithmetics.cpp
+++ b/src/Functions/FunctionsDecimalArithmetics.cpp
@ -0,0 +1,17 @@
+#include <Functions/FunctionsDecimalArithmetics.h>
+#include <Functions/FunctionFactory.h>
+
+namespace DB
+{
+/// Registers `divideDecimal` (the name comes from DivideDecimalsImpl::name).
+REGISTER_FUNCTION(DivideDecimals)
+{
+    factory.registerFunction<FunctionsDecimalArithmetics<DivideDecimalsImpl>>(Documentation(
+        "Decimal division with given precision. Slower than simple `divide`, but has controlled precision and no sound overflows"));
+}
+
+/// Registers `multiplyDecimal` (the name comes from MultiplyDecimalsImpl::name).
+/// The documentation compares it with the plain `multiply` function, not with `divide`.
+REGISTER_FUNCTION(MultiplyDecimals)
+{
+    factory.registerFunction<FunctionsDecimalArithmetics<MultiplyDecimalsImpl>>(Documentation(
+        "Decimal multiplication with given precision. Slower than simple `multiply`, but has controlled precision and no sound overflows"));
+}
+}
--- a/src/Functions/FunctionsDecimalArithmetics.h
+++ b/src/Functions/FunctionsDecimalArithmetics.h
@ -0,0 +1,457 @@
+#pragma once
+#include <type_traits>
+#include <Core/AccurateComparison.h>
+
+#include <DataTypes/DataTypesDecimal.h>
+#include <Columns/ColumnsNumber.h>
+#include <Functions/IFunction.h>
+#include <Functions/FunctionHelpers.h>
+#include <Functions/castTypeToEither.h>
+#include <IO/WriteHelpers.h>
+
+#include <Common/logger_useful.h>
+#include <Poco/Logger.h>
+#include <Loggers/Loggers.h>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int DECIMAL_OVERFLOW;
+    extern const int ILLEGAL_COLUMN;
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int ILLEGAL_DIVISION;
+}
+
+
+struct DecimalOpHelpers
+{
+    /* These functions perform the main arithmetic logic.
+     * Intermediate results may not fit into Decimal256 (e.g. 1e36 with scale 10),
+     * so we cannot operate on Decimals directly. Later on such a big number may be shrunk
+     * (e.g. the result scale is 0 in the case above).
+     * That is why intermediate results are kept in a flexible, extendable storage (std::vector)
+     * and the numbers are processed with simple digit-by-digit arithmetic.
+     * This is the reason these functions are slower than the traditional ones.
+     *
+     * Here and below UInt8 is used for storing digits (0-9 range with a maximum carry of 9 definitely fits).
+     * Digit vectors are ordered from the most significant digit to the least significant one
+     * (this is the order produced by toDigits and consumed by fromDigits).
+     */
+
+    /// Schoolbook multiplication of two digit vectors.
+    /// Returns the digits of the product without leading zeros, or {0} if the product is zero
+    /// or one of the inputs is empty.
+    /// Throws DECIMAL_OVERFLOW if a write would go past the preallocated result buffer.
+    static std::vector<UInt8> multiply(const std::vector<UInt8> & num1, const std::vector<UInt8> & num2)
+    {
+        UInt16 const len1 = num1.size();
+        UInt16 const len2 = num2.size();
+        if (len1 == 0 || len2 == 0)
+            return {0};
+
+        /// The product of numbers with len1 and len2 digits has at most len1 + len2 digits.
+        /// While accumulating, result[0] is the least significant digit; the vector is reversed at the end.
+        std::vector<UInt8> result(len1 + len2, 0);
+        UInt16 i_n1 = 0;
+        UInt16 i_n2;
+
+        for (Int32 i = len1 - 1; i >= 0; --i)
+        {
+            UInt16 carry = 0;
+            i_n2 = 0;
+            for (Int32 j = len2 - 1; j >= 0; --j)
+            {
+                if (unlikely(i_n1 + i_n2 >= len1 + len2))
+                    throw DB::Exception("Numeric overflow: result bigger that Decimal256", ErrorCodes::DECIMAL_OVERFLOW);
+                UInt16 sum = num1[i] * num2[j] + result[i_n1 + i_n2] + carry;
+                carry = sum / 10;
+                result[i_n1 + i_n2] = sum % 10;
+                ++i_n2;
+            }
+
+            if (carry > 0)
+            {
+                if (unlikely(i_n1 + i_n2 >= len1 + len2))
+                    throw DB::Exception("Numeric overflow: result bigger that Decimal256", ErrorCodes::DECIMAL_OVERFLOW);
+                result[i_n1 + i_n2] += carry;
+            }
+
+            ++i_n1;
+        }
+
+        // Maximum Int32 value exceeds 2 billion, we can safely use it for array length storing
+        Int32 i = static_cast<Int32>(result.size() - 1);
+
+        /// Zeros at the back of the (still reversed) buffer are the leading zeros of the product.
+        while (i >= 0 && result[i] == 0)
+        {
+            result.pop_back();
+            --i;
+        }
+        if (i == -1)
+            return {0};
+
+        std::reverse(result.begin(), result.end());
+        return result;
+    }
+
+    /// Long division of a digit vector by `divisor`; the remainder is discarded.
+    /// The callers in this file pass a strictly positive divisor (absolute value of a non-zero decimal).
+    /// The returned digits may start with a zero. Returns {0} for an empty `number`.
+    static std::vector<UInt8> divide(const std::vector<UInt8> & number, const Int256 & divisor)
+    {
+        /// Without this guard `number.size() - 1` below would wrap around for an empty input.
+        if (number.empty())
+            return {0};
+
+        std::vector<UInt8> result;
+        const auto max_index = number.size() - 1;
+
+        UInt16 idx = 0;
+        Int256 temp = 0;
+
+        /// Skip quotient digits that are known to be zero: accumulate leading digits until the
+        /// partial dividend reaches the divisor. The last digit is always left for the loop below.
+        while (temp < divisor && max_index > idx)
+        {
+            temp = temp * 10 + number[idx];
+            ++idx;
+        }
+
+        /// `temp` may still be zero here (e.g. for a one-digit number nothing has been consumed yet).
+        /// We must not return early in that case: the remaining digit is processed below,
+        /// otherwise something like 8 / 4 would produce 0.
+        while (max_index >= idx)
+        {
+            result.push_back(temp / divisor);
+            temp = (temp % divisor) * 10 + number[idx];
+            ++idx;
+        }
+        result.push_back(temp / divisor);
+
+        return result;
+    }
+
+    /// Splits `x` into decimal digits, most significant digit first.
+    /// The callers in this file pass a non-negative value (the absolute value of a decimal).
+    static std::vector<UInt8> toDigits(Int256 x)
+    {
+        std::vector<UInt8> result;
+        if (x >= 10)
+            result = toDigits(x / 10);
+
+        result.push_back(x % 10);
+        return result;
+    }
+
+    /// Inverse of toDigits: sums digit * 10^position, where the last element has position 0.
+    /// No range check is done here; the callers verify the number of digits beforehand.
+    static UInt256 fromDigits(const std::vector<UInt8> & digits)
+    {
+        Int256 result = 0;
+        Int256 scale = 0;
+        for (auto i = digits.rbegin(); i != digits.rend(); ++i)
+        {
+            result += DecimalUtils::scaleMultiplier<Decimal256>(scale) * (*i);
+            ++scale;
+        }
+        return result;
+    }
+};
+
+
+/// Implementation of `divideDecimal`: digit-wise division of two decimals producing a Decimal256
+/// with `result_scale` fractional digits. Digits beyond the requested scale are dropped, not rounded.
+struct DivideDecimalsImpl
+{
+    static constexpr auto name = "divideDecimal";
+
+    /// a, b - decimal values with scales scale_a and scale_b.
+    /// Throws ILLEGAL_DIVISION when b is zero and DECIMAL_OVERFLOW when the quotient has more digits
+    /// than Decimal256 can hold.
+    template <typename FirstType, typename SecondType>
+    static inline Decimal256
+    execute(FirstType a, SecondType b, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale)
+    {
+        if (b.value == 0)
+            throw DB::Exception("Division by zero", ErrorCodes::ILLEGAL_DIVISION);
+        if (a.value == 0)
+            return Decimal256(0);
+
+        /// Work with absolute values; the sign is restored at the very end.
+        const Int256 sign_a = a.value < 0 ? -1 : 1;
+        const Int256 sign_b = b.value < 0 ? -1 : 1;
+
+        std::vector<UInt8> dividend_digits = DecimalOpHelpers::toDigits(a.value * sign_a);
+
+        /// The quotient has scale (scale_a - scale_b), so the dividend must carry
+        /// scale_b + result_scale fractional digits: pad with zeros or cut the tail accordingly.
+        const Int32 target_scale = scale_b + result_scale;
+        if (scale_a < target_scale)
+        {
+            const size_t missing_digits = static_cast<size_t>(target_scale - scale_a);
+            dividend_digits.resize(dividend_digits.size() + missing_digits, 0);
+        }
+        else if (scale_a > target_scale)
+        {
+            const size_t extra_digits = static_cast<size_t>(scale_a - target_scale);
+            dividend_digits.resize(dividend_digits.size() - std::min(extra_digits, dividend_digits.size()));
+        }
+
+        /// Every digit has been cut off: the result is zero at the requested scale.
+        if (dividend_digits.empty())
+            return Decimal256(0);
+
+        std::vector<UInt8> quotient_digits = DecimalOpHelpers::divide(dividend_digits, b.value * sign_b);
+
+        if (quotient_digits.size() > DecimalUtils::max_precision<Decimal256>)
+            throw DB::Exception("Numeric overflow: result bigger that Decimal256", ErrorCodes::DECIMAL_OVERFLOW);
+        return Decimal256(sign_a * sign_b * DecimalOpHelpers::fromDigits(quotient_digits));
+    }
+};
+
+
+/// Implementation of `multiplyDecimal`: digit-wise multiplication of two decimals producing a Decimal256
+/// with `result_scale` fractional digits. Digits beyond the requested scale are dropped, not rounded.
+struct MultiplyDecimalsImpl
+{
+    static constexpr auto name = "multiplyDecimal";
+
+    /// a, b - decimal values with scales scale_a and scale_b.
+    /// Throws DECIMAL_OVERFLOW when the product has more digits than Decimal256 can hold.
+    template <typename FirstType, typename SecondType>
+    static inline Decimal256
+    execute(FirstType a, SecondType b, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale)
+    {
+        if (a.value == 0 || b.value == 0)
+            return Decimal256(0);
+
+        /// Work with absolute values; the sign is restored at the very end.
+        const Int256 sign_a = a.value < 0 ? -1 : 1;
+        const Int256 sign_b = b.value < 0 ? -1 : 1;
+
+        std::vector<UInt8> product_digits = DecimalOpHelpers::multiply(
+            DecimalOpHelpers::toDigits(a.value * sign_a),
+            DecimalOpHelpers::toDigits(b.value * sign_b));
+
+        /// The raw product has scale_a + scale_b fractional digits:
+        /// pad with zeros or cut the tail to end up with exactly result_scale of them.
+        const UInt16 product_scale = scale_a + scale_b;
+        if (product_scale < result_scale)
+        {
+            const size_t missing_digits = static_cast<size_t>(result_scale - product_scale);
+            product_digits.resize(product_digits.size() + missing_digits, 0);
+        }
+        else if (product_scale > result_scale)
+        {
+            const size_t extra_digits = static_cast<size_t>(product_scale - result_scale);
+            product_digits.resize(product_digits.size() - std::min(extra_digits, product_digits.size()));
+        }
+
+        /// Every digit has been cut off: the result is zero at the requested scale.
+        if (product_digits.empty())
+            return Decimal256(0);
+
+        if (product_digits.size() > DecimalUtils::max_precision<Decimal256>)
+            throw DB::Exception("Numeric overflow: result bigger that Decimal256", ErrorCodes::DECIMAL_OVERFLOW);
+
+        return Decimal256(sign_a * sign_b * DecimalOpHelpers::fromDigits(product_digits));
+    }
+};
+
+
+/// Applies Transform::execute row by row for the three supported argument layouts
+/// (column/constant, column/column, constant/column) and writes the results into `vec_to`.
+template <typename ResultType, typename Transform>
+struct Processor
+{
+    const Transform transform;
+
+    explicit Processor(Transform transform_)
+        : transform(std::move(transform_))
+    {}
+
+    /// First argument is a full column, second one is a single value shared by all rows.
+    template <typename FirstArgVectorType, typename SecondArgType>
+    void NO_INLINE
+    vectorConstant(const FirstArgVectorType & vec_first, const SecondArgType second_value,
+                   PaddedPODArray<typename ResultType::FieldType> & vec_to, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale) const
+    {
+        const size_t rows = vec_first.size();
+        vec_to.resize(rows);
+
+        for (size_t row = 0; row != rows; ++row)
+        {
+            vec_to[row] = transform.execute(vec_first[row], second_value, scale_a, scale_b, result_scale);
+        }
+    }
+
+    /// Both arguments are full columns; the number of rows is taken from the first one.
+    template <typename FirstArgVectorType, typename SecondArgVectorType>
+    void NO_INLINE
+    vectorVector(const FirstArgVectorType & vec_first, const SecondArgVectorType & vec_second,
+                 PaddedPODArray<typename ResultType::FieldType> & vec_to, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale) const
+    {
+        const size_t rows = vec_first.size();
+        vec_to.resize(rows);
+
+        for (size_t row = 0; row != rows; ++row)
+        {
+            vec_to[row] = transform.execute(vec_first[row], vec_second[row], scale_a, scale_b, result_scale);
+        }
+    }
+
+    /// First argument is a single value shared by all rows, second one is a full column.
+    template <typename FirstArgType, typename SecondArgVectorType>
+    void NO_INLINE
+    constantVector(const FirstArgType & first_value, const SecondArgVectorType & vec_second,
+                   PaddedPODArray<typename ResultType::FieldType> & vec_to, UInt16 scale_a, UInt16 scale_b, UInt16 result_scale) const
+    {
+        const size_t rows = vec_second.size();
+        vec_to.resize(rows);
+
+        for (size_t row = 0; row != rows; ++row)
+        {
+            vec_to[row] = transform.execute(first_value, vec_second[row], scale_a, scale_b, result_scale);
+        }
+    }
+};
+
+
+/// Dispatches a decimal arithmetic Transform over concrete argument column types.
+/// FirstArgType / SecondArgType / ResultType are data types; their ColumnType and FieldType
+/// members select the column classes and value types used below.
+template <typename FirstArgType, typename SecondArgType, typename ResultType, typename Transform>
+struct DecimalArithmeticsImpl
+{
+    /// Computes the result column for `arguments[0]` and `arguments[1]`.
+    /// Supported layouts: column/constant, column/column and constant/column.
+    /// Throws ILLEGAL_COLUMN if an argument column is neither a constant nor a column of the expected type.
+    static ColumnPtr execute(Transform transform, const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type)
+    {
+        using FirstArgValueType = typename FirstArgType::FieldType;
+        using FirstArgColumnType = typename FirstArgType::ColumnType;
+        using SecondArgValueType = typename SecondArgType::FieldType;
+        using SecondArgColumnType = typename SecondArgType::ColumnType;
+        using ResultColumnType = typename ResultType::ColumnType;
+
+        UInt16 scale_a = getDecimalScale(*arguments[0].type);
+        UInt16 scale_b = getDecimalScale(*arguments[1].type);
+        UInt16 result_scale = getDecimalScale(*result_type->getPtr());
+
+        auto op = Processor<ResultType, Transform>{std::move(transform)};
+
+        auto result_col = result_type->createColumn();
+        auto * col_to = assert_cast<ResultColumnType *>(result_col.get());
+
+        /// Each pointer below is nullptr when the corresponding column is of a different kind.
+        const auto * first_col = checkAndGetColumn<FirstArgColumnType>(arguments[0].column.get());
+        const auto * second_col = checkAndGetColumn<SecondArgColumnType>(arguments[1].column.get());
+        const auto * first_col_const = typeid_cast<const ColumnConst *>(arguments[0].column.get());
+        const auto * second_col_const = typeid_cast<const ColumnConst *>(arguments[1].column.get());
+
+        if (first_col)
+        {
+            if (second_col_const)
+                op.vectorConstant(first_col->getData(), second_col_const->template getValue<SecondArgValueType>(), col_to->getData(), scale_a, scale_b, result_scale);
+            else if (second_col)
+                op.vectorVector(first_col->getData(), second_col->getData(), col_to->getData(), scale_a, scale_b, result_scale);
+            else
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function {}",
+                                arguments[1].column->getName(), Transform::name);
+        }
+        else if (first_col_const)
+        {
+            /// The second argument must be a full column here; never dereference a failed cast.
+            if (!second_col)
+                throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of second argument of function {}",
+                                arguments[1].column->getName(), Transform::name);
+
+            op.constantVector(first_col_const->template getValue<FirstArgValueType>(), second_col->getData(), col_to->getData(), scale_a, scale_b, result_scale);
+        }
+        else
+        {
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}",
+                            arguments[0].column->getName(), Transform::name);
+        }
+
+        return result_col;
+    }
+};
+
+
+template <typename Transform>
+class FunctionsDecimalArithmetics : public IFunction
+{
+public:
+    static constexpr auto name = Transform::name;
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionsDecimalArithmetics>(); }
+
+    String getName() const override
+    {
+        return name;
+    }
+
+    bool isVariadic() const override { return true; }
+    size_t getNumberOfArguments() const override { return 0; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
+
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        if (arguments.size() != 2 && arguments.size() != 3)
+            throw Exception("Number of arguments for function " + getName() + " does not match: 2 or 3 expected",
+                            ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+
+        if (!isDecimal(arguments[0].type) || !isDecimal(arguments[1].type))
+            throw Exception("Arguments for " + getName() + " function must be Decimal", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        UInt8 scale = std::max(getDecimalScale(*arguments[0].type->getPtr()), getDecimalScale(*arguments[1].type->getPtr()));
+
+        if (arguments.size() == 3)
+        {
+            WhichDataType which_scale(arguments[2].type.get());
+
+            if (!which_scale.isUInt8())
+                throw Exception(
+                    "Illegal type " + arguments[2].type->getName() + " of third argument of function " + getName()
+                        + ". Should be constant UInt8 from range[0, 76]",
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+            const ColumnConst * scale_column = checkAndGetColumnConst<ColumnUInt8>(arguments[2].column.get());
+
+            if (!scale_column)
+                throw Exception(
+                    "Illegal column of third argument of function " + getName() + ". Should be constant UInt8",
+                        ErrorCodes::ILLEGAL_COLUMN);
+
+            scale = scale_column->getValue<UInt8>();
+        }
+
+        /**
+        At compile time, result is unknown. We only know the Scale (number of fractional digits) at runtime.
+        Also nothing is known about size of whole part.
+        As in simple division/multiplication for decimals, we scale the result up, but it is explicit here and no downscale is performed.
+        It guarantees that result will have given scale and it can also be MANUALLY converted to other decimal types later.
+        **/
+        if (scale > DecimalUtils::max_precision<Decimal256>)
+            throw Exception("Illegal value of third argument of function " + this->getName() + ": must be integer in range [0, 76]",
+                            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+
+        return std::make_shared<DataTypeDecimal256>(DecimalUtils::max_precision<Decimal256>, scale);
+    }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {2}; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
+    {
+        return resolveOverload(arguments, result_type); /// All work is delegated to the type dispatcher; the row count is not used here.
+    }
+
+private:
+    /// Selects the DecimalArithmeticsImpl instantiation that matches the runtime decimal widths
+    /// of the dividend (arguments[0]) and the divisor (arguments[1]); the result type is always Decimal256.
+    /// Yields nullptr when either argument is not Decimal32/64/128/256.
+    ColumnPtr resolveOverload(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const
+    {
+        WhichDataType dividend_kind(arguments[0].type.get());
+        WhichDataType divisor_kind(arguments[1].type.get());
+
+        /// First dispatch level: fix the dividend data type.
+        if (dividend_kind.isDecimal32())
+            return resolveDivisor<DataTypeDecimal32>(divisor_kind, arguments, result_type);
+        if (dividend_kind.isDecimal64())
+            return resolveDivisor<DataTypeDecimal64>(divisor_kind, arguments, result_type);
+        if (dividend_kind.isDecimal128())
+            return resolveDivisor<DataTypeDecimal128>(divisor_kind, arguments, result_type);
+        if (dividend_kind.isDecimal256())
+            return resolveDivisor<DataTypeDecimal256>(divisor_kind, arguments, result_type);
+
+        /// The dividend is not a decimal: there is no overload to call.
+        return nullptr;
+    }
+
+    /// Second dispatch level: the dividend data type is already fixed, the divisor width picks the overload.
+    /// Yields nullptr when the divisor is not Decimal32/64/128/256.
+    template <typename DividendType>
+    ColumnPtr resolveDivisor(
+        WhichDataType divisor_kind, const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type) const
+    {
+        if (divisor_kind.isDecimal32())
+            return DecimalArithmeticsImpl<DividendType, DataTypeDecimal32, DataTypeDecimal256, Transform>::execute(Transform{}, arguments, result_type);
+        if (divisor_kind.isDecimal64())
+            return DecimalArithmeticsImpl<DividendType, DataTypeDecimal64, DataTypeDecimal256, Transform>::execute(Transform{}, arguments, result_type);
+        if (divisor_kind.isDecimal128())
+            return DecimalArithmeticsImpl<DividendType, DataTypeDecimal128, DataTypeDecimal256, Transform>::execute(Transform{}, arguments, result_type);
+        if (divisor_kind.isDecimal256())
+            return DecimalArithmeticsImpl<DividendType, DataTypeDecimal256, DataTypeDecimal256, Transform>::execute(Transform{}, arguments, result_type);
+
+        return nullptr;
+    }
+};
+
+}
+
--- a/src/Functions/FunctionsHashing.cpp
+++ b/src/Functions/FunctionsHashing.cpp
@ -39,6 +39,13 @@ REGISTER_FUNCTION(Hashing)

    factory.registerFunction<FunctionXxHash32>();
    factory.registerFunction<FunctionXxHash64>();
+    factory.registerFunction<FunctionXXH3>(
+        {
+            "Calculates value of XXH3 64-bit hash function. Refer to https://github.com/Cyan4973/xxHash for detailed documentation.",
+            Documentation::Examples{{"hash", "SELECT xxh3('ClickHouse')"}},
+            Documentation::Categories{"Hash"}
+        },
+        FunctionFactory::CaseSensitive);

    factory.registerFunction<FunctionWyHash64>();

--- a/src/Functions/FunctionsHashing.h
+++ b/src/Functions/FunctionsHashing.h
@ -3,12 +3,18 @@
 #include <city.h>
 #include <farmhash.h>
 #include <metrohash.h>
+#include <wyhash.h>
 #include <MurmurHash2.h>
 #include <MurmurHash3.h>
-#include <wyhash.h>

 #include "config.h"

+#ifdef __clang__
+#    pragma clang diagnostic push
+#    pragma clang diagnostic ignored "-Wused-but-marked-unused"
+#endif
+#include <xxhash.h>
+
 #if USE_BLAKE3
 #    include <blake3.h>
 #endif
@ -17,7 +23,6 @@
 #include <Common/typeid_cast.h>
 #include <Common/safe_cast.h>
 #include <Common/HashTable/Hash.h>
-#include <xxhash.h>

 #if USE_SSL
 #    include <openssl/md4.h>
@ -588,7 +593,7 @@ struct ImplXxHash32
    static constexpr auto name = "xxHash32";
    using ReturnType = UInt32;

-    static auto apply(const char * s, const size_t len) { return XXH32(s, len, 0); }
+    static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH32(s, len, 0); }
    /**
      *  With current implementation with more than 1 arguments it will give the results
      *  non-reproducible from outside of CH.
@ -609,7 +614,24 @@ struct ImplXxHash64
    using ReturnType = UInt64;
    using uint128_t = CityHash_v1_0_2::uint128;

-    static auto apply(const char * s, const size_t len) { return XXH64(s, len, 0); }
+    static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH64(s, len, 0); }
+
+    /*
+       With current implementation with more than 1 arguments it will give the results
+       non-reproducible from outside of CH. (see comment on ImplXxHash32).
+     */
+    static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
+
+    static constexpr bool use_int_hash_for_pods = false;
+};
+
+struct ImplXXH3
+{
+    static constexpr auto name = "xxh3";
+    using ReturnType = UInt64;
+    using uint128_t = CityHash_v1_0_2::uint128;
+
+    static auto apply(const char * s, const size_t len) { return XXH_INLINE_XXH3_64bits(s, len); }

    /*
       With current implementation with more than 1 arguments it will give the results
@ -1508,7 +1530,12 @@ using FunctionHiveHash = FunctionAnyHash<HiveHashImpl>;

 using FunctionXxHash32 = FunctionAnyHash<ImplXxHash32>;
 using FunctionXxHash64 = FunctionAnyHash<ImplXxHash64>;
+using FunctionXXH3 = FunctionAnyHash<ImplXXH3>;

 using FunctionWyHash64 = FunctionAnyHash<ImplWyHash64>;
 using FunctionBLAKE3 = FunctionStringHashFixedString<ImplBLAKE3>;
 }
+
+#ifdef __clang__
+#    pragma clang diagnostic pop
+#endif
--- a/src/Interpreters/AsynchronousInsertQueue.cpp
+++ b/src/Interpreters/AsynchronousInsertQueue.cpp
@ -48,15 +48,22 @@ namespace ErrorCodes
    extern const int TIMEOUT_EXCEEDED;
    extern const int UNKNOWN_EXCEPTION;
    extern const int UNKNOWN_FORMAT;
+    extern const int BAD_ARGUMENTS;
 }

 AsynchronousInsertQueue::InsertQuery::InsertQuery(const ASTPtr & query_, const Settings & settings_)
-    : query(query_->clone()), settings(settings_)
+    : query(query_->clone()) /// Keep a private clone of the AST.
+    , query_str(queryToString(query)) /// Cached text form; reads the member `query`, so it relies on `query` being declared before it.
+    , settings(settings_)
+    , hash(calculateHash()) /// Reads `query` and `settings`, so `hash` must stay the last declared member.
 {
 }

 AsynchronousInsertQueue::InsertQuery::InsertQuery(const InsertQuery & other)
-    : query(other.query->clone()), settings(other.settings)
+    : query(other.query->clone()) /// Clone the AST rather than share the pointer with `other`.
+    , query_str(other.query_str) /// The cached string and hash are copied, not recomputed.
+    , settings(other.settings)
+    , hash(other.hash)
 {
 }

@ -66,29 +73,33 @@ AsynchronousInsertQueue::InsertQuery::operator=(const InsertQuery & other)
    if (this != &other)
    {
        query = other.query->clone();
+        query_str = other.query_str;
        settings = other.settings;
+        hash = other.hash;
    }

    return *this;
 }

-UInt64 AsynchronousInsertQueue::InsertQuery::Hash::operator()(const InsertQuery & insert_query) const
+UInt128 AsynchronousInsertQueue::InsertQuery::calculateHash() const /// 128-bit SipHash over the query AST and every changed setting.
 {
-    SipHash hash;
-    insert_query.query->updateTreeHash(hash);
+    SipHash siphash;
+    query->updateTreeHash(siphash); /// Feed the AST into the hash state first.

-    for (const auto & setting : insert_query.settings.allChanged())
+    for (const auto & setting : settings.allChanged()) /// Only changed settings contribute: name, then value.
     {
-        hash.update(setting.getName());
-        applyVisitor(FieldVisitorHash(hash), setting.getValue());
+        siphash.update(setting.getName());
+        applyVisitor(FieldVisitorHash(siphash), setting.getValue());
     }

-    return hash.get64();
+    UInt128 res;
+    siphash.get128(res); /// Take the full 128-bit digest (the removed code returned get64()).
+    return res;
 }

 bool AsynchronousInsertQueue::InsertQuery::operator==(const InsertQuery & other) const
 {
-    return queryToString(query) == queryToString(other.query) && settings == other.settings;
+    return query_str == other.query_str && settings == other.settings; /// Compares the cached strings instead of re-serializing both ASTs.
 }

 AsynchronousInsertQueue::InsertData::Entry::Entry(String && bytes_, String && query_id_)
@ -100,43 +111,31 @@ AsynchronousInsertQueue::InsertData::Entry::Entry(String && bytes_, String && qu

 void AsynchronousInsertQueue::InsertData::Entry::finish(std::exception_ptr exception_)
 {
-    std::lock_guard lock(mutex);
-    finished = true;
+    if (finished.exchange(true)) /// Only the first caller gets past this point, so the promise is satisfied at most once.
+        return;
+
     if (exception_)
+    {
+        promise.set_exception(exception_); /// The holder of the future gets this exception rethrown from get().
         ProfileEvents::increment(ProfileEvents::FailedAsyncInsertQuery, 1);
-    exception = exception_;
-    cv.notify_all();
+    }
+    else
+    {
+        promise.set_value(); /// Success: make the future ready with no value.
+    }
 }

-bool AsynchronousInsertQueue::InsertData::Entry::wait(const Milliseconds & timeout) const
-{
-    std::unique_lock lock(mutex);
-    return cv.wait_for(lock, timeout, [&] { return finished; });
-}
-
-bool AsynchronousInsertQueue::InsertData::Entry::isFinished() const
-{
-    std::lock_guard lock(mutex);
-    return finished;
-}
-
-std::exception_ptr AsynchronousInsertQueue::InsertData::Entry::getException() const
-{
-    std::lock_guard lock(mutex);
-    return exception;
-}
-
-
-AsynchronousInsertQueue::AsynchronousInsertQueue(ContextPtr context_, size_t pool_size, Milliseconds cleanup_timeout_)
+AsynchronousInsertQueue::AsynchronousInsertQueue(ContextPtr context_, size_t pool_size_)
     : WithContext(context_)
-    , cleanup_timeout(cleanup_timeout_)
+    , pool_size(pool_size_)
+    , queue_shards(pool_size) /// The number of queue shards equals pool_size.
     , pool(pool_size)
-    , dump_by_first_update_thread(&AsynchronousInsertQueue::busyCheck, this)
-    , cleanup_thread(&AsynchronousInsertQueue::cleanup, this)
 {
-    using namespace std::chrono;
+    if (!pool_size) /// push() computes `key.hash % pool_size`, so zero must be rejected.
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "pool_size cannot be zero");

-    assert(pool_size);
+    for (size_t i = 0; i < pool_size; ++i)
+        dump_by_first_update_threads.emplace_back([this, i] { processBatchDeadlines(i); }); /// One deadline-watching thread per shard.
 }

 AsynchronousInsertQueue::~AsynchronousInsertQueue()
@ -144,34 +143,31 @@ AsynchronousInsertQueue::~AsynchronousInsertQueue()
    /// TODO: add a setting for graceful shutdown.

    LOG_TRACE(log, "Shutting down the asynchronous insertion queue");
-
    shutdown = true;
-    {
-        std::lock_guard lock(deadline_mutex);
-        are_tasks_available.notify_one();
-    }
-    {
-        std::lock_guard lock(cleanup_mutex);
-        cleanup_can_run.notify_one();
-    }

-    assert(dump_by_first_update_thread.joinable());
-    dump_by_first_update_thread.join();
+    for (size_t i = 0; i < pool_size; ++i)
+    {
+        auto & shard = queue_shards[i];

-    assert(cleanup_thread.joinable());
-    cleanup_thread.join();
+        shard.are_tasks_available.notify_one();
+        assert(dump_by_first_update_threads[i].joinable());
+        dump_by_first_update_threads[i].join();
+
+        {
+            std::lock_guard lock(shard.mutex);
+
+            for (auto & [_, elem] : shard.queue)
+            {
+                for (const auto & entry : elem.data->entries)
+                {
+                    entry->finish(std::make_exception_ptr(Exception(
+                        ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout exceeded)")));
+                }
+            }
+        }
+    }

    pool.wait();
-
-    std::lock_guard lock(currently_processing_mutex);
-    for (const auto & [_, entry] : currently_processing_queries)
-    {
-        if (!entry->isFinished())
-            entry->finish(std::make_exception_ptr(Exception(
-                ErrorCodes::TIMEOUT_EXCEEDED,
-                "Wait for async insert timeout exceeded)")));
-    }
-
    LOG_TRACE(log, "Asynchronous insertion queue finished");
 }

@ -185,7 +181,7 @@ void AsynchronousInsertQueue::scheduleDataProcessingJob(const InsertQuery & key,
    });
 }

-void AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
+std::future<void> AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
 {
    query = query->clone();
    const auto & settings = query_context->getSettingsRef();
@ -214,97 +210,77 @@ void AsynchronousInsertQueue::push(ASTPtr query, ContextPtr query_context)
        quota->used(QuotaType::WRITTEN_BYTES, bytes.size());

    auto entry = std::make_shared<InsertData::Entry>(std::move(bytes), query_context->getCurrentQueryId());
+
    InsertQuery key{query, settings};
+    InsertDataPtr data_to_process;
+    std::future<void> insert_future;
+
+    auto shard_num = key.hash % pool_size;
+    auto & shard = queue_shards[shard_num];

    {
-        /// Firstly try to get entry from queue without exclusive lock.
-        std::shared_lock read_lock(rwlock);
-        if (auto it = queue.find(key); it != queue.end())
+        std::lock_guard lock(shard.mutex);
+
+        auto [it, inserted] = shard.iterators.try_emplace(key.hash);
+        if (inserted)
        {
-            pushImpl(std::move(entry), it);
-            return;
+            auto now = std::chrono::steady_clock::now();
+            auto timeout = now + Milliseconds{key.settings.async_insert_busy_timeout_ms};
+            it->second = shard.queue.emplace(timeout, Container{key, std::make_unique<InsertData>()}).first;
        }
+
+        auto queue_it = it->second;
+        auto & data = queue_it->second.data;
+        size_t entry_data_size = entry->bytes.size();
+
+        assert(data);
+        data->size_in_bytes += entry_data_size;
+        data->entries.emplace_back(entry);
+        insert_future = entry->getFuture();
+
+        LOG_TRACE(log, "Have {} pending inserts with total {} bytes of data for query '{}'",
+            data->entries.size(), data->size_in_bytes, key.query_str);
+
+        /// Here we check whether we hit the limit on maximum data size in the buffer.
+        /// And use setting from query context.
+        /// It works, because queries with the same set of settings are already grouped together.
+        if (data->size_in_bytes > key.settings.async_insert_max_data_size)
+        {
+            data_to_process = std::move(data);
+            shard.iterators.erase(it);
+            shard.queue.erase(queue_it);
+        }
+
+        CurrentMetrics::add(CurrentMetrics::PendingAsyncInsert);
+        ProfileEvents::increment(ProfileEvents::AsyncInsertQuery);
+        ProfileEvents::increment(ProfileEvents::AsyncInsertBytes, entry_data_size);
    }

-    std::lock_guard write_lock(rwlock);
-    auto it = queue.emplace(key, std::make_shared<Container>()).first;
-    pushImpl(std::move(entry), it);
+    if (data_to_process)
+        scheduleDataProcessingJob(key, std::move(data_to_process), getContext());
+    else
+        shard.are_tasks_available.notify_one();
+
+    return insert_future;
 }

-void AsynchronousInsertQueue::pushImpl(InsertData::EntryPtr entry, QueueIterator it)
+void AsynchronousInsertQueue::processBatchDeadlines(size_t shard_num)
 {
-    auto & [data_mutex, data] = *it->second;
-    std::lock_guard data_lock(data_mutex);
+    auto & shard = queue_shards[shard_num];

-    if (!data)
-    {
-        auto now = std::chrono::steady_clock::now();
-        data = std::make_unique<InsertData>(now);
-
-        std::lock_guard lock(deadline_mutex);
-        deadline_queue.insert({now + Milliseconds{it->first.settings.async_insert_busy_timeout_ms}, it});
-        are_tasks_available.notify_one();
-    }
-
-    size_t entry_data_size = entry->bytes.size();
-
-    data->size += entry_data_size;
-    data->entries.emplace_back(entry);
-
-    {
-        std::lock_guard currently_processing_lock(currently_processing_mutex);
-        currently_processing_queries.emplace(entry->query_id, entry);
-    }
-
-    LOG_TRACE(log, "Have {} pending inserts with total {} bytes of data for query '{}'",
-        data->entries.size(), data->size, queryToString(it->first.query));
-
-    /// Here we check whether we hit the limit on maximum data size in the buffer.
-    /// And use setting from query context!
-    /// It works, because queries with the same set of settings are already grouped together.
-    if (data->size > it->first.settings.async_insert_max_data_size)
-        scheduleDataProcessingJob(it->first, std::move(data), getContext());
-
-    CurrentMetrics::add(CurrentMetrics::PendingAsyncInsert);
-    ProfileEvents::increment(ProfileEvents::AsyncInsertQuery);
-    ProfileEvents::increment(ProfileEvents::AsyncInsertBytes, entry_data_size);
-}
-
-void AsynchronousInsertQueue::waitForProcessingQuery(const String & query_id, const Milliseconds & timeout)
-{
-    InsertData::EntryPtr entry;
-
-    {
-        std::lock_guard lock(currently_processing_mutex);
-        auto it = currently_processing_queries.find(query_id);
-        if (it == currently_processing_queries.end())
-            return;
-
-        entry = it->second;
-    }
-
-    bool finished = entry->wait(timeout);
-
-    if (!finished)
-        throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout ({} ms) exceeded)", timeout.count());
-
-    if (auto exception = entry->getException())
-        std::rethrow_exception(exception);
-}
-
-void AsynchronousInsertQueue::busyCheck()
-{
    while (!shutdown)
    {
-        std::vector<QueueIterator> entries_to_flush;
+        std::vector<Container> entries_to_flush;
        {
-            std::unique_lock deadline_lock(deadline_mutex);
-            are_tasks_available.wait_for(deadline_lock, Milliseconds(getContext()->getSettingsRef().async_insert_busy_timeout_ms), [this]()
+            std::unique_lock lock(shard.mutex);
+
+            shard.are_tasks_available.wait_for(lock,
+                Milliseconds(getContext()->getSettingsRef().async_insert_busy_timeout_ms), [&shard, this]
            {
                if (shutdown)
                    return true;

-                if (!deadline_queue.empty() && deadline_queue.begin()->first < std::chrono::steady_clock::now())
+                if (!shard.queue.empty() && shard.queue.begin()->first < std::chrono::steady_clock::now())
                    return true;

                return false;
@ -317,91 +293,22 @@ void AsynchronousInsertQueue::busyCheck()

            while (true)
            {
-                if (deadline_queue.empty() || deadline_queue.begin()->first > now)
+                if (shard.queue.empty() || shard.queue.begin()->first > now)
                    break;

-                entries_to_flush.emplace_back(deadline_queue.begin()->second);
-                deadline_queue.erase(deadline_queue.begin());
+                auto it = shard.queue.begin();
+                shard.iterators.erase(it->second.key.hash);
+
+                entries_to_flush.emplace_back(std::move(it->second));
+                shard.queue.erase(it);
            }
        }

-        std::shared_lock read_lock(rwlock);
        for (auto & entry : entries_to_flush)
-        {
-            auto & [key, elem] = *entry;
-            std::lock_guard data_lock(elem->mutex);
-            if (!elem->data)
-                continue;
-
-            scheduleDataProcessingJob(key, std::move(elem->data), getContext());
-        }
+            scheduleDataProcessingJob(entry.key, std::move(entry.data), getContext());
    }
 }

-void AsynchronousInsertQueue::cleanup()
-{
-    while (true)
-    {
-        {
-            std::unique_lock cleanup_lock(cleanup_mutex);
-            cleanup_can_run.wait_for(cleanup_lock, Milliseconds(cleanup_timeout), [this]() -> bool { return shutdown; });
-
-            if (shutdown)
-                return;
-        }
-
-        std::vector<InsertQuery> keys_to_remove;
-
-        {
-            std::shared_lock read_lock(rwlock);
-
-            for (auto & [key, elem] : queue)
-            {
-                std::lock_guard data_lock(elem->mutex);
-                if (!elem->data)
-                    keys_to_remove.push_back(key);
-            }
-        }
-
-        if (!keys_to_remove.empty())
-        {
-            std::lock_guard write_lock(rwlock);
-            size_t total_removed = 0;
-
-            for (const auto & key : keys_to_remove)
-            {
-                auto it = queue.find(key);
-                if (it != queue.end() && !it->second->data)
-                {
-                    queue.erase(it);
-                    ++total_removed;
-                }
-            }
-
-            if (total_removed)
-                LOG_TRACE(log, "Removed stale entries for {} queries from asynchronous insertion queue", total_removed);
-        }
-
-        {
-            std::vector<String> ids_to_remove;
-            std::lock_guard lock(currently_processing_mutex);
-
-            for (const auto & [query_id, entry] : currently_processing_queries)
-                if (entry->isFinished())
-                    ids_to_remove.push_back(query_id);
-
-            if (!ids_to_remove.empty())
-            {
-                for (const auto & id : ids_to_remove)
-                    currently_processing_queries.erase(id);
-
-                LOG_TRACE(log, "Removed {} finished entries from asynchronous insertion queue", ids_to_remove.size());
-            }
-        }
-    }
-}
-
-
 static void appendElementsToLogSafe(
    AsynchronousInsertLog & log,
    std::vector<AsynchronousInsertLogElement> elements,
@ -464,7 +371,7 @@ try
    {
        current_exception = e.displayText();
        LOG_ERROR(log, "Failed parsing for query '{}' with query id {}. {}",
-            queryToString(key.query), current_entry->query_id, current_exception);
+            key.query_str, current_entry->query_id, current_exception);

        for (const auto & column : result_columns)
            if (column->size() > total_rows)
@ -546,7 +453,7 @@ try
        completed_executor.execute();

        LOG_INFO(log, "Flushed {} rows, {} bytes for query '{}'",
-            total_rows, total_bytes, queryToString(key.query));
+            total_rows, total_bytes, key.query_str);
    }
    catch (...)
    {
--- a/src/Interpreters/AsynchronousInsertQueue.h
+++ b/src/Interpreters/AsynchronousInsertQueue.h
@ -4,10 +4,7 @@
 #include <Common/ThreadPool.h>
 #include <Core/Settings.h>
 #include <Poco/Logger.h>
-
-#include <atomic>
-#include <unordered_map>
-
+#include <future>

 namespace DB
 {
@ -19,25 +16,29 @@ class AsynchronousInsertQueue : public WithContext
 public:
    using Milliseconds = std::chrono::milliseconds;

-    AsynchronousInsertQueue(ContextPtr context_, size_t pool_size, Milliseconds cleanup_timeout);
+    AsynchronousInsertQueue(ContextPtr context_, size_t pool_size_);
    ~AsynchronousInsertQueue();

-    void push(ASTPtr query, ContextPtr query_context);
-    void waitForProcessingQuery(const String & query_id, const Milliseconds & timeout);
+    std::future<void> push(ASTPtr query, ContextPtr query_context);
+    size_t getPoolSize() const { return pool_size; }

 private:

    struct InsertQuery
    {
+    public:
        ASTPtr query;
+        String query_str; /// Text form of `query`, cached at construction; used for equality checks and log messages.
        Settings settings;
+        UInt128 hash; /// Cached calculateHash() result; declared last because it is computed from `query` and `settings`.

        InsertQuery(const ASTPtr & query_, const Settings & settings_);
        InsertQuery(const InsertQuery & other);
        InsertQuery & operator=(const InsertQuery & other);
-
        bool operator==(const InsertQuery & other) const;
-        struct Hash { UInt64 operator()(const InsertQuery & insert_query) const; };
+
+    private:
+        UInt128 calculateHash() const; /// Hashes the query AST together with all changed settings.
    };

    struct InsertData
@ -47,109 +48,84 @@ private:
        public:
            const String bytes;
            const String query_id;
-            std::chrono::time_point<std::chrono::system_clock> create_time;
+            const std::chrono::time_point<std::chrono::system_clock> create_time;

            Entry(String && bytes_, String && query_id_);

            void finish(std::exception_ptr exception_ = nullptr);
-            bool wait(const Milliseconds & timeout) const;
-            bool isFinished() const;
-            std::exception_ptr getException() const;
+            std::future<void> getFuture() { return promise.get_future(); }
+            bool isFinished() const { return finished; }

        private:
-            mutable std::mutex mutex;
-            mutable std::condition_variable cv;
-
-            bool finished = false;
-            std::exception_ptr exception;
+            std::promise<void> promise;
+            std::atomic_bool finished = false;
        };

-        explicit InsertData(std::chrono::steady_clock::time_point now)
-            : first_update(now)
-        {}
-
        using EntryPtr = std::shared_ptr<Entry>;

        std::list<EntryPtr> entries;
-        size_t size = 0;
-
-        /// Timestamp of the first insert into queue, or after the last queue dump.
-        /// Used to detect for how long the queue is active, so we can dump it by timer.
-        std::chrono::time_point<std::chrono::steady_clock> first_update;
+        size_t size_in_bytes = 0;
    };

    using InsertDataPtr = std::unique_ptr<InsertData>;

-    /// A separate container, that holds a data and a mutex for it.
-    /// When it's needed to process current chunk of data, it can be moved for processing
-    /// and new data can be recreated without holding a lock during processing.
    struct Container
    {
-        std::mutex mutex;
+        InsertQuery key; /// The query (with its settings) that all entries accumulated in `data` belong to.
        InsertDataPtr data;
    };

-    using Queue = std::unordered_map<InsertQuery, std::shared_ptr<Container>, InsertQuery::Hash>;
-    using QueueIterator = Queue::iterator;
    /// Ordered container
-    using DeadlineQueue = std::map<std::chrono::steady_clock::time_point, QueueIterator>;
+    /// Key is the flush deadline of the batch: the time of its first insert plus async_insert_busy_timeout_ms.
+    /// Used to detect batches that have been active for too long, so we can dump them by timer.
+    using Queue = std::map<std::chrono::steady_clock::time_point, Container>;
+    using QueueIterator = Queue::iterator;
+    using QueueIteratorByKey = std::unordered_map<UInt128, QueueIterator>;

+    struct QueueShard /// One independent partition of the queue; push() routes a query to shard `key.hash % pool_size`.
+    {
+        mutable std::mutex mutex; /// Guards `queue` and `iterators`.
+        mutable std::condition_variable are_tasks_available; /// Notified by push() and by the destructor to wake this shard's deadline thread.

+        Queue queue; /// Pending batches ordered by deadline. NOTE(review): keyed by time_point only; confirm two batches can never share a deadline.
+        QueueIteratorByKey iterators; /// Query hash -> position of that query's batch in `queue`.
+    };

-    /// This is needed only for using inside cleanup() function and correct signaling about shutdown
-    mutable std::mutex cleanup_mutex;
-    mutable std::condition_variable cleanup_can_run;
-
-    mutable std::mutex deadline_mutex;
-    mutable std::condition_variable are_tasks_available;
-    DeadlineQueue deadline_queue;
-
-    using QueryIdToEntry = std::unordered_map<String, InsertData::EntryPtr>;
-    mutable std::mutex currently_processing_mutex;
-    QueryIdToEntry currently_processing_queries;
+    const size_t pool_size;
+    std::vector<QueueShard> queue_shards;

    /// Logic and events behind queue are as follows:
-    ///  - busy_timeout:   if queue is active for too long and there are a lot of rapid inserts, then we dump the data, so it doesn't
-    ///                    grow for a long period of time and users will be able to select new data in deterministic manner.
-    ///  - stale_timeout:  if queue is stale for too long, then we dump the data too, so that users will be able to select the last
-    ///                    piece of inserted data.
+    ///  - async_insert_busy_timeout_ms:
+    ///   if queue is active for too long and there are a lot of rapid inserts, then we dump the data, so it doesn't
+    ///   grow for a long period of time and users will be able to select new data in deterministic manner.
    ///
-    /// During processing incoming INSERT queries we can also check whether the maximum size of data in buffer is reached (async_insert_max_data_size setting)
-    /// If so, then again we dump the data.
-
-    const Milliseconds cleanup_timeout;
+    /// During processing incoming INSERT queries we can also check whether the maximum size of data in buffer is reached
+    /// (async_insert_max_data_size setting). If so, then again we dump the data.

    std::atomic<bool> shutdown{false};

-    ThreadPool pool;  /// dump the data only inside this pool.
-    ThreadFromGlobalPool dump_by_first_update_thread;  /// uses busy_timeout and busyCheck()
-    ThreadFromGlobalPool cleanup_thread;               /// uses busy_timeout and cleanup()
+    /// Dump the data only inside this pool.
+    ThreadPool pool;
+
+    /// Uses async_insert_busy_timeout_ms and processBatchDeadlines()
+    std::vector<ThreadFromGlobalPool> dump_by_first_update_threads;

    Poco::Logger * log = &Poco::Logger::get("AsynchronousInsertQueue");

-    void busyCheck();
-    void cleanup();
-
-    /// Should be called with shared or exclusively locked 'rwlock'.
-    void pushImpl(InsertData::EntryPtr entry, QueueIterator it);
-
+    void processBatchDeadlines(size_t shard_num);
    void scheduleDataProcessingJob(const InsertQuery & key, InsertDataPtr data, ContextPtr global_context);
+
    static void processData(InsertQuery key, InsertDataPtr data, ContextPtr global_context);

    template <typename E>
    static void finishWithException(const ASTPtr & query, const std::list<InsertData::EntryPtr> & entries, const E & exception);

-    /// @param timeout - time to wait
-    /// @return true if shutdown requested
-    bool waitForShutdown(const Milliseconds & timeout);
-
 public:
-    auto getQueueLocked() const
+    auto getQueueLocked(size_t shard_num) const /// Returns {reference to the shard's queue, lock on the shard's mutex}.
    {
-        std::shared_lock lock(rwlock);
-        return std::make_pair(std::ref(queue), std::move(lock));
+        auto & shard = queue_shards[shard_num]; /// No bounds check here: shard_num must be less than getPoolSize().
+        std::unique_lock lock(shard.mutex);
+        return std::make_pair(std::ref(shard.queue), std::move(lock)); /// The lock is handed to the caller, so the mutex stays held after return.
    }
 };

--- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp
+++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp
@ -174,18 +174,15 @@ void SelectStreamFactory::createForShard(
 }


-SelectStreamFactory::ShardPlans SelectStreamFactory::createForShardWithParallelReplicas(
+void SelectStreamFactory::createForShardWithParallelReplicas(
    const Cluster::ShardInfo & shard_info,
    const ASTPtr & query_ast,
    const StorageID & main_table,
-    const ASTPtr & table_function_ptr,
-    const ThrottlerPtr & throttler,
    ContextPtr context,
    UInt32 shard_count,
-    const std::shared_ptr<const StorageLimitsList> & storage_limits)
+    std::vector<QueryPlanPtr> & local_plans,
+    Shards & remote_shards)
 {
-    SelectStreamFactory::ShardPlans result;
-
    if (auto it = objects_by_shard.find(shard_info.shard_num); it != objects_by_shard.end())
        replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast);

@ -213,8 +210,6 @@ SelectStreamFactory::ShardPlans SelectStreamFactory::createForShardWithParallelR
    size_t all_replicas_count = shard_info.getRemoteNodeCount();

    auto coordinator = std::make_shared<ParallelReplicasReadingCoordinator>();
-    auto remote_plan = std::make_unique<QueryPlan>();
-

    if (settings.prefer_localhost_replica && shard_info.isLocal())
    {
@ -223,48 +218,22 @@ SelectStreamFactory::ShardPlans SelectStreamFactory::createForShardWithParallelR
        {
            ++all_replicas_count;

-            result.local_plan = createLocalPlan(
-                query_ast, header, context, processed_stage, shard_info.shard_num, shard_count, next_replica_number, all_replicas_count, coordinator);
+            local_plans.emplace_back(createLocalPlan(
+                query_ast, header, context, processed_stage, shard_info.shard_num, shard_count, next_replica_number, all_replicas_count, coordinator));

            ++next_replica_number;
        }
    }

-    Scalars scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{};
-    scalars.emplace(
-        "_shard_count", Block{{DataTypeUInt32().createColumnConst(1, shard_count), std::make_shared<DataTypeUInt32>(), "_shard_count"}});
-    auto external_tables = context->getExternalTables();
-
-    auto shard = Shard{
-        .query = query_ast,
-        .header = header,
-        .shard_info = shard_info,
-        .lazy = false,
-        .local_delay = 0,
-    };
-
    if (shard_info.hasRemoteConnections())
-    {
-        auto read_from_remote = std::make_unique<ReadFromParallelRemoteReplicasStep>(
-            coordinator,
-            shard,
-            header,
-            processed_stage,
-            main_table,
-            table_function_ptr,
-            context,
-            throttler,
-            std::move(scalars),
-            std::move(external_tables),
-            &Poco::Logger::get("ReadFromParallelRemoteReplicasStep"),
-            storage_limits);
-
-        remote_plan->addStep(std::move(read_from_remote));
-        remote_plan->addInterpreterContext(context);
-        result.remote_plan = std::move(remote_plan);
-    }
-
-    return result;
+        remote_shards.emplace_back(Shard{
+            .query = query_ast,
+            .header = header,
+            .shard_info = shard_info,
+            .lazy = false,
+            .local_delay = 0,
+            .coordinator = coordinator,
+        });
 }

 }
--- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h
+++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h
@ -1,12 +1,13 @@
 #pragma once

-#include <Core/QueryProcessingStage.h>
-#include <Interpreters/StorageID.h>
-#include <Storages/IStorage_fwd.h>
-#include <Storages/StorageSnapshot.h>
 #include <Client/ConnectionPool.h>
+#include <Core/QueryProcessingStage.h>
 #include <Interpreters/Cluster.h>
+#include <Interpreters/StorageID.h>
 #include <Parsers/IAST.h>
+#include <Storages/IStorage_fwd.h>
+#include <Storages/MergeTree/ParallelReplicasReadingCoordinator.h>
+#include <Storages/StorageSnapshot.h>

 namespace DB
 {
@ -47,6 +48,9 @@ public:
        /// (When there is a local replica with big delay).
        bool lazy = false;
        time_t local_delay = 0;
+
+        /// Set only if parallel reading from replicas is used.
+        std::shared_ptr<ParallelReplicasReadingCoordinator> coordinator;
    };

    using Shards = std::vector<Shard>;
@ -76,16 +80,14 @@ public:
        std::unique_ptr<QueryPlan> remote_plan;
    };

-    ShardPlans createForShardWithParallelReplicas(
+    void createForShardWithParallelReplicas(
        const Cluster::ShardInfo & shard_info,
        const ASTPtr & query_ast,
        const StorageID & main_table,
-        const ASTPtr & table_function_ptr,
-        const ThrottlerPtr & throttler,
        ContextPtr context,
        UInt32 shard_count,
-        const std::shared_ptr<const StorageLimitsList> & storage_limits
-    );
+        std::vector<QueryPlanPtr> & local_plans,
+        Shards & remote_shards);

 private:
    const Block header;
--- a/src/Interpreters/ClusterProxy/executeQuery.cpp
+++ b/src/Interpreters/ClusterProxy/executeQuery.cpp
@ -1,19 +1,45 @@
-#include <Interpreters/ClusterProxy/executeQuery.h>
-#include <Interpreters/ClusterProxy/SelectStreamFactory.h>
+#include <Core/QueryProcessingStage.h>
 #include <Core/Settings.h>
-#include <Interpreters/Context.h>
+#include <DataTypes/DataTypesNumber.h>
 #include <Interpreters/Cluster.h>
+#include <Interpreters/ClusterProxy/SelectStreamFactory.h>
+#include <Interpreters/ClusterProxy/executeQuery.h>
+#include <Interpreters/Context.h>
 #include <Interpreters/IInterpreter.h>
-#include <Interpreters/ProcessList.h>
+#include <Interpreters/InterpreterSelectQuery.h>
 #include <Interpreters/OptimizeShardingKeyRewriteInVisitor.h>
-#include <QueryPipeline/Pipe.h>
 #include <Parsers/queryToString.h>
+#include <Interpreters/ProcessList.h>
 #include <Processors/QueryPlan/QueryPlan.h>
 #include <Processors/QueryPlan/ReadFromRemote.h>
 #include <Processors/QueryPlan/UnionStep.h>
+#include <QueryPipeline/Pipe.h>
 #include <Storages/SelectQueryInfo.h>
-#include <DataTypes/DataTypesNumber.h>

+using namespace DB;
+
+namespace
+{
+
+/// Determines the sorting properties (sort description and sort scope) of the output stream for remote shards.
+/// They are taken from the root step of the first plan in `plans` (a local plan, because otherwise the table
+/// could be unknown). If `plans` is empty, or the root step is not a transforming step that can enforce sorting
+/// properties in a distributed query, an empty description with SortScope::None is returned.
+/// When properties are taken, the root step is also asked to adjust the settings in `context` accordingly.
+auto getRemoteShardsOutputStreamSortingProperties(const std::vector<QueryPlanPtr> & plans, ContextMutablePtr context)
+{
+    SortDescription sort_description;
+    DataStream::SortScope sort_scope = DataStream::SortScope::None;
+    if (!plans.empty())
+    {
+        if (const auto * step = dynamic_cast<const ITransformingStep *>(plans.front()->getRootNode()->step.get());
+            step && step->getDataStreamTraits().can_enforce_sorting_properties_in_distributed_query)
+        {
+            step->adjustSettingsToEnforceSortingPropertiesInDistributedQuery(context);
+
+            /// Query the output stream once and read both properties from it.
+            const auto & output_stream = step->getOutputStream();
+            sort_description = output_stream.sort_description;
+            sort_scope = output_stream.sort_scope;
+        }
+    }
+    /// The description is a local that is not used afterwards: move it into the pair instead of copying it.
+    return std::make_pair(std::move(sort_description), sort_scope);
+}
+}

 namespace DB
 {
@ -190,6 +216,8 @@ void executeQuery(
            "_shard_count", Block{{DataTypeUInt32().createColumnConst(1, shards), std::make_shared<DataTypeUInt32>(), "_shard_count"}});
        auto external_tables = context->getExternalTables();

+        auto && [sort_description, sort_scope] = getRemoteShardsOutputStreamSortingProperties(plans, new_context);
+
        auto plan = std::make_unique<QueryPlan>();
        auto read_from_remote = std::make_unique<ReadFromRemote>(
            std::move(remote_shards),
@ -203,7 +231,9 @@ void executeQuery(
            std::move(external_tables),
            log,
            shards,
-            query_info.storage_limits);
+            query_info.storage_limits,
+            std::move(sort_description),
+            std::move(sort_scope));

        read_from_remote->setStepDescription("Read from remote replica");
        plan->addStep(std::move(read_from_remote));
@ -235,10 +265,13 @@ void executeQueryWithParallelReplicas(
    const StorageID & main_table,
    const ASTPtr & table_func_ptr,
    SelectStreamFactory & stream_factory,
-    const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info,
+    const ASTPtr & query_ast,
+    ContextPtr context,
+    const SelectQueryInfo & query_info,
    const ExpressionActionsPtr & sharding_key_expr,
    const std::string & sharding_key_column_name,
-    const ClusterPtr & not_optimized_cluster)
+    const ClusterPtr & not_optimized_cluster,
+    QueryProcessingStage::Enum processed_stage)
 {
    const Settings & settings = context->getSettingsRef();

@ -261,6 +294,7 @@ void executeQueryWithParallelReplicas(


    std::vector<QueryPlanPtr> plans;
+    SelectStreamFactory::Shards remote_shards;
    size_t shards = query_info.getCluster()->getShardCount();

    for (const auto & shard_info : query_info.getCluster()->getShardsInfo())
@ -283,18 +317,43 @@ void executeQueryWithParallelReplicas(
        else
            query_ast_for_shard = query_ast;

-        auto shard_plans = stream_factory.createForShardWithParallelReplicas(shard_info,
-            query_ast_for_shard, main_table, table_func_ptr, throttler, context,
-            static_cast<UInt32>(shards), query_info.storage_limits);
+        stream_factory.createForShardWithParallelReplicas(
+            shard_info, query_ast_for_shard, main_table, context, static_cast<UInt32>(shards), plans, remote_shards);
+    }

-        if (!shard_plans.local_plan && !shard_plans.remote_plan)
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "No plans were generated for reading from shard. This is a bug");
+    Scalars scalars = context->hasQueryContext() ? context->getQueryContext()->getScalars() : Scalars{};
+    scalars.emplace(
+        "_shard_count", Block{{DataTypeUInt32().createColumnConst(1, shards), std::make_shared<DataTypeUInt32>(), "_shard_count"}});
+    auto external_tables = context->getExternalTables();

-        if (shard_plans.local_plan)
-            plans.emplace_back(std::move(shard_plans.local_plan));
+    if (!remote_shards.empty())
+    {
+        auto new_context = Context::createCopy(context);
+        auto && [sort_description, sort_scope] = getRemoteShardsOutputStreamSortingProperties(plans, new_context);

-        if (shard_plans.remote_plan)
-            plans.emplace_back(std::move(shard_plans.remote_plan));
+        for (const auto & shard : remote_shards)
+        {
+            auto read_from_remote = std::make_unique<ReadFromParallelRemoteReplicasStep>(
+                shard.coordinator,
+                shard,
+                shard.header,
+                processed_stage,
+                main_table,
+                table_func_ptr,
+                new_context,
+                throttler,
+                scalars,
+                external_tables,
+                &Poco::Logger::get("ReadFromParallelRemoteReplicasStep"),
+                query_info.storage_limits,
+                sort_description,
+                sort_scope);
+
+            auto remote_plan = std::make_unique<QueryPlan>();
+            remote_plan->addStep(std::move(read_from_remote));
+            remote_plan->addInterpreterContext(new_context);
+            plans.emplace_back(std::move(remote_plan));
+        }
    }

    if (plans.empty())
--- a/src/Interpreters/ClusterProxy/executeQuery.h
+++ b/src/Interpreters/ClusterProxy/executeQuery.h
@ -58,11 +58,13 @@ void executeQueryWithParallelReplicas(
    const StorageID & main_table,
    const ASTPtr & table_func_ptr,
    SelectStreamFactory & stream_factory,
-    const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info,
+    const ASTPtr & query_ast,
+    ContextPtr context,
+    const SelectQueryInfo & query_info,
    const ExpressionActionsPtr & sharding_key_expr,
    const std::string & sharding_key_column_name,
-    const ClusterPtr & not_optimized_cluster);
-
+    const ClusterPtr & not_optimized_cluster,
+    QueryProcessingStage::Enum processed_stage);
 }

 }
--- a/src/Interpreters/InterpreterDropQuery.cpp
+++ b/src/Interpreters/InterpreterDropQuery.cpp
@ -7,6 +7,7 @@
 #include <Access/Common/AccessRightsElement.h>
 #include <Parsers/ASTDropQuery.h>
 #include <Storages/IStorage.h>
+#include <Storages/MergeTree/MergeTreeData.h>
 #include <Common/escapeForFileName.h>
 #include <Common/quoteString.h>
 #include <Common/typeid_cast.h>
@ -120,6 +121,8 @@ BlockIO InterpreterDropQuery::executeToTableImpl(ContextPtr context_, ASTDropQue
    auto [database, table] = query.if_exists ? DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context_)
                                             : DatabaseCatalog::instance().getDatabaseAndTable(table_id, context_);

+    checkStorageSupportsTransactionsIfNeeded(table, context_);
+
    if (database && table)
    {
        auto & ast_drop_query = query.as<ASTDropQuery &>();
@ -207,18 +210,15 @@ BlockIO InterpreterDropQuery::executeToTableImpl(ContextPtr context_, ASTDropQue

            table->checkTableCanBeDropped();

-            TableExclusiveLockHolder table_lock;
-            /// We don't need this lock for ReplicatedMergeTree
-            if (!table->supportsReplication())
-            {
-                /// And for simple MergeTree we can stop merges before acquiring the lock
-                auto merges_blocker = table->getActionLock(ActionLocks::PartsMerge);
-                table_lock = table->lockExclusively(context_->getCurrentQueryId(), context_->getSettingsRef().lock_acquire_timeout);
-            }
+            TableExclusiveLockHolder table_excl_lock;
+            /// We don't need any lock for ReplicatedMergeTree and for simple MergeTree
+            /// For the rest of tables types exclusive lock is needed
+            if (!std::dynamic_pointer_cast<MergeTreeData>(table))
+                table_excl_lock = table->lockExclusively(context_->getCurrentQueryId(), context_->getSettingsRef().lock_acquire_timeout);

            auto metadata_snapshot = table->getInMemoryMetadataPtr();
            /// Drop table data, don't touch metadata
-            table->truncate(query_ptr, metadata_snapshot, context_, table_lock);
+            table->truncate(query_ptr, metadata_snapshot, context_, table_excl_lock);
        }
        else if (query.kind == ASTDropQuery::Kind::Drop)
        {
@ -464,4 +464,16 @@ void InterpreterDropQuery::executeDropQuery(ASTDropQuery::Kind kind, ContextPtr
    }
 }

+bool InterpreterDropQuery::supportsTransactions() const
+{
+    /// Only a plain TRUNCATE of a table qualifies: no cluster is set, the query is not
+    /// for a temporary table, its kind is Truncate and a table is specified.
+    /// The storage engine itself is not examined in this function.
+    const auto & drop_query = query_ptr->as<ASTDropQuery &>();
+
+    if (!drop_query.cluster.empty() || drop_query.temporary)
+        return false;
+
+    return drop_query.kind == ASTDropQuery::Kind::Truncate && drop_query.table;
+}
+
 }
--- a/src/Interpreters/InterpreterDropQuery.h
+++ b/src/Interpreters/InterpreterDropQuery.h
@ -28,6 +28,8 @@ public:

    static void executeDropQuery(ASTDropQuery::Kind kind, ContextPtr global_context, ContextPtr current_context, const StorageID & target_table_id, bool sync);

+    bool supportsTransactions() const override;
+
 private:
    AccessRightsElements getRequiredAccessForDDLOnCluster() const;
    ASTPtr query_ptr;
--- a/src/Interpreters/InterpreterExternalDDLQuery.cpp
+++ b/src/Interpreters/InterpreterExternalDDLQuery.cpp
@ -13,6 +13,7 @@
 #    include <Interpreters/MySQL/InterpretersMySQLDDLQuery.h>
 #    include <Parsers/MySQL/ASTAlterQuery.h>
 #    include <Parsers/MySQL/ASTCreateQuery.h>
+#    include <Parsers/MySQL/ASTDropQuery.h>
 #endif

 namespace DB
@ -44,7 +45,7 @@ BlockIO InterpreterExternalDDLQuery::execute()
        if (arguments.size() != 2 || !arguments[0]->as<ASTIdentifier>() || !arguments[1]->as<ASTIdentifier>())
            throw Exception("MySQL External require two identifier arguments.", ErrorCodes::BAD_ARGUMENTS);

-        if (external_ddl_query.external_ddl->as<ASTDropQuery>())
+        if (external_ddl_query.external_ddl->as<MySQLParser::ASTDropQuery>())
            return MySQLInterpreter::InterpreterMySQLDropQuery(
                external_ddl_query.external_ddl, getContext(), getIdentifierName(arguments[0]),
                getIdentifierName(arguments[1])).execute();
--- a/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/src/Interpreters/InterpreterSelectQuery.cpp
@ -72,25 +72,27 @@

 #include <Storages/IStorage.h>
 #include <Storages/MergeTree/MergeTreeWhereOptimizer.h>
+#include <Storages/StorageDistributed.h>
 #include <Storages/StorageValues.h>
 #include <Storages/StorageView.h>

-#include <Functions/IFunction.h>
+#include <Columns/Collator.h>
+#include <Core/ColumnNumbers.h>
 #include <Core/Field.h>
 #include <Core/ProtocolDefines.h>
-#include <base/types.h>
-#include <base/sort.h>
-#include <Columns/Collator.h>
-#include <Common/FieldVisitorsAccurateComparison.h>
-#include <Common/FieldVisitorToString.h>
-#include <Common/typeid_cast.h>
-#include <Common/checkStackSize.h>
-#include <Core/ColumnNumbers.h>
+#include <Functions/IFunction.h>
 #include <Interpreters/Aggregator.h>
+#include <Interpreters/Cluster.h>
 #include <Interpreters/IJoin.h>
 #include <QueryPipeline/SizeLimits.h>
 #include <base/map.h>
+#include <base/sort.h>
+#include <base/types.h>
+#include <Common/FieldVisitorToString.h>
+#include <Common/FieldVisitorsAccurateComparison.h>
+#include <Common/checkStackSize.h>
 #include <Common/scope_guard_safe.h>
+#include <Common/typeid_cast.h>


 namespace DB
@ -1071,6 +1073,9 @@ static InterpolateDescriptionPtr getInterpolateDescription(

 static SortDescription getSortDescriptionFromGroupBy(const ASTSelectQuery & query)
 {
+    if (!query.groupBy())
+        return {};
+
    SortDescription order_descr;
    order_descr.reserve(query.groupBy()->children.size());

@ -1743,7 +1748,8 @@ static void executeMergeAggregatedImpl(
    const Settings & settings,
    const NamesAndTypesList & aggregation_keys,
    const AggregateDescriptions & aggregates,
-    bool should_produce_results_in_order_of_bucket_number)
+    bool should_produce_results_in_order_of_bucket_number,
+    SortDescription group_by_sort_description)
 {
    auto keys = aggregation_keys.getNames();
    if (has_grouping_sets)
@ -1773,7 +1779,11 @@ static void executeMergeAggregatedImpl(
        settings.distributed_aggregation_memory_efficient && is_remote_storage,
        settings.max_threads,
        settings.aggregation_memory_efficient_merge_threads,
-        should_produce_results_in_order_of_bucket_number);
+        should_produce_results_in_order_of_bucket_number,
+        settings.max_block_size,
+        settings.aggregation_in_order_max_block_bytes,
+        std::move(group_by_sort_description),
+        settings.enable_memory_bound_merging_of_aggregation_results);

    query_plan.addStep(std::move(merging_aggregated));
 }
@ -1837,6 +1847,9 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(
            // Let's just choose the safe option since we don't know the value of `to_stage` here.
            const bool should_produce_results_in_order_of_bucket_number = true;

+            // It is used to determine if we should use memory bound merging strategy. Maybe it makes sense for projections, but so far this case is just left untouched.
+            SortDescription group_by_sort_description;
+
            executeMergeAggregatedImpl(
                query_plan,
                query_info.projection->aggregate_overflow_row,
@ -1846,7 +1859,8 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(
                context_->getSettingsRef(),
                query_info.projection->aggregation_keys,
                query_info.projection->aggregate_descriptions,
-                should_produce_results_in_order_of_bucket_number);
+                should_produce_results_in_order_of_bucket_number,
+                std::move(group_by_sort_description));
        }
    }
 }
@ -2449,6 +2463,26 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac
    else
        group_by_info = nullptr;

+    if (!group_by_info && settings.force_aggregation_in_order)
+    {
+        /// Not the most optimal implementation here, but this branch handles very marginal case.
+
+        group_by_sort_description = getSortDescriptionFromGroupBy(getSelectQuery());
+
+        auto sorting_step = std::make_unique<SortingStep>(
+            query_plan.getCurrentDataStream(),
+            group_by_sort_description,
+            0 /* LIMIT */,
+            SortingStep::Settings(*context),
+            settings.optimize_sorting_by_input_stream_properties);
+        sorting_step->setStepDescription("Enforced sorting for aggregation in order");
+
+        query_plan.addStep(std::move(sorting_step));
+
+        group_by_info = std::make_shared<InputOrderInfo>(
+            group_by_sort_description, group_by_sort_description.size(), 1 /* direction */, 0 /* limit */);
+    }
+
    auto merge_threads = max_streams;
    auto temporary_data_merge_threads = settings.aggregation_memory_efficient_merge_threads
        ? static_cast<size_t>(settings.aggregation_memory_efficient_merge_threads)
@ -2456,8 +2490,8 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac

    bool storage_has_evenly_distributed_read = storage && storage->hasEvenlyDistributedRead();

-    const bool should_produce_results_in_order_of_bucket_number
-        = options.to_stage == QueryProcessingStage::WithMergeableState && settings.distributed_aggregation_memory_efficient;
+    const bool should_produce_results_in_order_of_bucket_number = options.to_stage == QueryProcessingStage::WithMergeableState
+        && (settings.distributed_aggregation_memory_efficient || settings.enable_memory_bound_merging_of_aggregation_results);

    auto aggregating_step = std::make_unique<AggregatingStep>(
        query_plan.getCurrentDataStream(),
@ -2472,7 +2506,8 @@ void InterpreterSelectQuery::executeAggregation(QueryPlan & query_plan, const Ac
        settings.group_by_use_nulls,
        std::move(group_by_info),
        std::move(group_by_sort_description),
-        should_produce_results_in_order_of_bucket_number);
+        should_produce_results_in_order_of_bucket_number,
+        settings.enable_memory_bound_merging_of_aggregation_results);
    query_plan.addStep(std::move(aggregating_step));
 }

@ -2485,8 +2520,14 @@ void InterpreterSelectQuery::executeMergeAggregated(QueryPlan & query_plan, bool
    if (query_info.projection && query_info.projection->desc->type == ProjectionDescription::Type::Aggregate)
        return;

+    const Settings & settings = context->getSettingsRef();
+
+    /// Used to determine if we should use memory bound merging strategy.
+    auto group_by_sort_description
+        = !query_analyzer->useGroupingSetKey() ? getSortDescriptionFromGroupBy(getSelectQuery()) : SortDescription{};
+
    const bool should_produce_results_in_order_of_bucket_number = options.to_stage == QueryProcessingStage::WithMergeableState
-        && context->getSettingsRef().distributed_aggregation_memory_efficient;
+        && (settings.distributed_aggregation_memory_efficient || settings.enable_memory_bound_merging_of_aggregation_results);

    executeMergeAggregatedImpl(
        query_plan,
@ -2497,7 +2538,8 @@ void InterpreterSelectQuery::executeMergeAggregated(QueryPlan & query_plan, bool
        context->getSettingsRef(),
        query_analyzer->aggregationKeys(),
        query_analyzer->aggregates(),
-        should_produce_results_in_order_of_bucket_number);
+        should_produce_results_in_order_of_bucket_number,
+        std::move(group_by_sort_description));
 }


--- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
+++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
@ -560,11 +560,11 @@ std::vector<TableNeededColumns> normalizeColumnNamesExtractNeeded(
                    original_long_name = ident->name();

                size_t count = countTablesWithColumn(tables, short_name);
+                const auto & table = tables[*table_pos];

                /// isValidIdentifierBegin retuired to be consistent with TableJoin::deduplicateAndQualifyColumnNames
                if (count > 1 || aliases.contains(short_name) || !isValidIdentifierBegin(short_name.at(0)))
                {
-                    const auto & table = tables[*table_pos];
                    IdentifierSemantic::setColumnLongName(*ident, table.table); /// table.column -> table_alias.column
                    const auto & unique_long_name = ident->name();

@ -578,6 +578,13 @@ std::vector<TableNeededColumns> normalizeColumnNamesExtractNeeded(
                }
                else
                {
+                    if (!table.hasColumn(short_name))
+                    {
+                        throw Exception(ErrorCodes::UNKNOWN_IDENTIFIER,
+                                        "There's no column '{}' in table '{}'",
+                                        ident->name(),
+                                        table.table.getQualifiedNamePrefix(false));
+                    }
                    ident->setShortName(short_name); /// table.column -> column
                    needed_columns[*table_pos].no_clashes.emplace(short_name);
                }
--- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp
+++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp
@ -6,6 +6,7 @@
 #include <Parsers/ASTIdentifier.h>
 #include <Parsers/ASTAlterQuery.h>
 #include <Parsers/ASTCreateQuery.h>
+#include <Parsers/ASTDropQuery.h>
 #include <Parsers/ASTColumnDeclaration.h>
 #include <Parsers/ASTIndexDeclaration.h>
 #include <Parsers/MySQL/ASTCreateQuery.h>
@ -543,15 +544,29 @@ void InterpreterDropImpl::validate(const InterpreterDropImpl::TQuery & /*query*/
 ASTs InterpreterDropImpl::getRewrittenQueries(
    const InterpreterDropImpl::TQuery & drop_query, ContextPtr context, const String & mapped_to_database, const String & mysql_database)
 {
-    const auto & database_name = resolveDatabase(drop_query.getDatabase(), mysql_database, mapped_to_database, context);
-
-    /// Skip drop database|view|dictionary
-    if (database_name != mapped_to_database || !drop_query.table || drop_query.is_view || drop_query.is_dictionary)
+    /// Rewrites a MySQL DROP/TRUNCATE TABLE statement into one drop query per named table
+    /// that resolves to `mapped_to_database`. Returns an empty list when nothing has to be executed.
+    /// Skip drop database|view|dictionary|others
+    if (drop_query.kind != TQuery::Kind::Table)
        return {};
-
-    ASTPtr rewritten_query = drop_query.clone();
-    rewritten_query->as<ASTDropQuery>()->setDatabase(mapped_to_database);
-    return ASTs{rewritten_query};
+
+    ASTs rewritten_queries;
+    /// Iterate the names in place: they are only read, so no copy of the list is needed.
+    for (const auto & table : drop_query.names)
+    {
+        const auto & database_name = resolveDatabase(table.schema, mysql_database, mapped_to_database, context);
+        /// Tables that resolve to a different database are skipped.
+        if (database_name != mapped_to_database)
+            continue;
+
+        auto rewritten_query = std::make_shared<ASTDropQuery>();
+        rewritten_query->setTable(table.shortName);
+        rewritten_query->setDatabase(mapped_to_database);
+        rewritten_query->kind = drop_query.is_truncate ? ASTDropQuery::Kind::Truncate : ASTDropQuery::Kind::Drop;
+        rewritten_query->is_view = false;
+        /// To avoid failure, IF EXISTS is always set on the rewritten query.
+        rewritten_query->if_exists = true;
+        rewritten_queries.push_back(rewritten_query);
+    }
+    return rewritten_queries;
 }

 void InterpreterRenameImpl::validate(const InterpreterRenameImpl::TQuery & rename_query, ContextPtr /*context*/)
--- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h
+++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.h
@ -2,11 +2,11 @@

 #include <Interpreters/IInterpreter.h>
 #include <Interpreters/executeQuery.h>
-#include <Parsers/ASTDropQuery.h>
 #include <Parsers/ASTRenameQuery.h>
 #include <Parsers/IAST_fwd.h>
 #include <Parsers/MySQL/ASTAlterQuery.h>
 #include <Parsers/MySQL/ASTCreateQuery.h>
+#include <Parsers/MySQL/ASTDropQuery.h>
 #include <Parsers/queryToString.h>
 #include <Parsers/ASTExpressionList.h>

@ -17,7 +17,7 @@ namespace MySQLInterpreter
 {
    struct InterpreterDropImpl
    {
-        using TQuery = ASTDropQuery;
+        using TQuery = MySQLParser::ASTDropQuery;

        static void validate(const TQuery & query, ContextPtr context);

--- a/src/Interpreters/Session.cpp
+++ b/src/Interpreters/Session.cpp
@ -117,6 +117,8 @@ public:
            if (!thread.joinable())
                thread = ThreadFromGlobalPool{&NamedSessionsStorage::cleanThread, this};

+            LOG_TRACE(log, "Create new session with session_id: {}, user_id: {}", key.second, key.first);
+
            return {session, true};
        }
        else
@ -124,6 +126,8 @@ public:
            /// Use existing session.
            const auto & session = it->second;

+            LOG_TEST(log, "Reuse session from storage with session_id: {}, user_id: {}", key.second, key.first);
+
            if (!session.unique())
                throw Exception("Session is locked by a concurrent client.", ErrorCodes::SESSION_IS_LOCKED);
            return {session, false};
@ -173,6 +177,10 @@ private:
                close_times.resize(close_index + 1);
            close_times[close_index].emplace_back(session.key);
        }
+
+        LOG_TEST(log, "Schedule closing session with session_id: {}, user_id: {}",
+                 session.key.second, session.key.first);
+
    }

    void cleanThread()
@ -214,12 +222,17 @@ private:
            {
                if (!session->second.unique())
                {
+                    LOG_TEST(log, "Delay closing session with session_id: {}, user_id: {}", key.second, key.first);
+
                    /// Skip but move it to close on the next cycle.
                    session->second->timeout = std::chrono::steady_clock::duration{0};
                    scheduleCloseSession(*session->second, lock);
                }
                else
+                {
+                    LOG_TRACE(log, "Close session with session_id: {}, user_id: {}", key.second, key.first);
                    sessions.erase(session);
+                }
            }
        }

@ -231,6 +244,8 @@ private:
    std::condition_variable cond;
    ThreadFromGlobalPool thread;
    bool quit = false;
+
+    Poco::Logger * log = &Poco::Logger::get("NamedSessionsStorage");
 };


@ -257,11 +272,6 @@ Session::Session(const ContextPtr & global_context_, ClientInfo::Interface inter

 Session::~Session()
 {
-    LOG_DEBUG(log, "{} Destroying {}",
-        toString(auth_id),
-        (named_session ? "named session '" + named_session->key.second + "'" : "unnamed session")
-    );
-
    /// Early release a NamedSessionData.
    if (named_session)
        named_session->release();
--- a/src/Interpreters/TransactionVersionMetadata.cpp
+++ b/src/Interpreters/TransactionVersionMetadata.cpp
@ -243,6 +243,9 @@ bool VersionMetadata::canBeRemoved()
    {
        /// Avoid access to Transaction log if transactions are not involved

+        if (creation_csn.load(std::memory_order_relaxed) == Tx::RolledBackCSN)
+            return true;
+
        TIDHash removal_lock = removal_tid_lock.load(std::memory_order_relaxed);
        if (!removal_lock)
            return false;
@ -380,8 +383,9 @@ void VersionMetadata::read(ReadBuffer & buf)

        if (name == CREATION_CSN_STR)
        {
-            chassert(!creation_csn);
-            creation_csn = read_csn();
+            auto new_val = read_csn();
+            chassert(!creation_csn || (creation_csn == new_val && creation_csn == Tx::PrehistoricCSN));
+            creation_csn = new_val;
        }
        else if (name == REMOVAL_TID_STR)
        {
--- a/src/Interpreters/executeQuery.cpp
+++ b/src/Interpreters/executeQuery.cpp
@ -592,13 +592,12 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
                quota->checkExceeded(QuotaType::ERRORS);
            }

-            queue->push(ast, context);
+            auto insert_future = queue->push(ast, context);

            if (settings.wait_for_async_insert)
            {
                auto timeout = settings.wait_for_async_insert_timeout.totalMilliseconds();
-                auto query_id = context->getCurrentQueryId();
-                auto source = std::make_shared<WaitForAsyncInsertSource>(query_id, timeout, *queue);
+                auto source = std::make_shared<WaitForAsyncInsertSource>(std::move(insert_future), timeout);
                res.pipeline = QueryPipeline(Pipe(std::move(source)));
            }

--- a/src/Parsers/MySQL/ASTDropQuery.cpp
+++ b/src/Parsers/MySQL/ASTDropQuery.cpp
@ -0,0 +1,119 @@
+#include <Parsers/MySQL/ASTDropQuery.h>
+
+#include <Parsers/ASTIdentifier.h>
+#include <Parsers/CommonParsers.h>
+#include <Parsers/ExpressionElementParsers.h>
+#include <Parsers/parseDatabaseAndTableName.h>
+#include <Parsers/ExpressionListParsers.h>
+
+namespace DB
+{
+
+namespace MySQLParser
+{
+
+/// Returns a copy of this node that carries the same kind, names and flags
+/// but an empty child list.
+ASTPtr ASTDropQuery::clone() const
+{
+    /// The implicit copy constructor already duplicates every data member
+    /// (including if_exists and is_truncate), so they need no re-assignment;
+    /// only the copied child list has to be dropped.
+    auto res = std::make_shared<ASTDropQuery>(*this);
+    res->children.clear();
+    return res;
+}
+
+/// Parses a MySQL `TRUNCATE TABLE ...` or `DROP ...` statement into an ASTDropQuery.
+///
+/// Forms accepted by the code below:
+///   TRUNCATE TABLE [db.]table
+///   DROP DATABASE [IF EXISTS] name
+///   DROP {VIEW | TABLE} [IF EXISTS] [db.]name [, ...]
+///   DROP INDEX index_name ON [IF EXISTS] [db.]name [, ...]
+///   DROP {EVENT | FUNCTION | SERVER | TRIGGER} [IF EXISTS] [db.]name [, ...]
+///
+/// Returns false as soon as the input stops matching one of these forms.
+/// On success `node` holds the ASTDropQuery with kind, names, if_exists and is_truncate filled in.
+bool ParserDropQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected & expected)
+{
+    ParserKeyword s_drop("DROP");
+    ParserKeyword s_truncate("TRUNCATE");
+    ParserKeyword s_table("TABLE");
+    ParserKeyword s_database("DATABASE");
+    ParserKeyword s_if_exists("IF EXISTS");
+    ParserKeyword s_view("VIEW");
+    ParserKeyword on("ON");
+    ParserIdentifier name_p(false);
+
+    ParserKeyword s_event("EVENT");
+    ParserKeyword s_function("FUNCTION");
+    ParserKeyword s_index("INDEX");
+    ParserKeyword s_server("SERVER");
+    ParserKeyword s_trigger("TRIGGER");
+
+    auto query = std::make_shared<ASTDropQuery>();
+    node = query;
+    /// Results are gathered in locals and handed over to the query only once parsing has succeeded.
+    ASTDropQuery::QualifiedNames names;
+    bool if_exists = false;
+    bool is_truncate = false;
+
+    if (s_truncate.ignore(pos, expected) && s_table.ignore(pos, expected))
+    {
+        /// TRUNCATE TABLE takes exactly one (optionally qualified) table name.
+        is_truncate = true;
+        query->kind = ASTDropQuery::Kind::Table;
+        ASTDropQuery::QualifiedName name;
+        if (!parseDatabaseAndTableName(pos, expected, name.schema, name.shortName))
+            return false;
+        /// `name` is not used again, so move it instead of copying two strings.
+        names.emplace_back(std::move(name));
+    }
+    else if (s_drop.ignore(pos, expected))
+    {
+        if (s_database.ignore(pos, expected))
+        {
+            query->kind = ASTDropQuery::Kind::Database;
+            if (s_if_exists.ignore(pos, expected))
+                if_exists = true;
+            /// The database name is consumed to validate the syntax but is not stored in `names`.
+            ASTPtr database;
+            if (!name_p.parse(pos, database, expected))
+                return false;
+        }
+        else
+        {
+            if (s_view.ignore(pos, expected))
+                query->kind = ASTDropQuery::Kind::View;
+            else if (s_table.ignore(pos, expected))
+                query->kind = ASTDropQuery::Kind::Table;
+            else if (s_index.ignore(pos, expected))
+            {
+                /// DROP INDEX <index> ON ...: the index name itself is parsed and discarded.
+                ASTPtr index;
+                query->kind = ASTDropQuery::Kind::Index;
+                if (!(name_p.parse(pos, index, expected) && on.ignore(pos, expected)))
+                    return false;
+            }
+            else if (s_event.ignore(pos, expected) || s_function.ignore(pos, expected) || s_server.ignore(pos, expected)
+                || s_trigger.ignore(pos, expected))
+            {
+                query->kind = ASTDropQuery::Kind::Other;
+            }
+            else
+                return false;
+
+            if (s_if_exists.ignore(pos, expected))
+                if_exists = true;
+
+            /// Parse the comma-separated list of (optionally qualified) object names.
+            auto parse_element = [&]
+            {
+                ASTDropQuery::QualifiedName element;
+                if (parseDatabaseAndTableName(pos, expected, element.schema, element.shortName))
+                {
+                    names.emplace_back(std::move(element));
+                    return true;
+                }
+                return false;
+            };
+
+            if (!ParserList::parseUtil(pos, expected, parse_element, false))
+                return false;
+        }
+    }
+    else
+        return false;
+
+    query->if_exists = if_exists;
+    /// `names` is dead after this point; moving avoids copying the whole vector of strings.
+    query->names = std::move(names);
+    query->is_truncate = is_truncate;
+
+    return true;
+}
+
+}
+
+}
--- a/src/Parsers/MySQL/ASTDropQuery.h
+++ b/src/Parsers/MySQL/ASTDropQuery.h
@ -0,0 +1,64 @@
+#pragma once
+
+#include <Parsers/IParserBase.h>
+#include <Parsers/MySQL/ASTDeclareIndex.h>
+#include <Parsers/MySQL/ASTDeclareColumn.h>
+#include <Parsers/MySQL/ASTDeclareTableOptions.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+namespace MySQLParser
+{
+
+/// AST node produced by MySQLParser::ParserDropQuery for MySQL DROP / TRUNCATE statements.
+class ASTDropQuery : public IAST
+{
+public:
+    /// Kind of object the statement targets.
+    enum Kind
+    {
+        Table,
+        View,
+        Database,
+        Index,
+        /// TRIGGER, FUNCTION, EVENT and so on; there is no need to support them.
+        Other,
+    };
+    /// ParserDropQuery assigns this on every successful parse. It is default-initialised like the
+    /// flags below so that a freshly constructed (or cloned) node never holds an indeterminate value.
+    Kind kind{Other};
+    /// Name pair filled by parseDatabaseAndTableName
+    /// (presumably schema = database part, shortName = object part; confirm against that helper).
+    struct QualifiedName
+    {
+        String schema;
+        String shortName;
+    };
+
+    using QualifiedNames = std::vector<QualifiedName>;
+    /// Names collected from the statement's name list.
+    QualifiedNames names;
+    bool if_exists{false};
+    /// True for TRUNCATE, false for DROP.
+    bool is_truncate{false};
+
+    ASTPtr clone() const override;
+    String getID(char /*delim*/) const override {return "ASTDropQuery" ;}
+
+protected:
+    /// Formatting back to text is not supported for this node; the call always throws NOT_IMPLEMENTED.
+    void formatImpl(const FormatSettings & /*settings*/, FormatState & /*state*/, FormatStateStacked /*frame*/) const override
+    {
+        throw Exception("Method formatImpl is not supported by MySQLParser::ASTDropQuery.", ErrorCodes::NOT_IMPLEMENTED);
+    }
+};
+
+/// Parser for MySQL DROP / TRUNCATE statements; on success it yields an ASTDropQuery.
+class ParserDropQuery : public IParserBase
+{
+protected:
+    /// Human-readable name of this parser.
+    const char * getName() const override { return "DROP query"; }
+
+    /// Implemented in ASTDropQuery.cpp.
+    bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override;
+};
+
+}
+
+}
--- a/src/Parsers/ParserExternalDDLQuery.cpp
+++ b/src/Parsers/ParserExternalDDLQuery.cpp
@ -11,6 +11,7 @@
 #if USE_MYSQL
 #    include <Parsers/MySQL/ASTAlterQuery.h>
 #    include <Parsers/MySQL/ASTCreateQuery.h>
+#    include <Parsers/MySQL/ASTDropQuery.h>
 #endif

 namespace DB
@ -43,7 +44,7 @@ bool ParserExternalDDLQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expect
    if (external_ddl_query->from->name == "MySQL")
    {
 #if USE_MYSQL
-        ParserDropQuery p_drop_query;
+        MySQLParser::ParserDropQuery p_drop_query;
        ParserRenameQuery p_rename_query;
        MySQLParser::ParserAlterQuery p_alter_query;
        MySQLParser::ParserCreateQuery p_create_query;
--- a/src/Planner/Planner.cpp
+++ b/src/Planner/Planner.cpp
@ -495,7 +495,8 @@ void Planner::buildQueryPlanIfNeeded()
            settings.group_by_use_nulls,
            std::move(input_order_info),
            std::move(group_by_sort_description),
-            should_produce_results_in_order_of_bucket_number);
+            should_produce_results_in_order_of_bucket_number,
+            settings.enable_memory_bound_merging_of_aggregation_results);
        query_plan.addStep(std::move(aggregating_step));

        if (query_node.isGroupByWithTotals())
--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
@ -324,14 +324,31 @@ static ColumnPtr readOffsetsFromArrowListColumn(std::shared_ptr<arrow::ChunkedAr
    ColumnArray::Offsets & offsets_data = assert_cast<ColumnVector<UInt64> &>(*offsets_column).getData();
    offsets_data.reserve(arrow_column->length());

+    uint64_t start_offset = 0u;
+
    for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
        auto arrow_offsets_array = list_chunk.offsets();
        auto & arrow_offsets = dynamic_cast<arrow::Int32Array &>(*arrow_offsets_array);
-        auto start = offsets_data.back();
+
+        /*
+         * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks.
+         * When it is shared, the offsets will be monotonically increasing. Otherwise, the offsets will be zero based.
+         * In order to account for both cases, the starting offset is updated whenever a zero-based offset is found.
+         * More info can be found in: https://lists.apache.org/thread/rrwfb9zo2dc58dhd9rblf20xd7wmy7jm and
+         * https://github.com/ClickHouse/ClickHouse/pull/43297
+         * */
+        if (list_chunk.offset() == 0)
+        {
+            start_offset = offsets_data.back();
+        }
+
        for (int64_t i = 1; i < arrow_offsets.length(); ++i)
-            offsets_data.emplace_back(start + arrow_offsets.Value(i));
+        {
+            auto offset = arrow_offsets.Value(i);
+            offsets_data.emplace_back(start_offset + offset);
+        }
    }
    return offsets_column;
 }
@ -467,8 +484,23 @@ static std::shared_ptr<arrow::ChunkedArray> getNestedArrowColumn(std::shared_ptr
    for (int chunk_i = 0, num_chunks = arrow_column->num_chunks(); chunk_i < num_chunks; ++chunk_i)
    {
        arrow::ListArray & list_chunk = dynamic_cast<arrow::ListArray &>(*(arrow_column->chunk(chunk_i)));
-        std::shared_ptr<arrow::Array> chunk = list_chunk.values();
-        array_vector.emplace_back(std::move(chunk));
+
+        /*
+         * It seems like arrow::ListArray::values() (nested column data) might or might not be shared across chunks.
+         * Therefore, simply appending arrow::ListArray::values() could lead to duplicated data to be appended.
+         * To properly handle this, arrow::ListArray::values() needs to be sliced based on the chunk offsets.
+         * arrow::ListArray::Flatten does that. More info on: https://lists.apache.org/thread/rrwfb9zo2dc58dhd9rblf20xd7wmy7jm and
+         * https://github.com/ClickHouse/ClickHouse/pull/43297
+         * */
+        auto flatten_result = list_chunk.Flatten();
+        if (flatten_result.ok())
+        {
+            array_vector.emplace_back(flatten_result.ValueOrDie());
+        }
+        else
+        {
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Failed to flatten chunk '{}' of column of type '{}' ", chunk_i, arrow_column->type()->id());
+        }
    }
    return std::make_shared<arrow::ChunkedArray>(array_vector);
 }
--- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp
@ -55,11 +55,12 @@ void FinishAggregatingInOrderAlgorithm::consume(Input & input, size_t source_num
    if (!info)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk info was not set for chunk in FinishAggregatingInOrderAlgorithm");

-    const auto * arenas_info = typeid_cast<const ChunkInfoWithAllocatedBytes *>(info.get());
-    if (!arenas_info)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk should have ChunkInfoWithAllocatedBytes in FinishAggregatingInOrderAlgorithm");
+    Int64 allocated_bytes = 0;
+    /// Will be set by AggregatingInOrderTransform during local aggregation; will be nullptr during merging on initiator.
+    if (const auto * arenas_info = typeid_cast<const ChunkInfoWithAllocatedBytes *>(info.get()))
+        allocated_bytes = arenas_info->allocated_bytes;

-    states[source_num] = State{input.chunk, description, arenas_info->allocated_bytes};
+    states[source_num] = State{input.chunk, description, allocated_bytes};
 }

 IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge()
@ -130,6 +131,7 @@ Chunk FinishAggregatingInOrderAlgorithm::prepareToMerge()

    auto info = std::make_shared<ChunksToMerge>();
    info->chunks = std::make_unique<Chunks>(std::move(chunks));
+    info->chunk_num = chunk_num++;

    Chunk chunk;
    chunk.setChunkInfo(std::move(info));
--- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
@ -88,6 +88,7 @@ private:
    std::vector<size_t> inputs_to_update;

    std::vector<Chunk> chunks;
+    UInt64 chunk_num = 0;
    size_t accumulated_rows = 0;
    size_t accumulated_bytes = 0;
 };
--- a/src/Processors/QueryPlan/AggregatingStep.cpp
+++ b/src/Processors/QueryPlan/AggregatingStep.cpp
@ -1,34 +1,45 @@
 #include <cassert>
 #include <cstddef>
 #include <memory>
-#include <Processors/QueryPlan/AggregatingStep.h>
-#include <QueryPipeline/QueryPipelineBuilder.h>
-#include <Processors/Transforms/CopyTransform.h>
-#include <Processors/Transforms/AggregatingTransform.h>
-#include <Processors/Transforms/AggregatingInOrderTransform.h>
-#include <Processors/Transforms/MergingAggregatedMemoryEfficientTransform.h>
-#include <Processors/Transforms/ExpressionTransform.h>
+#include <Columns/ColumnFixedString.h>
+#include <DataTypes/DataTypeFixedString.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Functions/FunctionFactory.h>
+#include <Interpreters/Aggregator.h>
+#include <Interpreters/Context.h>
 #include <Processors/Merges/AggregatingSortedTransform.h>
 #include <Processors/Merges/FinishAggregatingInOrderTransform.h>
-#include <Interpreters/Aggregator.h>
-#include <Functions/FunctionFactory.h>
+#include <Processors/QueryPlan/AggregatingStep.h>
 #include <Processors/QueryPlan/IQueryPlanStep.h>
-#include <Columns/ColumnFixedString.h>
-#include <DataTypes/DataTypesNumber.h>
-#include <DataTypes/DataTypeFixedString.h>
+#include <Processors/Transforms/AggregatingInOrderTransform.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+#include <Processors/Transforms/CopyTransform.h>
+#include <Processors/Transforms/ExpressionTransform.h>
+#include <Processors/Transforms/MemoryBoundMerging.h>
+#include <Processors/Transforms/MergingAggregatedMemoryEfficientTransform.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>

 namespace DB
 {

-static ITransformingStep::Traits getTraits(bool should_produce_results_in_order_of_bucket_number)
+/// Memory-bound merging is used only when results must come out in bucket order,
+/// the corresponding setting is enabled and in-order aggregation info is present.
+static bool memoryBoundMergingWillBeUsed(
+    bool should_produce_results_in_order_of_bucket_number,
+    bool memory_bound_merging_of_aggregation_results_enabled,
+    InputOrderInfoPtr group_by_info)
+{
+    if (!should_produce_results_in_order_of_bucket_number || !memory_bound_merging_of_aggregation_results_enabled)
+        return false;
+
+    return group_by_info != nullptr;
+}
+
+static ITransformingStep::Traits getTraits(bool should_produce_results_in_order_of_bucket_number, bool memory_bound_merging_will_be_used)
 {
    return ITransformingStep::Traits
    {
        {
            .preserves_distinct_columns = false, /// Actually, we may check that distinct names are in aggregation keys
-            .returns_single_stream = should_produce_results_in_order_of_bucket_number, /// Actually, may also return single stream if should_produce_results_in_order_of_bucket_number = false
+            .returns_single_stream = should_produce_results_in_order_of_bucket_number || memory_bound_merging_will_be_used,
            .preserves_number_of_streams = false,
            .preserves_sorting = false,
+            .can_enforce_sorting_properties_in_distributed_query = memory_bound_merging_will_be_used,
        },
        {
            .preserves_number_of_rows = false,
@ -88,9 +99,16 @@ AggregatingStep::AggregatingStep(
    bool group_by_use_nulls_,
    InputOrderInfoPtr group_by_info_,
    SortDescription group_by_sort_description_,
-    bool should_produce_results_in_order_of_bucket_number_)
+    bool should_produce_results_in_order_of_bucket_number_,
+    bool memory_bound_merging_of_aggregation_results_enabled_)
    : ITransformingStep(
-        input_stream_, appendGroupingColumn(params_.getHeader(input_stream_.header, final_), params_.keys, grouping_sets_params_, group_by_use_nulls_), getTraits(should_produce_results_in_order_of_bucket_number_), false)
+        input_stream_,
+        appendGroupingColumn(params_.getHeader(input_stream_.header, final_), params_.keys, grouping_sets_params_, group_by_use_nulls_),
+        getTraits(
+            should_produce_results_in_order_of_bucket_number_,
+            DB::memoryBoundMergingWillBeUsed(
+                should_produce_results_in_order_of_bucket_number_, memory_bound_merging_of_aggregation_results_enabled_, group_by_info_)),
+        false)
    , params(std::move(params_))
    , grouping_sets_params(std::move(grouping_sets_params_))
    , final(final_)
@ -103,7 +121,13 @@ AggregatingStep::AggregatingStep(
    , group_by_info(std::move(group_by_info_))
    , group_by_sort_description(std::move(group_by_sort_description_))
    , should_produce_results_in_order_of_bucket_number(should_produce_results_in_order_of_bucket_number_)
+    , memory_bound_merging_of_aggregation_results_enabled(memory_bound_merging_of_aggregation_results_enabled_)
 {
+    if (memoryBoundMergingWillBeUsed())
+    {
+        output_stream->sort_description = group_by_sort_description;
+        output_stream->sort_scope = DataStream::SortScope::Global;
+    }
 }

 void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings)
@ -336,10 +360,16 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
            /// Do merge of aggregated data in parallel.
            pipeline.resize(merge_threads);

-            pipeline.addSimpleTransform([&](const Block &)
+            const auto & required_sort_description = memoryBoundMergingWillBeUsed() ? group_by_sort_description : SortDescription{};
+            pipeline.addSimpleTransform(
+                [&](const Block &)
+                { return std::make_shared<MergingAggregatedBucketTransform>(transform_params, required_sort_description); });
+
+            if (memoryBoundMergingWillBeUsed())
            {
-                return std::make_shared<MergingAggregatedBucketTransform>(transform_params);
-            });
+                pipeline.addTransform(
+                    std::make_shared<SortingAggregatedForMemoryBoundMergingTransform>(pipeline.getHeader(), pipeline.getNumStreams()));
+            }

            aggregating_sorted = collector.detachProcessors(1);
        }
@ -380,7 +410,6 @@ void AggregatingStep::transformPipeline(QueryPipelineBuilder & pipeline, const B
            return std::make_shared<AggregatingTransform>(header, transform_params, many_data, counter++, merge_threads, temporary_data_merge_threads);
        });

-        /// We add the explicit resize here, but not in case of aggregating in order, since AIO don't use two-level hash tables and thus returns only buckets with bucket_number = -1.
        pipeline.resize(should_produce_results_in_order_of_bucket_number ? 1 : params.max_threads, true /* force */);

        aggregating = collector.detachProcessors(0);
@ -426,4 +455,17 @@ void AggregatingStep::updateOutputStream()
        getDataStreamTraits());
 }

+/// Switches on, in the given context, the settings listed below
+/// (memory-bound merging plus optimized and forced aggregation in order).
+void AggregatingStep::adjustSettingsToEnforceSortingPropertiesInDistributedQuery(ContextMutablePtr context) const
+{
+    /// Applied in this exact order.
+    static constexpr const char * settings_to_enable[] = {
+        "enable_memory_bound_merging_of_aggregation_results",
+        "optimize_aggregation_in_order",
+        "force_aggregation_in_order",
+    };
+
+    for (const char * setting_name : settings_to_enable)
+        context->setSetting(setting_name, true);
+}
+
+/// Member convenience wrapper: evaluates the file-local DB::memoryBoundMergingWillBeUsed
+/// predicate with this step's own flags and group_by_info.
+bool AggregatingStep::memoryBoundMergingWillBeUsed() const
+{
+    return DB::memoryBoundMergingWillBeUsed(
+        should_produce_results_in_order_of_bucket_number, memory_bound_merging_of_aggregation_results_enabled, group_by_info);
+}
+
 }
--- a/src/Processors/QueryPlan/AggregatingStep.h
+++ b/src/Processors/QueryPlan/AggregatingStep.h
@ -39,7 +39,8 @@ public:
        bool group_by_use_nulls_,
        InputOrderInfoPtr group_by_info_,
        SortDescription group_by_sort_description_,
-        bool should_produce_results_in_order_of_bucket_number_);
+        bool should_produce_results_in_order_of_bucket_number_,
+        bool memory_bound_merging_of_aggregation_results_enabled_);

    String getName() const override { return "Aggregating"; }

@ -52,9 +53,13 @@ public:

    const Aggregator::Params & getParams() const { return params; }

+    void adjustSettingsToEnforceSortingPropertiesInDistributedQuery(ContextMutablePtr context) const override;
+
 private:
    void updateOutputStream() override;

+    bool memoryBoundMergingWillBeUsed() const;
+
    Aggregator::Params params;
    GroupingSetsParamsList grouping_sets_params;
    bool final;
@ -69,9 +74,9 @@ private:
    InputOrderInfoPtr group_by_info;
    SortDescription group_by_sort_description;

-    /// It determines if we should resize pipeline to 1 at the end.
-    /// Needed in case of distributed memory efficient aggregation.
-    const bool should_produce_results_in_order_of_bucket_number;
+    /// These settings are used to determine if we should resize pipeline to 1 at the end.
+    bool should_produce_results_in_order_of_bucket_number;
+    bool memory_bound_merging_of_aggregation_results_enabled;

    Processors aggregating_in_order;
    Processors aggregating_sorted;
--- a/src/Processors/QueryPlan/IQueryPlanStep.h
+++ b/src/Processors/QueryPlan/IQueryPlanStep.h
@ -31,13 +31,13 @@ public:
    /// QueryPipeline has single port. Totals or extremes ports are not counted.
    bool has_single_port = false;

-    /// Sorting scope
+    /// Sorting scope. Please keep the relative order (a stronger mode must have a greater value).
    enum class SortScope
    {
-        None,
-        Chunk, /// Separate chunks are sorted
-        Stream, /// Each data steam is sorted
-        Global, /// Data is globally sorted
+        None   = 0,
+        Chunk  = 1, /// Separate chunks are sorted
+        Stream = 2, /// Each data stream is sorted
+        Global = 3, /// Data is globally sorted
    };

    /// It is not guaranteed that header has columns from sort_description.
--- a/src/Processors/QueryPlan/ITransformingStep.h
+++ b/src/Processors/QueryPlan/ITransformingStep.h
@ -4,6 +4,11 @@
 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
 /// Step which has single input and single output data stream.
 /// It doesn't mean that pipeline has single port before or after such step.
 class ITransformingStep : public IQueryPlanStep
@ -29,6 +34,9 @@ public:
        /// Doesn't change row order.
        /// Examples: true for FilterStep, false for PartialSortingStep
        bool preserves_sorting;
+
+        /// See adjustSettingsToEnforceSortingPropertiesInDistributedQuery().
+        bool can_enforce_sorting_properties_in_distributed_query = false;
    };

    /// This flags are used by QueryPlan optimizers.
@ -73,6 +81,13 @@ public:
    /// Append extra processors for this step.
    void appendExtraProcessors(const Processors & extra_processors);

+    /// Enforcement is supposed to be done through the special settings that will be taken into account by remote nodes during query planning (e.g. force_aggregation_in_order).
+    /// Should be called only if data_stream_traits.can_enforce_sorting_properties_in_distributed_query == true.
+    /// This default implementation always throws NOT_IMPLEMENTED; steps that support enforcement override it.
+    virtual void adjustSettingsToEnforceSortingPropertiesInDistributedQuery(ContextMutablePtr) const
+    {
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Not implemented");
+    }
+
 protected:
    /// Clear distinct_columns if res_header doesn't contain all of them.
    static void updateDistinctColumns(const Block & res_header, NameSet & distinct_columns);
--- a/src/Processors/QueryPlan/MergingAggregatedStep.cpp
+++ b/src/Processors/QueryPlan/MergingAggregatedStep.cpp
@ -1,13 +1,25 @@
+#include <Interpreters/Context.h>
+#include <Processors/Merges/FinishAggregatingInOrderTransform.h>
 #include <Processors/QueryPlan/MergingAggregatedStep.h>
-#include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Processors/Transforms/AggregatingTransform.h>
-#include <Processors/Transforms/MergingAggregatedTransform.h>
+#include <Processors/Transforms/MemoryBoundMerging.h>
 #include <Processors/Transforms/MergingAggregatedMemoryEfficientTransform.h>
+#include <Processors/Transforms/MergingAggregatedTransform.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>

 namespace DB
 {

-static ITransformingStep::Traits getTraits(bool should_produce_results_in_order_of_bucket_number)
+/// Memory-bound merging applies only when the setting is on, a GROUP BY sort description exists,
+/// and the input is sorted at least per stream by a description that starts with that GROUP BY description.
+static bool memoryBoundMergingWillBeUsed(
+    const DataStream & input_stream,
+    bool memory_bound_merging_of_aggregation_results_enabled,
+    const SortDescription & group_by_sort_description)
+{
+    if (!memory_bound_merging_of_aggregation_results_enabled || group_by_sort_description.empty())
+        return false;
+
+    if (input_stream.sort_scope < DataStream::SortScope::Stream)
+        return false;
+
+    return input_stream.sort_description.hasPrefix(group_by_sort_description);
+}
+
+static ITransformingStep::Traits getTraits(bool should_produce_results_in_order_of_bucket_number, bool memory_bound_merging_will_be_used)
 {
    return ITransformingStep::Traits
    {
@ -16,6 +28,7 @@ static ITransformingStep::Traits getTraits(bool should_produce_results_in_order_
            .returns_single_stream = should_produce_results_in_order_of_bucket_number,
            .preserves_number_of_streams = false,
            .preserves_sorting = false,
+            .can_enforce_sorting_properties_in_distributed_query = memory_bound_merging_will_be_used,
        },
        {
            .preserves_number_of_rows = false,
@ -30,24 +43,74 @@ MergingAggregatedStep::MergingAggregatedStep(
    bool memory_efficient_aggregation_,
    size_t max_threads_,
    size_t memory_efficient_merge_threads_,
-    bool should_produce_results_in_order_of_bucket_number_)
+    bool should_produce_results_in_order_of_bucket_number_,
+    size_t max_block_size_,
+    size_t memory_bound_merging_max_block_bytes_,
+    SortDescription group_by_sort_description_,
+    bool memory_bound_merging_of_aggregation_results_enabled_)
    : ITransformingStep(
-        input_stream_, params_.getHeader(input_stream_.header, final_), getTraits(should_produce_results_in_order_of_bucket_number_))
+        input_stream_,
+        params_.getHeader(input_stream_.header, final_),
+        getTraits(
+            should_produce_results_in_order_of_bucket_number_,
+            DB::memoryBoundMergingWillBeUsed(
+                input_stream_, memory_bound_merging_of_aggregation_results_enabled_, group_by_sort_description_)))
    , params(std::move(params_))
    , final(final_)
    , memory_efficient_aggregation(memory_efficient_aggregation_)
    , max_threads(max_threads_)
    , memory_efficient_merge_threads(memory_efficient_merge_threads_)
+    , max_block_size(max_block_size_)
+    , memory_bound_merging_max_block_bytes(memory_bound_merging_max_block_bytes_)
+    , group_by_sort_description(std::move(group_by_sort_description_))
    , should_produce_results_in_order_of_bucket_number(should_produce_results_in_order_of_bucket_number_)
+    , memory_bound_merging_of_aggregation_results_enabled(memory_bound_merging_of_aggregation_results_enabled_)
 {
    /// Aggregation keys are distinct
    for (const auto & key : params.keys)
        output_stream->distinct_columns.insert(key);
+
+    if (memoryBoundMergingWillBeUsed() && should_produce_results_in_order_of_bucket_number)
+    {
+        output_stream->sort_description = group_by_sort_description;
+        output_stream->sort_scope = DataStream::SortScope::Global;
+    }
 }

 void MergingAggregatedStep::transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
 {
    auto transform_params = std::make_shared<AggregatingTransformParams>(pipeline.getHeader(), std::move(params), final);
+
+    if (memoryBoundMergingWillBeUsed())
+    {
+        auto transform = std::make_shared<FinishAggregatingInOrderTransform>(
+            pipeline.getHeader(),
+            pipeline.getNumStreams(),
+            transform_params,
+            group_by_sort_description,
+            max_block_size,
+            memory_bound_merging_max_block_bytes);
+
+        pipeline.addTransform(std::move(transform));
+
+        /// Do merge of aggregated data in parallel.
+        pipeline.resize(max_threads);
+
+        const auto & required_sort_description
+            = should_produce_results_in_order_of_bucket_number ? group_by_sort_description : SortDescription{};
+
+        pipeline.addSimpleTransform(
+            [&](const Block &) { return std::make_shared<MergingAggregatedBucketTransform>(transform_params, required_sort_description); });
+
+        if (should_produce_results_in_order_of_bucket_number)
+        {
+            pipeline.addTransform(
+                std::make_shared<SortingAggregatedForMemoryBoundMergingTransform>(pipeline.getHeader(), pipeline.getNumStreams()));
+        }
+
+        return;
+    }
+
    if (!memory_efficient_aggregation)
    {
        /// We union several sources into one, paralleling the work.
@ -88,5 +151,14 @@ void MergingAggregatedStep::updateOutputStream()
        output_stream->distinct_columns.insert(key);
 }

-
+/// Turns on enable_memory_bound_merging_of_aggregation_results in the given context.
+void MergingAggregatedStep::adjustSettingsToEnforceSortingPropertiesInDistributedQuery(ContextMutablePtr context) const
+{
+    context->setSetting("enable_memory_bound_merging_of_aggregation_results", true);
+}
+
+/// Member convenience wrapper: evaluates the file-local DB::memoryBoundMergingWillBeUsed
+/// predicate against the step's first input stream, its setting flag and GROUP BY sort description.
+bool MergingAggregatedStep::memoryBoundMergingWillBeUsed() const
+{
+    return DB::memoryBoundMergingWillBeUsed(
+        input_streams.front(), memory_bound_merging_of_aggregation_results_enabled, group_by_sort_description);
+}
 }
--- a/src/Processors/QueryPlan/MergingAggregatedStep.h
+++ b/src/Processors/QueryPlan/MergingAggregatedStep.h
@ -20,7 +20,11 @@ public:
        bool memory_efficient_aggregation_,
        size_t max_threads_,
        size_t memory_efficient_merge_threads_,
-        bool should_produce_results_in_order_of_bucket_number_);
+        bool should_produce_results_in_order_of_bucket_number_,
+        size_t max_block_size_,
+        size_t memory_bound_merging_max_block_bytes_,
+        SortDescription group_by_sort_description_,
+        bool memory_bound_merging_of_aggregation_results_enabled_);

    String getName() const override { return "MergingAggregated"; }

@ -29,18 +33,25 @@ public:
    void describeActions(JSONBuilder::JSONMap & map) const override;
    void describeActions(FormatSettings & settings) const override;

+    void adjustSettingsToEnforceSortingPropertiesInDistributedQuery(ContextMutablePtr context) const override;
+
 private:
    void updateOutputStream() override;

+    bool memoryBoundMergingWillBeUsed() const;
+
    Aggregator::Params params;
    bool final;
    bool memory_efficient_aggregation;
    size_t max_threads;
    size_t memory_efficient_merge_threads;
+    const size_t max_block_size;
+    const size_t memory_bound_merging_max_block_bytes;
+    const SortDescription group_by_sort_description;

-    /// It determines if we should resize pipeline to 1 at the end.
-    /// Needed in case of distributed memory efficient aggregation over distributed table.
+    /// These settings are used to determine if we should resize pipeline to 1 at the end.
    const bool should_produce_results_in_order_of_bucket_number;
+    const bool memory_bound_merging_of_aggregation_results_enabled;
 };

 }
--- a/src/Processors/QueryPlan/ReadFromRemote.cpp
+++ b/src/Processors/QueryPlan/ReadFromRemote.cpp
@ -76,7 +76,9 @@ ReadFromRemote::ReadFromRemote(
    Tables external_tables_,
    Poco::Logger * log_,
    UInt32 shard_count_,
-    std::shared_ptr<const StorageLimitsList> storage_limits_)
+    std::shared_ptr<const StorageLimitsList> storage_limits_,
+    SortDescription output_sort_description_,
+    DataStream::SortScope output_sort_scope_)
    : ISourceStep(DataStream{.header = std::move(header_)})
    , shards(std::move(shards_))
    , stage(stage_)
@ -90,6 +92,8 @@ ReadFromRemote::ReadFromRemote(
    , log(log_)
    , shard_count(shard_count_)
 {
+    output_stream->sort_description = std::move(output_sort_description_);
+    output_stream->sort_scope = output_sort_scope_;
 }

 void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard)
@ -239,7 +243,9 @@ ReadFromParallelRemoteReplicasStep::ReadFromParallelRemoteReplicasStep(
    Scalars scalars_,
    Tables external_tables_,
    Poco::Logger * log_,
-    std::shared_ptr<const StorageLimitsList> storage_limits_)
+    std::shared_ptr<const StorageLimitsList> storage_limits_,
+    SortDescription output_sort_description_,
+    DataStream::SortScope output_sort_scope_)
    : ISourceStep(DataStream{.header = std::move(header_)})
    , coordinator(std::move(coordinator_))
    , shard(std::move(shard_))
@ -260,6 +266,9 @@ ReadFromParallelRemoteReplicasStep::ReadFromParallelRemoteReplicasStep(
            description.push_back(fmt::format("Replica: {}", address.host_name));

    setStepDescription(boost::algorithm::join(description, ", "));
+
+    output_stream->sort_description = std::move(output_sort_description_);
+    output_stream->sort_scope = output_sort_scope_;
 }


--- a/src/Processors/QueryPlan/ReadFromRemote.h
+++ b/src/Processors/QueryPlan/ReadFromRemote.h
@ -33,7 +33,9 @@ public:
        Tables external_tables_,
        Poco::Logger * log_,
        UInt32 shard_count_,
-        std::shared_ptr<const StorageLimitsList> storage_limits_);
+        std::shared_ptr<const StorageLimitsList> storage_limits_,
+        SortDescription output_sort_description_,
+        DataStream::SortScope output_sort_scope_);

    String getName() const override { return "ReadFromRemote"; }

@ -83,7 +85,9 @@ public:
        Scalars scalars_,
        Tables external_tables_,
        Poco::Logger * log_,
-        std::shared_ptr<const StorageLimitsList> storage_limits_);
+        std::shared_ptr<const StorageLimitsList> storage_limits_,
+        SortDescription output_sort_description_,
+        DataStream::SortScope output_sort_scope_);

    String getName() const override { return "ReadFromRemoteParallelReplicas"; }

--- a/src/Processors/QueryPlan/UnionStep.cpp
+++ b/src/Processors/QueryPlan/UnionStep.cpp
@ -1,8 +1,9 @@
+#include <type_traits>
+#include <Interpreters/ExpressionActions.h>
 #include <Processors/QueryPlan/UnionStep.h>
-#include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Processors/Sources/NullSource.h>
 #include <Processors/Transforms/ExpressionTransform.h>
-#include <Interpreters/ExpressionActions.h>
+#include <QueryPipeline/QueryPipelineBuilder.h>
 #include <base/defines.h>

 namespace DB
@ -35,6 +36,22 @@ UnionStep::UnionStep(DataStreams input_streams_, size_t max_threads_)
        output_stream = input_streams.front();
    else
        output_stream = DataStream{.header = header};
+
+    SortDescription common_sort_description = input_streams.front().sort_description;
+    DataStream::SortScope sort_scope = input_streams.front().sort_scope;
+    for (const auto & input_stream : input_streams)
+    {
+        common_sort_description = commonPrefix(common_sort_description, input_stream.sort_description);
+        sort_scope = std::min(sort_scope, input_stream.sort_scope);
+    }
+    if (!common_sort_description.empty() && sort_scope >= DataStream::SortScope::Chunk)
+    {
+        output_stream->sort_description = common_sort_description;
+        if (sort_scope == DataStream::SortScope::Global && input_streams.size() > 1)
+            output_stream->sort_scope = DataStream::SortScope::Stream;
+        else
+            output_stream->sort_scope = sort_scope;
+    }
 }

 QueryPipelineBuilderPtr UnionStep::updatePipeline(QueryPipelineBuilders pipelines, const BuildQueryPipelineSettings &)
--- a/src/Processors/Sources/WaitForAsyncInsertSource.h
+++ b/src/Processors/Sources/WaitForAsyncInsertSource.h
@ -6,18 +6,24 @@
 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int TIMEOUT_EXCEEDED;
+    extern const int LOGICAL_ERROR;
+}
+
 /// Source, that allow to wait until processing of
 /// asynchronous insert for specified query_id will be finished.
 class WaitForAsyncInsertSource : public ISource, WithContext
 {
 public:
    WaitForAsyncInsertSource(
-        const String & query_id_, size_t timeout_ms_, AsynchronousInsertQueue & queue_)
+        std::future<void> insert_future_, size_t timeout_ms_)
        : ISource(Block())
-        , query_id(query_id_)
+        , insert_future(std::move(insert_future_))
        , timeout_ms(timeout_ms_)
-        , queue(queue_)
    {
+        assert(insert_future.valid());
    }

    String getName() const override { return "WaitForAsyncInsert"; }
@ -25,14 +31,20 @@ public:
 protected:
    Chunk generate() override
    {
-        queue.waitForProcessingQuery(query_id, std::chrono::milliseconds(timeout_ms));
+        auto status = insert_future.wait_for(std::chrono::milliseconds(timeout_ms));
+        if (status == std::future_status::deferred)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: got future in deferred state");
+
+        if (status == std::future_status::timeout)
+            throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Wait for async insert timeout ({} ms) exceeded", timeout_ms);
+
+        insert_future.get(); /// Rethrows the exception stored in the future, if any.
        return Chunk();
    }

 private:
-    String query_id;
+    std::future<void> insert_future;
    size_t timeout_ms;
-    AsynchronousInsertQueue & queue;
 };

 }
--- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp
+++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp
@ -170,7 +170,7 @@ void AggregatingInOrderTransform::consume(Chunk chunk)
            }
        }

-        current_memory_usage = getCurrentMemoryUsage() - initial_memory_usage;
+        current_memory_usage = std::max<Int64>(getCurrentMemoryUsage() - initial_memory_usage, 0);

        /// We finalize last key aggregation state if a new key found.
        if (key_end != rows)
--- a/src/Processors/Transforms/AggregatingTransform.h
+++ b/src/Processors/Transforms/AggregatingTransform.h
@ -14,6 +14,7 @@ class AggregatedChunkInfo : public ChunkInfo
 public:
    bool is_overflows = false;
    Int32 bucket_num = -1;
+    UInt64 chunk_num = 0; // chunk number in order of generation, used during memory bound merging to restore chunks order
 };

 using AggregatorList = std::list<Aggregator>;
--- a/src/Processors/Transforms/MemoryBoundMerging.h
+++ b/src/Processors/Transforms/MemoryBoundMerging.h
@ -0,0 +1,226 @@
+#pragma once
+
+#include <limits>
+#include <map>
+#include <tuple>
+#include <vector>
+
+#include <Core/SortDescription.h>
+#include <Interpreters/sortBlock.h>
+#include <Processors/IProcessor.h>
+#include <Processors/Transforms/AggregatingTransform.h>
+
+#include <Poco/Logger.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+
+/// Has several inputs and a single output.
+/// Reads merged buckets with aggregated data from the inputs and outputs them ordered by (bucket number, chunk number).
+/// Presumption: every input returns chunks with increasing (bucket number, chunk number),
+/// and there is at most one chunk with the given bucket and chunk number.
+/// A chunk marked as overflows is not ordered with the others: it is kept aside and pushed after all inputs are finished.
+class SortingAggregatedForMemoryBoundMergingTransform : public IProcessor
+{
+public:
+    explicit SortingAggregatedForMemoryBoundMergingTransform(const Block & header_, size_t num_inputs_)
+        : IProcessor(InputPorts(num_inputs_, header_), {header_})
+        , header(header_)
+        , num_inputs(num_inputs_)
+        , last_chunk_id(num_inputs, {std::numeric_limits<Int32>::min(), 0})
+        , is_input_finished(num_inputs, false)
+    {
+    }
+
+    String getName() const override { return "SortingAggregatedForMemoryBoundMergingTransform"; }
+
+    Status prepare() override
+    {
+        auto & output = outputs.front();
+
+        if (output.isFinished())
+        {
+            for (auto & input : inputs)
+                input.close();
+
+            return Status::Finished;
+        }
+
+        if (!output.canPush())
+        {
+            for (auto & input : inputs)
+                input.setNotNeeded();
+
+            return Status::PortFull;
+        }
+
+        /// Push if have chunk that is the next in order
+        bool pushed_to_output = tryPushChunk();
+
+        bool need_data = false;
+        bool all_finished = true;
+
+        /// Try read new chunk
+        auto in = inputs.begin();
+        for (size_t input_num = 0; input_num < num_inputs; ++input_num, ++in)
+        {
+            if (in->isFinished())
+            {
+                is_input_finished[input_num] = true;
+                continue;
+            }
+
+            /// We want to keep not more than `num_inputs` chunks in memory (and there will be only a single chunk with the given (bucket_id, chunk_num)).
+            const bool bucket_from_this_input_still_in_memory = chunks.contains(last_chunk_id[input_num]);
+            if (bucket_from_this_input_still_in_memory)
+            {
+                all_finished = false;
+                continue;
+            }
+
+            in->setNeeded();
+
+            if (!in->hasData())
+            {
+                need_data = true;
+                all_finished = false;
+                continue;
+            }
+
+            auto chunk = in->pull();
+            addChunk(std::move(chunk), input_num);
+
+            if (in->isFinished())
+            {
+                is_input_finished[input_num] = true;
+            }
+            else
+            {
+                /// If chunk was pulled, then we need data from this port.
+                need_data = true;
+                all_finished = false;
+            }
+        }
+
+        /// A chunk has already been pushed to the output in this call.
+        if (pushed_to_output)
+            return Status::PortFull;
+
+        /// Newly read chunks may have made the minimal one ready to be pushed.
+        if (tryPushChunk())
+            return Status::PortFull;
+
+        if (need_data)
+            return Status::NeedData;
+
+        if (!all_finished)
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR, "SortingAggregatedForMemoryBoundMergingTransform has read bucket, but couldn't push it.");
+
+        /// All inputs are finished and all ordered chunks are pushed: output the overflows chunk, if any.
+        if (overflow_chunk)
+        {
+            output.push(std::move(overflow_chunk));
+            return Status::PortFull;
+        }
+
+        output.finish();
+        return Status::Finished;
+    }
+
+private:
+    /// Pushes the chunk with the minimal id to the output, unless some unfinished input has not yet reached that id
+    /// (it may still return a chunk with a smaller id). Returns true if a chunk was pushed.
+    bool tryPushChunk()
+    {
+        auto & output = outputs.front();
+
+        if (chunks.empty())
+            return false;
+
+        /// Chunk with min id
+        auto it = chunks.begin();
+        auto current_chunk_id = it->first;
+
+        /// Check if it is actually next in order
+        for (size_t input = 0; input < num_inputs; ++input)
+            if (!is_input_finished[input] && last_chunk_id[input] < current_chunk_id)
+                return false;
+
+        output.push(std::move(it->second));
+        chunks.erase(it);
+        return true;
+    }
+
+    /// Remembers a chunk read from input `from_input`. Chunks without rows are ignored,
+    /// the overflows chunk is stored separately, every other chunk is stored under its (bucket number, chunk number).
+    void addChunk(Chunk chunk, size_t from_input)
+    {
+        if (!chunk.hasRows())
+            return;
+
+        const auto & info = chunk.getChunkInfo();
+        if (!info)
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR, "Chunk info was not set for chunk in SortingAggregatedForMemoryBoundMergingTransform.");
+
+        const auto * agg_info = typeid_cast<const AggregatedChunkInfo *>(info.get());
+        if (!agg_info)
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR, "Chunk should have AggregatedChunkInfo in SortingAggregatedForMemoryBoundMergingTransform.");
+
+        Int32 bucket_id = agg_info->bucket_num;
+        bool is_overflows = agg_info->is_overflows;
+        UInt64 chunk_num = agg_info->chunk_num;
+
+        if (is_overflows)
+            overflow_chunk = std::move(chunk);
+        else
+        {
+            const auto chunk_id = ChunkId{bucket_id, chunk_num};
+            if (chunks.contains(chunk_id))
+            {
+                throw Exception(
+                    ErrorCodes::LOGICAL_ERROR,
+                    "SortingAggregatedForMemoryBoundMergingTransform already got bucket with number {} and chunk number {}",
+                    bucket_id,
+                    chunk_num);
+            }
+
+            chunks[chunk_id] = std::move(chunk);
+            last_chunk_id[from_input] = chunk_id;
+        }
+    }
+
+    /// Identifies a chunk; ordered by bucket number first, then by chunk number.
+    struct ChunkId
+    {
+        Int32 bucket_id;
+        UInt64 chunk_num;
+
+        bool operator<(const ChunkId & other) const
+        {
+            return std::tie(bucket_id, chunk_num) < std::tie(other.bucket_id, other.chunk_num);
+        }
+    };
+
+    Block header;
+    size_t num_inputs;
+
+    /// Id of the last chunk with rows (excluding the overflows chunk) read from each input.
+    std::vector<ChunkId> last_chunk_id;
+    std::vector<bool> is_input_finished;
+    /// Chunks that were read but not pushed yet, ordered by id.
+    std::map<ChunkId, Chunk> chunks;
+    /// Chunk with overflows, pushed after everything else.
+    Chunk overflow_chunk;
+};
+
+}
--- a/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp
+++ b/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.cpp
@ -1,5 +1,6 @@
 #include <limits>
 #include <Interpreters/Aggregator.h>
+#include <Interpreters/sortBlock.h>
 #include <Processors/ISimpleTransform.h>
 #include <Processors/ResizeProcessor.h>
 #include <Processors/Transforms/AggregatingInOrderTransform.h>
@ -305,8 +306,9 @@ void GroupingAggregatedTransform::work()
 }


-MergingAggregatedBucketTransform::MergingAggregatedBucketTransform(AggregatingTransformParamsPtr params_)
-    : ISimpleTransform({}, params_->getHeader(), false), params(std::move(params_))
+MergingAggregatedBucketTransform::MergingAggregatedBucketTransform(
+    AggregatingTransformParamsPtr params_, const SortDescription & required_sort_description_)
+    : ISimpleTransform({}, params_->getHeader(), false), params(std::move(params_)), required_sort_description(required_sort_description_)
 {
    setInputNotNeededAfterRead(true);
 }
@ -356,9 +358,14 @@ void MergingAggregatedBucketTransform::transform(Chunk & chunk)
    auto res_info = std::make_shared<AggregatedChunkInfo>();
    res_info->is_overflows = chunks_to_merge->is_overflows;
    res_info->bucket_num = chunks_to_merge->bucket_num;
+    res_info->chunk_num = chunks_to_merge->chunk_num;
    chunk.setChunkInfo(std::move(res_info));

    auto block = params->aggregator.mergeBlocks(blocks_list, params->final);
+
+    if (!required_sort_description.empty())
+        sortBlock(block, required_sort_description);
+
    size_t num_rows = block.rows();
    chunk.setColumns(block.getColumns(), num_rows);
 }
--- a/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.h
+++ b/src/Processors/Transforms/MergingAggregatedMemoryEfficientTransform.h
@ -1,9 +1,10 @@
 #pragma once
-#include <Processors/IProcessor.h>
+#include <Core/SortDescription.h>
 #include <Interpreters/Aggregator.h>
+#include <Processors/IProcessor.h>
 #include <Processors/ISimpleTransform.h>
-#include <Processors/Transforms/AggregatingTransform.h>
 #include <Processors/ResizeProcessor.h>
+#include <Processors/Transforms/AggregatingTransform.h>


 namespace DB
@ -105,7 +106,8 @@ private:
 class MergingAggregatedBucketTransform : public ISimpleTransform
 {
 public:
-    explicit MergingAggregatedBucketTransform(AggregatingTransformParamsPtr params);
+    explicit MergingAggregatedBucketTransform(
+        AggregatingTransformParamsPtr params, const SortDescription & required_sort_description_ = {});
    String getName() const override { return "MergingAggregatedBucketTransform"; }

 protected:
@ -113,6 +115,7 @@ protected:

 private:
    AggregatingTransformParamsPtr params;
+    const SortDescription required_sort_description;
 };

 /// Has several inputs and single output.
@ -142,6 +145,7 @@ struct ChunksToMerge : public ChunkInfo
    std::unique_ptr<Chunks> chunks;
    Int32 bucket_num = -1;
    bool is_overflows = false;
+    UInt64 chunk_num = 0; // chunk number in order of generation, used during memory bound merging to restore chunks order
 };

 class Pipe;
--- a/src/Storages/MergeTree/ActiveDataPartSet.cpp
+++ b/src/Storages/MergeTree/ActiveDataPartSet.cpp
@ -65,7 +65,7 @@ bool ActiveDataPartSet::add(const String & name, Strings * out_replaced_parts)
    }

    if (it != part_info_to_name.end() && !part_info.isDisjoint(it->first))
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} intersects next part {}. It is a bug or a result of manual intervention in the ZooKeeper data.", name, it->first.getPartName());
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} intersects part {}. It is a bug or a result of manual intervention in the ZooKeeper data.", name, it->first.getPartName());

    part_info_to_name.emplace(part_info, name);
    return true;
--- a/src/Storages/MergeTree/DataPartStorageOnDisk.cpp
+++ b/src/Storages/MergeTree/DataPartStorageOnDisk.cpp
@ -372,7 +372,12 @@ std::optional<String> DataPartStorageOnDisk::getRelativePathForPrefix(Poco::Logg

    for (int try_no = 0; try_no < 10; ++try_no)
    {
-        res = (prefix.empty() ? "" : prefix + "_") + part_dir + (try_no ? "_try" + DB::toString(try_no) : "");
+        if (prefix.empty())
+            res = part_dir + (try_no ? "_try" + DB::toString(try_no) : "");
+        else if (prefix.ends_with("_"))
+            res = prefix + part_dir + (try_no ? "_try" + DB::toString(try_no) : "");
+        else
+            res = prefix + "_" + part_dir + (try_no ? "_try" + DB::toString(try_no) : "");

        if (!volume->getDisk()->exists(full_relative_path / res))
            return res;
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@ -1350,7 +1350,7 @@ void IMergeTreeDataPart::storeVersionMetadata(bool force) const
    if (!wasInvolvedInTransaction() && !force)
        return;

-    LOG_TEST(storage.log, "Writing version for {} (creation: {}, removal {})", name, version.creation_tid, version.removal_tid);
+    LOG_TEST(storage.log, "Writing version for {} (creation: {}, removal {}, creation csn {})", name, version.creation_tid, version.removal_tid, version.creation_csn);
    assert(storage.supportsTransactions());

    if (!isStoredOnDisk())
@ -1382,7 +1382,7 @@ void IMergeTreeDataPart::appendCSNToVersionMetadata(VersionMetadata::WhichCSN wh
 void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const
 {
    chassert(!version.creation_tid.isEmpty());
-    chassert(version.removal_csn == 0);
+    chassert(version.removal_csn == 0 || (version.removal_csn == Tx::PrehistoricCSN && version.removal_tid.isPrehistoric()));
    chassert(!version.removal_tid.isEmpty());
    chassert(isStoredOnDisk());

@ -1390,6 +1390,12 @@ void IMergeTreeDataPart::appendRemovalTIDToVersionMetadata(bool clear) const
    {
        /// Metadata file probably does not exist, because it was not written on part creation, because it was created without a transaction.
        /// Let's create it (if needed). Concurrent writes are not possible, because creation_csn is prehistoric and we own removal_tid_lock.
+
+        /// It can happen that VersionMetadata::isVisible sets creation_csn to PrehistoricCSN when creation_tid is Prehistoric
+        /// In order to avoid a race always write creation_csn as PrehistoricCSN for Prehistoric creation_tid
+        assert(version.creation_csn == Tx::UnknownCSN || version.creation_csn == Tx::PrehistoricCSN);
+        version.creation_csn.store(Tx::PrehistoricCSN);
+
        storeVersionMetadata();
        return;
    }
@ -1531,8 +1537,8 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const
    {
        WriteBufferFromOwnString expected;
        version.write(expected);
-        tryLogCurrentException(storage.log, fmt::format("File {} contains:\n{}\nexpected:\n{}\nlock: {}",
-                                                        version_file_name, content, expected.str(), version.removal_tid_lock));
+        tryLogCurrentException(storage.log, fmt::format("File {} contains:\n{}\nexpected:\n{}\nlock: {}\nname: {}",
+                                                        version_file_name, content, expected.str(), version.removal_tid_lock, name));
        return false;
    }
 }
@ -2023,8 +2029,7 @@ std::optional<std::string> getIndexExtensionFromFilesystem(const IDataPartStorag
        for (auto it = data_part_storage.iterate(); it->isValid(); it->next())
        {
            const auto & extension = fs::path(it->name()).extension();
-            if (extension == getIndexExtension(false)
-                    || extension == getIndexExtension(true))
+            if (extension == getIndexExtension(true))
                return extension;
        }
    }
@ -2036,4 +2041,24 @@ bool isCompressedFromIndexExtension(const String & index_extension)
    return index_extension == getIndexExtension(true);
 }

+/// Returns the names of the given parts together with their states (as produced by getNameWithState()).
+Strings getPartsNamesWithStates(const MergeTreeDataPartsVector & parts)
+{
+    Strings part_names;
+    part_names.reserve(parts.size());
+    for (const auto & p : parts)
+        part_names.push_back(p->getNameWithState());
+    return part_names;
+}
+
+/// Returns the names of the given parts.
+Strings getPartsNames(const MergeTreeDataPartsVector & parts)
+{
+    Strings part_names;
+    part_names.reserve(parts.size());
+    for (const auto & p : parts)
+        part_names.push_back(p->name);
+    return part_names;
+}
+
 }
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@ -595,4 +595,9 @@ inline String getIndexExtension(bool is_compressed_primary_key) { return is_comp
 std::optional<String> getIndexExtensionFromFilesystem(const IDataPartStorage & data_part_storage);
 bool isCompressedFromIndexExtension(const String & index_extension);

+using MergeTreeDataPartsVector = std::vector<MergeTreeDataPartPtr>;
+
+Strings getPartsNamesWithStates(const MergeTreeDataPartsVector & parts);
+Strings getPartsNames(const MergeTreeDataPartsVector & parts);
+
 }
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@ -20,6 +20,7 @@
 #include <DataTypes/ObjectUtils.h>
 #include <Columns/ColumnObject.h>
 #include <DataTypes/hasNullable.h>
+#include <Disks/createVolume.h>
 #include <Disks/ObjectStorages/DiskObjectStorage.h>
 #include <Functions/FunctionFactory.h>
 #include <Functions/IFunction.h>
@ -167,6 +168,7 @@ namespace ErrorCodes
    extern const int INCORRECT_QUERY;
    extern const int CANNOT_RESTORE_TABLE;
    extern const int ZERO_COPY_REPLICATION_ERROR;
+    extern const int SERIALIZATION_ERROR;
 }


@ -1683,7 +1685,7 @@ size_t MergeTreeData::clearOldTemporaryDirectories(size_t custom_directories_lif
    return cleared_count;
 }

-scope_guard MergeTreeData::getTemporaryPartDirectoryHolder(const String & part_dir_name)
+scope_guard MergeTreeData::getTemporaryPartDirectoryHolder(const String & part_dir_name) const
 {
    temporary_parts.add(part_dir_name);
    return [this, part_dir_name]() { temporary_parts.remove(part_dir_name); };
@ -1713,6 +1715,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
    /// in the "zero-copy replication" (because it is a non-production feature).
    /// Please don't use "zero-copy replication" (a non-production feature) in production.
    /// It is not ready for production usage. Don't use it.
+
    bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;

    if (need_remove_parts_in_order)
@ -1729,7 +1732,6 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
        need_remove_parts_in_order = has_zero_copy_disk;
    }

-    time_t now = time(nullptr);
    std::vector<DataPartIteratorByStateAndInfo> parts_to_delete;
    std::vector<MergeTreePartInfo> skipped_parts;

@ -1745,6 +1747,8 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
        return false;
    };

+    auto time_now = time(nullptr);
+
    {
        auto parts_lock = lockParts();

@ -1760,8 +1764,6 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
                continue;
            }

-            auto part_remove_time = part->remove_time.load(std::memory_order_relaxed);
-
            /// Grab only parts that are not used by anyone (SELECTs for example).
            if (!part.unique())
            {
@ -1769,7 +1771,8 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
                continue;
            }

-            if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds() && !has_skipped_mutation_parent(part))
+            auto part_remove_time = part->remove_time.load(std::memory_order_relaxed);
+            if ((part_remove_time < time_now && time_now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds() && !has_skipped_mutation_parent(part))
                || force
                || isInMemoryPart(part)     /// Remove in-memory parts immediately to not store excessive data in RAM
                || (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately))
@ -1779,6 +1782,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
            else
            {
                skipped_parts.push_back(part->info);
+                continue;
            }
        }

@ -1791,7 +1795,8 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
    }

    if (!res.empty())
-        LOG_TRACE(log, "Found {} old parts to remove.", res.size());
+        LOG_TRACE(log, "Found {} old parts to remove. Parts {}",
+                  res.size(), fmt::join(getPartsNames(res), ", "));

    return res;
 }
@ -1826,6 +1831,8 @@ void MergeTreeData::removePartsFinally(const MergeTreeData::DataPartsVector & pa

            (*it)->assertState({DataPartState::Deleting});

+            LOG_DEBUG(log, "Finally removing part from memory {}", part->name);
+
            data_parts_indexes.erase(it);
        }
    }
@ -1921,6 +1928,8 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts, bool
    {
        get_failed_parts();

+        LOG_DEBUG(log, "Failed to remove all parts, all count {}, removed {}", parts.size(), part_names_succeed.size());
+
        if (throw_on_error)
            throw;
    }
@ -2111,11 +2120,24 @@ size_t MergeTreeData::clearEmptyParts()
        if (part->rows_count != 0)
            continue;

-        /// Do not try to drop uncommitted parts.
+        /// Do not try to drop uncommitted parts. If the newest tx doesn't see it, then it probably hasn't been committed yet
        if (!part->version.getCreationTID().isPrehistoric() && !part->version.isVisible(TransactionLog::instance().getLatestSnapshot()))
            continue;

-        LOG_TRACE(log, "Will drop empty part {}", part->name);
+        /// Don't drop empty parts that cover other parts
+        /// Otherwise covered parts resurrect
+        {
+            auto lock = lockParts();
+            if (part->getState() != DataPartState::Active)
+                continue;
+
+            DataPartsVector covered_parts = getCoveredOutdatedParts(part, lock);
+            if (!covered_parts.empty())
+                continue;
+        }
+
+        LOG_INFO(log, "Will drop empty part {}", part->name);
+
        dropPartNoWaitNoThrow(part->name);
        ++cleared_count;
    }
@ -2893,16 +2915,16 @@ MergeTreeData::PartsTemporaryRename::~PartsTemporaryRename()
    }
 }

-
-MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
-    const MergeTreePartInfo & new_part_info,
-    const String & new_part_name,
-    DataPartPtr & out_covering_part,
+MergeTreeData::PartHierarchy MergeTreeData::getPartHierarchy(
+    const MergeTreePartInfo & part_info,
+    DataPartState state,
    DataPartsLock & /* data_parts_lock */) const
 {
+    PartHierarchy result;
+
    /// Parts contained in the part are consecutive in data_parts, intersecting the insertion place for the part itself.
-    auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Active, new_part_info});
-    auto committed_parts_range = getDataPartsStateRange(DataPartState::Active);
+    auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{state, part_info});
+    auto committed_parts_range = getDataPartsStateRange(state);

    /// Go to the left.
    DataPartIteratorByStateAndInfo begin = it_middle;
@ -2910,17 +2932,16 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
    {
        auto prev = std::prev(begin);

-        if (!new_part_info.contains((*prev)->info))
+        if (!part_info.contains((*prev)->info))
        {
-            if ((*prev)->info.contains(new_part_info))
+            if ((*prev)->info.contains(part_info))
            {
-                out_covering_part = *prev;
-                return {};
+                result.covering_parts.push_back(*prev);
+            }
+            else if (!part_info.isDisjoint((*prev)->info))
+            {
+                result.intersected_parts.push_back(*prev);
            }
-
-            if (!new_part_info.isDisjoint((*prev)->info))
-                throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} intersects previous part {}. It is a bug.",
-                                new_part_name, (*prev)->getNameWithState());

            break;
        }
@ -2928,24 +2949,29 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
        begin = prev;
    }

+    std::reverse(result.covering_parts.begin(), result.covering_parts.end());
+
    /// Go to the right.
    DataPartIteratorByStateAndInfo end = it_middle;
    while (end != committed_parts_range.end())
    {
-        if ((*end)->info == new_part_info)
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected duplicate part {}. It is a bug.", (*end)->getNameWithState());
-
-        if (!new_part_info.contains((*end)->info))
+        if ((*end)->info == part_info)
        {
-            if ((*end)->info.contains(new_part_info))
-            {
-                out_covering_part = *end;
-                return {};
-            }
+            result.duplicate_part = *end;
+            result.covering_parts.clear();
+            return result;
+        }

-            if (!new_part_info.isDisjoint((*end)->info))
-                throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} intersects next part {}. It is a bug.",
-                                new_part_name, (*end)->getNameWithState());
+        if (!part_info.contains((*end)->info))
+        {
+            if ((*end)->info.contains(part_info))
+            {
+                result.covering_parts.push_back(*end);
+            }
+            else if (!part_info.isDisjoint((*end)->info))
+            {
+                result.intersected_parts.push_back(*end);
+            }

            break;
        }
@ -2953,31 +2979,47 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
        ++end;
    }

-    return DataPartsVector{begin, end};
+    result.covered_parts.insert(result.covered_parts.end(), begin, end);
+
+    return result;
 }

-
-bool MergeTreeData::renameTempPartAndAdd(
-    MutableDataPartPtr & part,
-    Transaction & out_transaction,
-    DataPartsLock & lock)
+MergeTreeData::DataPartsVector MergeTreeData::getCoveredOutdatedParts(
+    const DataPartPtr & part,
+    DataPartsLock & data_parts_lock) const
 {
-    DataPartsVector covered_parts;
+    part->assertState({DataPartState::Active, DataPartState::PreActive});
+    PartHierarchy hierarchy = getPartHierarchy(part->info, DataPartState::Outdated, data_parts_lock);

-    if (!renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts))
-        return false;
+    if (hierarchy.duplicate_part)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected duplicate part {}. It is a bug.", hierarchy.duplicate_part->getNameWithState());

-    if (!covered_parts.empty())
-        throw Exception("Added part " + part->name + " covers " + toString(covered_parts.size())
-            + " existing part(s) (including " + covered_parts[0]->name + ")", ErrorCodes::LOGICAL_ERROR);
-
-    return true;
+    return std::move(hierarchy.covered_parts); /// A member of a local is not moved implicitly on return.
 }

-void MergeTreeData::checkPartCanBeAddedToTable(MutableDataPartPtr & part, DataPartsLock & lock) const
+MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace(
+    const MergeTreePartInfo & new_part_info,
+    const String & new_part_name,
+    DataPartPtr & out_covering_part,
+    DataPartsLock & data_parts_lock) const
 {
-    part->assertState({DataPartState::Temporary});
+    PartHierarchy hierarchy = getPartHierarchy(new_part_info, DataPartState::Active, data_parts_lock);

+    if (!hierarchy.intersected_parts.empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} intersects part {}. It is a bug.",
+                        new_part_name, hierarchy.intersected_parts.back()->getNameWithState());
+
+    if (hierarchy.duplicate_part)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected duplicate part {}. It is a bug.", hierarchy.duplicate_part->getNameWithState());
+
+    if (!hierarchy.covering_parts.empty())
+        out_covering_part = std::move(hierarchy.covering_parts.back());
+
+    return std::move(hierarchy.covered_parts);
+}
+
+void MergeTreeData::checkPartPartition(MutableDataPartPtr & part, DataPartsLock & lock) const
+{
    if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock))
    {
        if (part->partition.value != existing_part_in_partition->partition.value)
@ -2986,14 +3028,22 @@ void MergeTreeData::checkPartCanBeAddedToTable(MutableDataPartPtr & part, DataPa
                + existing_part_in_partition->name + ", newly added part: " + part->name,
                ErrorCodes::CORRUPTED_DATA);
    }
+}

-    if (auto it_duplicate = data_parts_by_info.find(part->info); it_duplicate != data_parts_by_info.end())
+void MergeTreeData::checkPartDuplicate(MutableDataPartPtr & part, Transaction & transaction, DataPartsLock & /*lock*/) const
+{
+    auto it_duplicate = data_parts_by_info.find(part->info);
+
+    if (it_duplicate != data_parts_by_info.end())
    {
        String message = "Part " + (*it_duplicate)->getNameWithState() + " already exists";

        if ((*it_duplicate)->checkState({DataPartState::Outdated, DataPartState::Deleting}))
            throw Exception(message + ", but it will be deleted soon", ErrorCodes::PART_IS_TEMPORARILY_LOCKED);

+        if (transaction.txn)
+            throw Exception(message, ErrorCodes::SERIALIZATION_ERROR);
+
        throw Exception(message, ErrorCodes::DUPLICATE_DATA_PART);
    }
 }
@ -3022,49 +3072,59 @@ bool MergeTreeData::renameTempPartAndReplaceImpl(
    DataPartsLock & lock,
    DataPartsVector * out_covered_parts)
 {
-    LOG_TRACE(log, "Renaming temporary part {} to {}.", part->getDataPartStorage().getPartDirectory(), part->name);
+    LOG_TRACE(log, "Renaming temporary part {} to {} with tid {}.", part->getDataPartStorage().getPartDirectory(), part->name, out_transaction.getTID());

    if (&out_transaction.data != this)
-        throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.",
-            ErrorCodes::LOGICAL_ERROR);
+        throw Exception("MergeTreeData::Transaction for one table cannot be used with another. It is a bug.", ErrorCodes::LOGICAL_ERROR);
+
+    part->assertState({DataPartState::Temporary});
+    checkPartPartition(part, lock);
+    checkPartDuplicate(part, out_transaction, lock);
+
+    PartHierarchy hierarchy = getPartHierarchy(part->info, DataPartState::Active, lock);
+
+    if (!hierarchy.intersected_parts.empty())
+    {
+        String message = fmt::format("Part {} intersects part {}", part->name, hierarchy.intersected_parts.back()->getNameWithState());
+
+        // A drop part/partition operation inside a transaction sees a stale snapshot taken at the time the transaction started.
+        // So such an operation may attempt to delete an already outdated part. In this case, the outdated part is most likely covered by another part, and an intersection may occur.
+        // The part might have become outdated due to merge/mutation/update/optimization operations.
+        if (part->isEmpty() || (hierarchy.intersected_parts.size() == 1 && hierarchy.intersected_parts.back()->isEmpty()))
+        {
+            message += fmt::format(" One of them is empty part. That is a race between drop operation under transaction and a merge/mutation.");
+            throw Exception(message, ErrorCodes::SERIALIZATION_ERROR);
+        }
+
+        if (hierarchy.intersected_parts.size() > 1)
+            message += fmt::format(" There are {} intersected parts.", hierarchy.intersected_parts.size());
+
+        throw Exception(ErrorCodes::LOGICAL_ERROR, message + " It is a bug.");
+    }

    if (part->hasLightweightDelete())
        has_lightweight_delete_parts.store(true);

-    checkPartCanBeAddedToTable(part, lock);
-
-    DataPartPtr covering_part;
-    DataPartsVector covered_parts = getActivePartsToReplace(part->info, part->name, covering_part, lock);
-
-    if (covering_part)
-    {
-        LOG_WARNING(log, "Tried to add obsolete part {} covered by {}", part->name, covering_part->getNameWithState());
-        return false;
-    }
-
    /// All checks are passed. Now we can rename the part on disk.
    /// So, we maintain invariant: if a non-temporary part in filesystem then it is in data_parts
    preparePartForCommit(part, out_transaction);

    if (out_covered_parts)
    {
-        out_covered_parts->reserve(covered_parts.size());
-
-        for (DataPartPtr & covered_part : covered_parts)
-            out_covered_parts->emplace_back(std::move(covered_part));
+        out_covered_parts->reserve(out_covered_parts->size() + hierarchy.covered_parts.size());
+        std::move(hierarchy.covered_parts.begin(), hierarchy.covered_parts.end(), std::back_inserter(*out_covered_parts));
    }

    return true;
 }

-MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplaceUnlocked(
+bool MergeTreeData::renameTempPartAndReplaceUnlocked(
    MutableDataPartPtr & part,
    Transaction & out_transaction,
-    DataPartsLock & lock)
+    DataPartsLock & lock,
+    DataPartsVector * out_covered_parts)
 {
-    DataPartsVector covered_parts;
-    renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts);
-    return covered_parts;
+    return renameTempPartAndReplaceImpl(part, out_transaction, lock, out_covered_parts);
 }

 MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace(
@ -3072,7 +3132,26 @@ MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace(
    Transaction & out_transaction)
 {
    auto part_lock = lockParts();
-    return renameTempPartAndReplaceUnlocked(part, out_transaction, part_lock);
+    DataPartsVector covered_parts;
+    renameTempPartAndReplaceImpl(part, out_transaction, part_lock, &covered_parts);
+    return covered_parts;
+}
+
+/// Renames a temporary part and adds it to the table under an already acquired parts lock.
+/// Unlike the "replace" variants, the added part must not cover any existing part.
+/// Returns false when renameTempPartAndReplaceImpl() reports that the part was not added.
+/// Throws LOGICAL_ERROR if the added part covers at least one existing part.
+bool MergeTreeData::renameTempPartAndAdd(
+    MutableDataPartPtr & part,
+    Transaction & out_transaction,
+    DataPartsLock & lock)
+{
+    DataPartsVector replaced_parts;
+    const bool is_added = renameTempPartAndReplaceImpl(part, out_transaction, lock, &replaced_parts);
+
+    /// The covered parts are only inspected when the part was actually added.
+    if (is_added && !replaced_parts.empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Added part {} covers {} existing part(s) (including {})",
+            part->name, toString(replaced_parts.size()), replaced_parts.front()->name);
+
+    return is_added;
 }

 void MergeTreeData::removePartsFromWorkingSet(MergeTreeTransaction * txn, const MergeTreeData::DataPartsVector & remove, bool clear_without_timeout, DataPartsLock & acquired_lock)
@ -4549,17 +4628,7 @@ String MergeTreeData::getPartitionIDFromQuery(const ASTPtr & ast, ContextPtr loc

 DataPartsVector MergeTreeData::getVisibleDataPartsVector(ContextPtr local_context) const
 {
-    DataPartsVector res;
-    if (const auto * txn = local_context->getCurrentTransaction().get())
-    {
-        res = getDataPartsVectorForInternalUsage({DataPartState::Active, DataPartState::Outdated});
-        filterVisibleDataParts(res, txn->getSnapshot(), txn->tid);
-    }
-    else
-    {
-        res = getDataPartsVectorForInternalUsage();
-    }
-    return res;
+    return getVisibleDataPartsVector(local_context->getCurrentTransaction());
 }

 DataPartsVector MergeTreeData::getVisibleDataPartsVectorUnlocked(ContextPtr local_context, const DataPartsLock & lock) const
@ -4611,17 +4680,8 @@ void MergeTreeData::filterVisibleDataParts(DataPartsVector & maybe_visible_parts
    std::erase_if(maybe_visible_parts, need_remove_pred);
    [[maybe_unused]] size_t visible_size = maybe_visible_parts.size();

-
-    auto get_part_names = [&maybe_visible_parts]() -> Strings
-    {
-        Strings visible_part_names;
-        for (const auto & p : maybe_visible_parts)
-            visible_part_names.push_back(p->name);
-        return visible_part_names;
-    };
-
    LOG_TEST(log, "Got {} parts (of {}) visible in snapshot {} (TID {}): {}",
-             visible_size, total_size, snapshot_version, current_tid, fmt::join(get_part_names(), ", "));
+             visible_size, total_size, snapshot_version, current_tid, fmt::join(getPartsNamesWithStates(maybe_visible_parts), ", "));
 }


@ -5108,6 +5168,7 @@ CompressionCodecPtr MergeTreeData::getCompressionCodecForPart(size_t part_size_c
        static_cast<double>(part_size_compressed) / getTotalActiveSizeInBytes());
 }

+
 MergeTreeData::DataParts MergeTreeData::getDataParts(const DataPartStates & affordable_states) const
 {
    DataParts res;
@ -5170,11 +5231,16 @@ void MergeTreeData::Transaction::rollbackPartsToTemporaryState()
    clear();
 }

+/// Returns the TID of the attached transaction, or Tx::PrehistoricTID when no transaction is attached.
+TransactionID MergeTreeData::Transaction::getTID() const
+{
+    return txn ? txn->tid : Tx::PrehistoricTID;
+}
+
 void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part)
 {
    precommitted_parts.insert(part);
-    if (asInMemoryPart(part))
-        has_in_memory_parts = true;
 }

 void MergeTreeData::Transaction::rollback()
@ -5182,11 +5248,14 @@ void MergeTreeData::Transaction::rollback()
    if (!isEmpty())
    {
        WriteBufferFromOwnString buf;
-        buf << " Removing parts:";
+        buf << "Removing parts:";
        for (const auto & part : precommitted_parts)
            buf << " " << part->getDataPartStorage().getPartDirectory();
        buf << ".";
-        LOG_DEBUG(data.log, "Undoing transaction.{}", buf.str());
+        LOG_DEBUG(data.log, "Undoing transaction {}. {}", getTID(), buf.str());
+
+        for (const auto & part : precommitted_parts)
+            part->version.creation_csn.store(Tx::RolledBackCSN);

        auto lock = data.lockParts();

@ -5217,7 +5286,6 @@ void MergeTreeData::Transaction::rollback()
 void MergeTreeData::Transaction::clear()
 {
    precommitted_parts.clear();
-    has_in_memory_parts = false;
 }

 MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData::DataPartsLock * acquired_parts_lock)
@ -5234,26 +5302,41 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:
            if (part->getDataPartStorage().hasActiveTransaction())
                part->getDataPartStorage().commitTransaction();

-        bool commit_to_wal = has_in_memory_parts && settings->in_memory_parts_enable_wal;
-        if (txn || commit_to_wal)
-        {
-            MergeTreeData::WriteAheadLogPtr wal;
-            if (commit_to_wal)
-                wal = data.getWriteAheadLog();
-
+        if (txn)
            for (const auto & part : precommitted_parts)
            {
-                if (txn)
-                {
-                    DataPartPtr covering_part;
-                    DataPartsVector covered_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);
-                    MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
-                }
+                DataPartPtr covering_part;
+                DataPartsVector covered_active_parts = data.getActivePartsToReplace(part->info, part->name, covering_part, *owing_parts_lock);

-                if (auto part_in_memory = asInMemoryPart(part))
-                    wal->addPart(part_in_memory);
+                /// Outdated parts must be collected here as well:
+                /// those that are visible to the transaction should also be removed.
+                /// A conflict is most likely to happen while removing the visible outdated parts, which is the correct behavior.
+                DataPartsVector covered_outdated_parts = data.getCoveredOutdatedParts(part, *owing_parts_lock);
+
+                LOG_TEST(data.log, "Got {} oudated parts covered by {} (TID {} CSN {}): {}",
+                         covered_outdated_parts.size(), part->getNameWithState(), txn->tid, txn->getSnapshot(), fmt::join(getPartsNames(covered_outdated_parts), ", "));
+                data.filterVisibleDataParts(covered_outdated_parts, txn->getSnapshot(), txn->tid);
+
+                DataPartsVector covered_parts;
+                covered_parts.reserve(covered_active_parts.size() + covered_outdated_parts.size());
+                std::move(covered_active_parts.begin(), covered_active_parts.end(), std::back_inserter(covered_parts));
+                std::move(covered_outdated_parts.begin(), covered_outdated_parts.end(), std::back_inserter(covered_parts));
+
+                MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn);
            }
-        }
+
+        MergeTreeData::WriteAheadLogPtr wal;
+        auto get_inited_wal = [&] ()
+        {
+            if (!wal)
+                wal = data.getWriteAheadLog();
+            return wal;
+        };
+
+        if (settings->in_memory_parts_enable_wal)
+            for (const auto & part : precommitted_parts)
+                if (auto part_in_memory = asInMemoryPart(part))
+                    get_inited_wal()->addPart(part_in_memory);

        NOEXCEPT_SCOPE({
            auto current_time = time(nullptr);
@ -5298,6 +5381,10 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData:

                        data.modifyPartState(covered_part, DataPartState::Outdated);
                        data.removePartContributionToColumnAndSecondaryIndexSizes(covered_part);
+
+                        if (settings->in_memory_parts_enable_wal)
+                            if (isInMemoryPart(covered_part))
+                                get_inited_wal()->dropPart(covered_part->name);
                    }

                    reduce_parts += covered_parts.size();
@ -6280,24 +6367,38 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> MergeTreeData::cloneAn
    auto reservation = src_part->getDataPartStorage().reserve(src_part->getBytesOnDisk());
    auto src_part_storage = src_part->getDataPartStoragePtr();

+    scope_guard src_flushed_tmp_dir_lock;
+    MergeTreeData::MutableDataPartPtr src_flushed_tmp_part;
+
    /// If source part is in memory, flush it to disk and clone it already in on-disk format
+    /// Protect tmp dir from removing by cleanup thread with src_flushed_tmp_dir_lock
+    /// Construct src_flushed_tmp_part in order to delete part with its directory at destructor
    if (auto src_part_in_memory = asInMemoryPart(src_part))
    {
-        auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix);
-        src_part_storage = src_part_in_memory->flushToDisk(*flushed_part_path, metadata_snapshot);
+        auto flushed_part_path = *src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix);
+
+        auto tmp_src_part_file_name = fs::path(tmp_dst_part_name).filename();
+        src_flushed_tmp_dir_lock = src_part->storage.getTemporaryPartDirectoryHolder(tmp_src_part_file_name);
+
+        auto flushed_part_storage = src_part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot);
+        src_flushed_tmp_part = createPart(src_part->name, src_part->info, flushed_part_storage);
+        src_flushed_tmp_part->is_temp = true;
+
+        src_part_storage = flushed_part_storage;
    }

    String with_copy;
    if (copy_instead_of_hardlink)
        with_copy = " (copying data)";

-    LOG_DEBUG(log, "Cloning part {} to {}{}",
-              src_part_storage->getFullPath(),
-              std::string(fs::path(src_part_storage->getFullRootPath()) / tmp_dst_part_name),
-              with_copy);
-
    auto dst_part_storage = src_part_storage->freeze(relative_data_path, tmp_dst_part_name, /* make_source_readonly */ false, {}, copy_instead_of_hardlink, files_to_copy_instead_of_hardlinks);

+    LOG_DEBUG(log, "Clone {} part {} to {}{}",
+              src_flushed_tmp_part ? "flushed" : "",
+              src_part_storage->getFullPath(),
+              std::string(fs::path(dst_part_storage->getFullRootPath()) / tmp_dst_part_name),
+              with_copy);
+
    auto dst_data_part = createPart(dst_part_name, dst_part_info, dst_part_storage);

    if (!copy_instead_of_hardlink && hardlinked_files)
@ -6463,12 +6564,21 @@ PartitionCommandsResultInfo MergeTreeData::freezePartitionsByMatcher(
        LOG_DEBUG(log, "Freezing part {} snapshot will be placed at {}", part->name, backup_path);

        auto data_part_storage = part->getDataPartStoragePtr();
-        String src_part_path = data_part_storage->getRelativePath();
        String backup_part_path = fs::path(backup_path) / relative_data_path;
+
+        scope_guard src_flushed_tmp_dir_lock;
+        MergeTreeData::MutableDataPartPtr src_flushed_tmp_part;
+
        if (auto part_in_memory = asInMemoryPart(part))
        {
-            auto flushed_part_path = part_in_memory->getRelativePathForPrefix("tmp_freeze");
-            data_part_storage = part_in_memory->flushToDisk(*flushed_part_path, metadata_snapshot);
+            auto flushed_part_path = *part_in_memory->getRelativePathForPrefix("tmp_freeze");
+            src_flushed_tmp_dir_lock = part->storage.getTemporaryPartDirectoryHolder("tmp_freeze" + part->name);
+
+            auto flushed_part_storage = part_in_memory->flushToDisk(flushed_part_path, metadata_snapshot);
+            src_flushed_tmp_part = createPart(part->name, part->info, flushed_part_storage);
+            src_flushed_tmp_part->is_temp = true;
+
+            data_part_storage = flushed_part_storage;
        }

        auto callback = [this, &part, &backup_part_path](const DiskPtr & disk)
@ -6553,6 +6663,7 @@ bool MergeTreeData::canReplacePartition(const DataPartPtr & src_part) const
        if (canUseAdaptiveGranularity() && !src_part->index_granularity_info.mark_type.adaptive)
            return false;
    }
+
    return true;
 }

@ -7221,6 +7332,89 @@ void MergeTreeData::incrementMergedPartsProfileEvent(MergeTreeDataPartType type)
    }
 }

+/// Creates a new temporary part written from the table's sample block (i.e. a part without data).
+///
+/// The part is stored in the directory EMPTY_PART_TMP_PREFIX + new_part_name and is returned with is_temp = true.
+///
+/// @param new_part_info  info of the part to create
+/// @param partition      partition value assigned to the new part
+/// @param new_part_name  name of the new part (also used to build the temporary directory name)
+/// @param txn            transaction passed to the output stream that writes the part
+/// @return the newly written part
+/// @throws Exception(LOGICAL_ERROR) if the temporary directory of the part already exists
+MergeTreeData::MutableDataPartPtr MergeTreeData::createEmptyPart(
+        MergeTreePartInfo & new_part_info, const MergeTreePartition & partition, const String & new_part_name,
+        const MergeTreeTransactionPtr & txn)
+{
+    auto metadata_snapshot = getInMemoryMetadataPtr();
+    /// A single settings snapshot is used for the whole part creation.
+    auto settings = getSettings();
+
+    auto block = metadata_snapshot->getSampleBlock();
+    NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames());
+    setAllObjectsToDummyTupleType(columns);
+
+    auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
+    minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
+
+    DB::IMergeTreeDataPart::TTLInfos move_ttl_infos;
+    VolumePtr volume = getStoragePolicy()->getVolume(0);
+    ReservationPtr reservation = reserveSpacePreferringTTLRules(metadata_snapshot, 0, move_ttl_infos, time(nullptr), 0, true);
+    VolumePtr data_part_volume = createVolumeFromReservation(reservation, volume);
+
+    auto new_data_part_storage = std::make_shared<DataPartStorageOnDisk>(
+        data_part_volume,
+        getRelativeDataPath(),
+        EMPTY_PART_TMP_PREFIX + new_part_name);
+
+    auto new_data_part = createPart(
+        new_part_name,
+        choosePartTypeOnDisk(0, block.rows()),
+        new_part_info,
+        new_data_part_storage
+        );
+
+    new_data_part->name = new_part_name;
+
+    if (settings->assign_part_uuids)
+        new_data_part->uuid = UUIDHelpers::generateV4();
+
+    new_data_part->setColumns(columns, {});
+    new_data_part->rows_count = block.rows();
+
+    new_data_part->partition = partition;
+
+    new_data_part->minmax_idx = std::move(minmax_idx);
+    new_data_part->is_temp = true;
+
+    /// Declared outside the branch so that the guard stays alive until the function returns.
+    SyncGuardPtr sync_guard;
+    if (new_data_part->isStoredOnDisk())
+    {
+        /// The temporary directory must not exist yet; an existing one is reported as a logical error.
+        if (new_data_part_storage->exists())
+        {
+            /// The path has to be unique: all tmp directories are deleted at startup, so there are no stale files from previous runs.
+            /// A new part has to capture its name first, therefore there is no concurrency in directory creation.
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                            "New empty part is about to materialize but the directory already exists"
+                            ", new part {}"
+                            ", directory {}",
+                            new_part_name, new_data_part_storage->getFullPath());
+        }
+
+        new_data_part_storage->createDirectories();
+
+        if (settings->fsync_part_directory)
+            sync_guard = new_data_part_storage->getDirectorySyncGuard();
+    }
+
+    /// This effectively chooses minimal compression method:
+    ///  either default lz4 or compression method with zero thresholds on absolute and relative part size.
+    auto compression_codec = getContext()->chooseCompressionCodec(0, 0);
+
+    const auto & index_factory = MergeTreeIndexFactory::instance();
+    MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
+        index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, txn);
+
+    bool sync_on_insert = settings->fsync_after_insert;
+
+    out.write(block);
+    /// There are no projections here, as there is no data inside.
+
+    out.finalizePart(new_data_part, sync_on_insert);
+
+    return new_data_part;
+}
+
 CurrentlySubmergingEmergingTagger::~CurrentlySubmergingEmergingTagger()
 {
    std::lock_guard lock(storage.currently_submerging_emerging_mutex);
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@ -9,6 +9,7 @@
 #include <IO/ReadBufferFromFile.h>
 #include <DataTypes/DataTypeString.h>
 #include <DataTypes/DataTypesNumber.h>
+#include <Disks/StoragePolicy.h>
 #include <Processors/Merges/Algorithms/Graphite.h>
 #include <Storages/MergeTree/BackgroundJobsAssignee.h>
 #include <Storages/MergeTree/MergeTreeIndices.h>
@ -29,7 +30,6 @@
 #include <Storages/extractKeyExpressionList.h>
 #include <Storages/PartitionCommands.h>
 #include <Interpreters/PartLog.h>
-#include <Disks/StoragePolicy.h>


 #include <boost/multi_index_container.hpp>
@ -220,6 +220,9 @@ public:
    using DataPartsLock = std::unique_lock<std::mutex>;
    DataPartsLock lockParts() const { return DataPartsLock(data_parts_mutex); }

+    using OperationDataPartsLock = std::unique_lock<std::mutex>;
+    OperationDataPartsLock lockOperationsWithParts() const { return OperationDataPartsLock(operation_with_data_parts_mutex); }
+
    MergeTreeDataPartType choosePartType(size_t bytes_uncompressed, size_t rows_count) const;
    MergeTreeDataPartType choosePartTypeOnDisk(size_t bytes_uncompressed, size_t rows_count) const;

@ -271,6 +274,8 @@ public:
            }
        }

+        TransactionID getTID() const;
+
    private:
        friend class MergeTreeData;

@ -278,7 +283,6 @@ public:
        MergeTreeTransaction * txn;
        MutableDataParts precommitted_parts;
        MutableDataParts locked_parts;
-        bool has_in_memory_parts = false;

        void clear();
    };
@ -563,10 +567,11 @@ public:
        Transaction & out_transaction);

    /// Unlocked version of previous one. Useful when added multiple parts with a single lock.
-    DataPartsVector renameTempPartAndReplaceUnlocked(
+    bool renameTempPartAndReplaceUnlocked(
        MutableDataPartPtr & part,
        Transaction & out_transaction,
-        DataPartsLock & lock);
+        DataPartsLock & lock,
+        DataPartsVector * out_covered_parts = nullptr);

    /// Remove parts from working set immediately (without wait for background
    /// process). Transfer part state to temporary. Have very limited usage only
@ -917,6 +922,9 @@ public:
    using WriteAheadLogPtr = std::shared_ptr<MergeTreeWriteAheadLog>;
    WriteAheadLogPtr getWriteAheadLog();

+    constexpr static auto EMPTY_PART_TMP_PREFIX = "tmp_empty_";
+    MergeTreeData::MutableDataPartPtr createEmptyPart(MergeTreePartInfo & new_part_info, const MergeTreePartition & partition, const String & new_part_name, const MergeTreeTransactionPtr & txn);
+
    MergeTreeDataFormatVersion format_version;

    /// Merging params - what additional actions to perform during merge.
@ -1025,7 +1033,7 @@ public:
    using MatcherFn = std::function<bool(const String &)>;

    /// Returns an object that protects temporary directory from cleanup
-    scope_guard getTemporaryPartDirectoryHolder(const String & part_dir_name);
+    scope_guard getTemporaryPartDirectoryHolder(const String & part_dir_name) const;

 protected:
    friend class IMergeTreeDataPart;
@ -1108,6 +1116,10 @@ protected:
    DataPartsIndexes::index<TagByInfo>::type & data_parts_by_info;
    DataPartsIndexes::index<TagByStateAndInfo>::type & data_parts_by_state_and_info;

+    /// Mutex for critical sections that alter the set of parts,
+    /// such as truncate and drop/detach partition.
+    mutable std::mutex operation_with_data_parts_mutex;
+
    /// Current description of columns of data type Object.
    /// It changes only when set of parts is changed and is
    /// protected by @data_parts_mutex.
@ -1217,6 +1229,23 @@ protected:
        DataPartPtr & out_covering_part,
        DataPartsLock & data_parts_lock) const;

+    DataPartsVector getCoveredOutdatedParts(
+        const DataPartPtr & part,
+        DataPartsLock & data_parts_lock) const;
+
+    /// Result of getPartHierarchy(): existing parts grouped by their relation to a given part info.
+    /// NOTE(review): the field meanings below are inferred from the names and call sites; confirm against getPartHierarchy().
+    struct PartHierarchy
+    {
+        /// Set when a duplicate of the part was found; callers report it as an "unexpected duplicate part".
+        DataPartPtr duplicate_part;
+        /// Presumably existing parts that cover the given part info; callers take the last one as the covering part.
+        DataPartsVector covering_parts;
+        /// Presumably existing parts covered by the given part info; callers return them as the parts to replace.
+        DataPartsVector covered_parts;
+        /// Presumably existing parts that intersect the given part info; callers treat a non-empty list as an error.
+        DataPartsVector intersected_parts;
+    };
+
+    PartHierarchy getPartHierarchy(
+        const MergeTreePartInfo & part_info,
+        DataPartState state,
+        DataPartsLock & /* data_parts_lock */) const;
+
    /// Checks whether the column is in the primary key, possibly wrapped in a chain of functions with single argument.
    bool isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(const ASTPtr & node, const StorageMetadataPtr & metadata_snapshot) const;

@ -1286,8 +1315,9 @@ protected:
    static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type);

 private:
-    /// Checking that candidate part doesn't break invariants: correct partition and doesn't exist already
-    void checkPartCanBeAddedToTable(MutableDataPartPtr & part, DataPartsLock & lock) const;
+    /// Checking that candidate part doesn't break invariants: correct partition
+    void checkPartPartition(MutableDataPartPtr & part, DataPartsLock & lock) const;
+    void checkPartDuplicate(MutableDataPartPtr & part, Transaction & transaction, DataPartsLock & lock) const;

    /// Preparing itself to be committed in memory: fill some fields inside part, add it to data_parts_indexes
    /// in precommitted state and to transaction
@ -1377,7 +1407,7 @@ private:

    static MutableDataPartPtr preparePartForRemoval(const DataPartPtr & part);

-    TemporaryParts temporary_parts;
+    mutable TemporaryParts temporary_parts;
 };

 /// RAII struct to record big parts that are submerging or emerging.
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
@ -244,7 +244,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge(
            * So we have to check if this part is currently being inserted with quorum and so on and so forth.
            * Obviously we have to check it manually only for the first part
            * of each partition because it will be automatically checked for a pair of parts. */
-            if (!can_merge_callback(nullptr, part, txn.get(), nullptr))
+            if (!can_merge_callback(nullptr, part, txn.get(), out_disable_reason))
                continue;

            /// This part can be merged only with next parts (no prev part exists), so start
@ -256,7 +256,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge(
        {
            /// If we cannot merge with previous part we had to start new parts
            /// interval (in the same partition)
-            if (!can_merge_callback(*prev_part, part, txn.get(), nullptr))
+            if (!can_merge_callback(*prev_part, part, txn.get(), out_disable_reason))
            {
                /// Now we have no previous part
                prev_part = nullptr;
@ -268,7 +268,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMerge(
                /// for example, merge is already assigned for such parts, or they participate in quorum inserts
                /// and so on.
                /// Also we don't start new interval here (maybe all next parts cannot be merged and we don't want to have empty interval)
-                if (!can_merge_callback(nullptr, part, txn.get(), nullptr))
+                if (!can_merge_callback(nullptr, part, txn.get(), out_disable_reason))
                    continue;

                /// Starting new interval in the same partition
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@ -408,7 +408,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read(
                    /* group_by_use_nulls */ false,
                    std::move(group_by_info),
                    std::move(group_by_sort_description),
-                    should_produce_results_in_order_of_bucket_number);
+                    should_produce_results_in_order_of_bucket_number,
+                    settings.enable_memory_bound_merging_of_aggregation_results);
                query_plan->addStep(std::move(aggregating_step));
            };

--- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
@ -732,27 +732,126 @@ int32_t ReplicatedMergeTreeQueue::pullLogsToQueue(zkutil::ZooKeeperPtr zookeeper
 namespace
 {

-Names getPartNamesToMutate(
-    const ReplicatedMergeTreeMutationEntry & mutation, const ActiveDataPartSet & parts, const DropPartsRanges & drop_ranges)
-{
-    Names result;
-    for (const auto & pair : mutation.block_numbers)
-    {
-        const String & partition_id = pair.first;
-        Int64 block_num = pair.second;

+/// Simplified representation of a queue entry. Contains two sets:
+/// 1) Which parts we will receive after the entry is executed
+/// 2) Which parts we will drop/remove after the entry is executed
+///
+/// We use this representation to understand which parts a mutation actually has to mutate.
+struct QueueEntryRepresentation
+{
+    std::vector<std::string> produced_parts; /// Names of parts that appear after the entry is executed
+    std::vector<std::string> dropped_parts; /// Names of parts or drop ranges that the entry removes
+};
+
+using QueueRepresentation = std::map<std::string, QueueEntryRepresentation>;
+
+/// Produce a map from queue znode name to the simplified representation of that entry (parts it produces and parts/ranges it drops).
+QueueRepresentation getQueueRepresentation(const std::list<ReplicatedMergeTreeLogEntryPtr> & entries, MergeTreeDataFormatVersion format_version)
+{
+    using LogEntryType = ReplicatedMergeTreeLogEntryData::Type;
+    QueueRepresentation result;
+    for (const auto & entry : entries)
+    {
+        const auto & key = entry->znode_name;
+        switch (entry->type)
+        {
+            /// Explicitly list all types of entries without a default branch, so if
+            /// someone decides to add a new type it will produce a compiler warning (an error in our case)
+            case LogEntryType::GET_PART:
+            case LogEntryType::ATTACH_PART:
+            case LogEntryType::MERGE_PARTS:
+            case LogEntryType::MUTATE_PART:
+            {
+                result[key].produced_parts.push_back(entry->new_part_name); /// For these types only new_part_name is recorded as produced
+                break;
+            }
+            case LogEntryType::REPLACE_RANGE:
+            {
+                /// Quite a tricky entry: it both produces parts and (in some cases) drops parts
+                const auto & new_parts = entry->replace_range_entry->new_part_names;
+                auto & produced_parts = result[key].produced_parts;
+                produced_parts.insert(
+                    produced_parts.end(), new_parts.begin(), new_parts.end());
+
+                if (auto drop_range = entry->getDropRange(format_version)) /// May yield nothing; then the entry only produces parts
+                {
+                    auto & dropped_parts = result[key].dropped_parts;
+                    dropped_parts.push_back(*drop_range);
+                }
+                break;
+            }
+            case LogEntryType::DROP_RANGE:
+            {
+                result[key].dropped_parts.push_back(entry->new_part_name); /// new_part_name is recorded as what this entry drops
+                break;
+            }
+            /// These entries don't produce/drop any parts
+            case LogEntryType::EMPTY:
+            case LogEntryType::ALTER_METADATA:
+            case LogEntryType::CLEAR_INDEX:
+            case LogEntryType::CLEAR_COLUMN:
+            case LogEntryType::SYNC_PINNED_PART_UUIDS:
+            case LogEntryType::CLONE_PART_FROM_SHARD:
+            {
+                break; /// Nothing is recorded, so such entries get no key in the result
+            }
+        }
+    }
+    return result;
+}
+
+/// Try to understand which part we need to mutate to finish mutation. In ReplicatedQueue we have two sets of parts:
+/// current parts -- set of parts which we actually have (on disk)
+/// virtual parts -- set of parts which we will have after we will execute our queue
+///
+/// At first glance it may seem that these two sets should be enough to understand which parts we have to mutate
+/// to finish the mutation, but that is not true:
+/// 1) Obviously we cannot rely on current_parts because we can have stale state (some parts are absent, some merges not finished). We also have to account parts which we will
+///    get after queue execution.
+/// 2) But we cannot rely on virtual_parts for this, because they contain parts which we will get after we have executed our queue. So if we need to execute mutation 0000000001 for part all_0_0_0
+///    and we have already pulled entry to mutate this part into own queue our virtual parts will contain part all_0_0_0_1, not part all_0_0_0.
+///
+/// To avoid such issues we simply traverse all entries in the queue in order and apply the diff (add parts/remove parts) to current parts if they could be affected by the mutation. Such an approach is expensive
+/// but we do it only once, when we get the mutation. After that we just update parts_to_do for each mutation when pulling entries into our queue (addPartToMutations, removePartFromMutations).
+ActiveDataPartSet getPartNamesToMutate(
+    const ReplicatedMergeTreeMutationEntry & mutation, const ActiveDataPartSet & current_parts,
+    const QueueRepresentation & queue_representation, MergeTreeDataFormatVersion format_version)
+{
+    ActiveDataPartSet result(format_version);
+    /// Traverse mutation by partition
+    for (const auto & [partition_id, block_num] : mutation.block_numbers)
+    {
        /// Note that we cannot simply count all parts to mutate using getPartsCoveredBy(appropriate part_info)
        /// because they are not consecutive in `parts`.
        MergeTreePartInfo covering_part_info(
            partition_id, 0, block_num, MergeTreePartInfo::MAX_LEVEL, MergeTreePartInfo::MAX_BLOCK_NUMBER);
-        for (const String & covered_part_name : parts.getPartsCoveredBy(covering_part_info))
+
+        /// First of all add all affected current_parts
+        for (const String & covered_part_name : current_parts.getPartsCoveredBy(covering_part_info))
        {
-            auto part_info = MergeTreePartInfo::fromPartName(covered_part_name, parts.getFormatVersion());
+            auto part_info = MergeTreePartInfo::fromPartName(covered_part_name, current_parts.getFormatVersion());
            if (part_info.getDataVersion() < block_num)
+                result.add(covered_part_name);
+        }
+
+        /// Traverse queue and update affected current_parts
+        for (const auto & [_, entry_representation] : queue_representation)
+        {
+            /// First we have to drop something if the entry drops parts
+            for (const auto & part_to_drop : entry_representation.dropped_parts)
            {
-                /// We don't need to mutate part if it's covered by DROP_RANGE
-                if (!drop_ranges.hasDropRange(part_info))
-                    result.push_back(covered_part_name);
+                auto part_to_drop_info = MergeTreePartInfo::fromPartName(part_to_drop, format_version);
+                if (part_to_drop_info.partition_id == partition_id)
+                    result.removePartAndCoveredParts(part_to_drop);
+            }
+
+            /// After that we have to add parts if the entry adds them
+            for (const auto & part_to_add : entry_representation.produced_parts)
+            {
+                auto part_to_add_info = MergeTreePartInfo::fromPartName(part_to_add, format_version);
+                if (part_to_add_info.partition_id == partition_id && part_to_add_info.getDataVersion() < block_num)
+                    result.add(part_to_add);
            }
        }
    }
@ -858,20 +957,13 @@ void ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, C
                    LOG_TRACE(log, "Adding mutation {} for partition {} for all block numbers less than {}", entry->znode_name, partition_id, block_num);
                }

-                /// Initialize `mutation.parts_to_do`.
-                /// We need to mutate all parts in `current_parts` and all parts that will appear after queue entries execution.
-                /// So, we need to mutate all parts in virtual_parts (with the corresponding block numbers).
-                Strings virtual_parts_to_mutate = getPartNamesToMutate(*entry, virtual_parts, drop_ranges);
-                for (const String & current_part_to_mutate : virtual_parts_to_mutate)
-                {
-                    assert(MergeTreePartInfo::fromPartName(current_part_to_mutate, format_version).level < MergeTreePartInfo::MAX_LEVEL);
-                    mutation.parts_to_do.add(current_part_to_mutate);
-                }
+                /// Initialize `mutation.parts_to_do`. We cannot use only current_parts + virtual_parts here so we
+                /// traverse all the queue and build correct state of parts_to_do.
+                auto queue_representation = getQueueRepresentation(queue, format_version);
+                mutation.parts_to_do = getPartNamesToMutate(*entry, virtual_parts, queue_representation, format_version);

                if (mutation.parts_to_do.size() == 0)
-                {
                    some_mutations_are_probably_done = true;
-                }

                /// otherwise it's already done
                if (entry->isAlterMutation() && entry->znode_name > mutation_pointer)
@ -1774,8 +1866,11 @@ bool ReplicatedMergeTreeQueue::tryFinalizeMutations(zkutil::ZooKeeperPtr zookeep
            }
            else if (mutation.parts_to_do.size() == 0)
            {
+                /// Why doesn't it mean that the mutation is 100% finished? Because when we were creating the parts_to_do set
+                /// some INSERT queries could be in progress. So we have to double-check that no affected committing block
+                /// numbers exist and no new parts were surprisingly committed.
                LOG_TRACE(log, "Will check if mutation {} is done", mutation.entry->znode_name);
-                candidates.push_back(mutation.entry);
+                candidates.emplace_back(mutation.entry);
            }
        }
    }
@ -1785,12 +1880,15 @@ bool ReplicatedMergeTreeQueue::tryFinalizeMutations(zkutil::ZooKeeperPtr zookeep
    else
        LOG_DEBUG(log, "Trying to finalize {} mutations", candidates.size());

+    /// We need to check committing block numbers and new parts which could be committed.
+    /// Actually we don't need most of the predicate logic here, but all the code related to committing blocks
+    /// and updating queue state is implemented there.
    auto merge_pred = getMergePredicate(zookeeper);

    std::vector<const ReplicatedMergeTreeMutationEntry *> finished;
-    for (const ReplicatedMergeTreeMutationEntryPtr & candidate : candidates)
+    for (const auto & candidate : candidates)
    {
-        if (merge_pred.isMutationFinished(*candidate))
+        if (merge_pred.isMutationFinished(candidate->znode_name, candidate->block_numbers))
            finished.push_back(candidate.get());
    }

@ -2312,9 +2410,11 @@ std::optional<std::pair<Int64, int>> ReplicatedMergeTreeMergePredicate::getDesir
 }


-bool ReplicatedMergeTreeMergePredicate::isMutationFinished(const ReplicatedMergeTreeMutationEntry & mutation) const
+bool ReplicatedMergeTreeMergePredicate::isMutationFinished(const std::string & znode_name, const std::map<String, int64_t> & block_numbers) const
 {
-    for (const auto & kv : mutation.block_numbers)
+    /// Check committing block numbers, maybe some affected inserts
+    /// still not written to disk and committed to ZK.
+    for (const auto & kv : block_numbers)
    {
        const String & partition_id = kv.first;
        Int64 block_num = kv.second;
@ -2326,24 +2426,28 @@ bool ReplicatedMergeTreeMergePredicate::isMutationFinished(const ReplicatedMerge
                partition_it->second.begin(), partition_it->second.lower_bound(block_num));
            if (blocks_count)
            {
-                LOG_TRACE(queue.log, "Mutation {} is not done yet because in partition ID {} there are still {} uncommitted blocks.", mutation.znode_name, partition_id, blocks_count);
+                LOG_TRACE(queue.log, "Mutation {} is not done yet because in partition ID {} there are still {} uncommitted blocks.", znode_name, partition_id, blocks_count);
                return false;
            }
        }
    }

+    std::lock_guard lock(queue.state_mutex);
+    /// When creating the predicate we have updated the queue. Some committing inserts can now be committed so
+    /// we check parts_to_do one more time. Also this code is async so mutation actually could be deleted from memory.
+    if (auto it = queue.mutations_by_znode.find(znode_name); it != queue.mutations_by_znode.end())
    {
-        std::lock_guard lock(queue.state_mutex);
+        if (it->second.parts_to_do.size() == 0)
+            return true;

-        size_t suddenly_appeared_parts = getPartNamesToMutate(mutation, queue.virtual_parts, queue.drop_ranges).size();
-        if (suddenly_appeared_parts)
-        {
-            LOG_TRACE(queue.log, "Mutation {} is not done yet because {} parts to mutate suddenly appeared.", mutation.znode_name, suddenly_appeared_parts);
-            return false;
-        }
+        LOG_TRACE(queue.log, "Mutation {} is not done because some parts [{}] were just committed", znode_name, fmt::join(it->second.parts_to_do.getParts(), ", "));
+        return false;
+    }
+    else
+    {
+        LOG_TRACE(queue.log, "Mutation {} is done because it doesn't exist anymore", znode_name);
+        return true;
    }
-
-    return true;
 }

 bool ReplicatedMergeTreeMergePredicate::hasDropRange(const MergeTreePartInfo & new_drop_range_info) const
--- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h
@ -517,7 +517,7 @@ public:
    /// don't glue them together. Alter is rare operation, so it shouldn't affect performance.
    std::optional<std::pair<Int64, int>> getDesiredMutationVersion(const MergeTreeData::DataPartPtr & part) const;

-    bool isMutationFinished(const ReplicatedMergeTreeMutationEntry & mutation) const;
+    bool isMutationFinished(const std::string & znode_name, const std::map<String, int64_t> & block_numbers) const;

    /// The version of "log" node that is used to check that no new merges have appeared.
    int32_t getVersion() const { return merges_version; }
--- a/src/Storages/StorageDistributed.cpp
+++ b/src/Storages/StorageDistributed.cpp
@ -703,7 +703,7 @@ void StorageDistributed::read(
            select_stream_factory, modified_query_ast,
            local_context, query_info,
            sharding_key_expr, sharding_key_column_name,
-            query_info.cluster);
+            query_info.cluster, processed_stage);
    else
        ClusterProxy::executeQuery(
            query_plan, header, processed_stage,
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@ -279,25 +279,6 @@ void StorageMergeTree::drop()
    dropAllData();
 }

-void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr local_context, TableExclusiveLockHolder &)
-{
-    {
-        /// Asks to complete merges and does not allow them to start.
-        /// This protects against "revival" of data for a removed partition after completion of merge.
-        auto merge_blocker = stopMergesAndWait();
-
-        auto data_parts_lock = lockParts();
-        auto parts_to_remove = getVisibleDataPartsVectorUnlocked(local_context, data_parts_lock);
-        removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true, data_parts_lock);
-
-        LOG_INFO(log, "Removed {} parts.", parts_to_remove.size());
-    }
-
-    clearOldMutations(true);
-    clearOldPartsFromFilesystem();
-}
-
-
 void StorageMergeTree::alter(
    const AlterCommands & commands,
    ContextPtr local_context,
@ -826,22 +807,28 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge(
    CurrentlyMergingPartsTaggerPtr merging_tagger;
    MergeList::EntryPtr merge_entry;

-    auto can_merge = [this, &lock](const DataPartPtr & left, const DataPartPtr & right, const MergeTreeTransaction * tx, String *) -> bool
+    auto can_merge = [this, &lock](const DataPartPtr & left, const DataPartPtr & right, const MergeTreeTransaction * tx, String * disable_reason) -> bool
    {
        if (tx)
        {
            /// Cannot merge parts if some of them are not visible in current snapshot
            /// TODO Transactions: We can use simplified visibility rules (without CSN lookup) here
-            if (left && !left->version.isVisible(tx->getSnapshot(), Tx::EmptyTID))
-                return false;
-            if (right && !right->version.isVisible(tx->getSnapshot(), Tx::EmptyTID))
+            if ((left && !left->version.isVisible(tx->getSnapshot(), Tx::EmptyTID))
+                    || (right && !right->version.isVisible(tx->getSnapshot(), Tx::EmptyTID)))
+            {
+                if (disable_reason)
+                    *disable_reason = "Some part is not visible in transaction";
                return false;
+            }

            /// Do not try to merge parts that are locked for removal (merge will probably fail)
-            if (left && left->version.isRemovalTIDLocked())
-                return false;
-            if (right && right->version.isRemovalTIDLocked())
+            if ((left && left->version.isRemovalTIDLocked())
+                    || (right && right->version.isRemovalTIDLocked()))
+            {
+                if (disable_reason)
+                    *disable_reason = "Some part is locked for removal in another cuncurrent transaction";
                return false;
+            }
        }

        /// This predicate is checked for the first part of each range.
@ -1398,7 +1385,6 @@ ActionLock StorageMergeTree::stopMergesAndWait()
    return merge_blocker;
 }

-
 MergeTreeDataPartPtr StorageMergeTree::outdatePart(MergeTreeTransaction * txn, const String & part_name, bool force)
 {
    if (force)
@ -1407,7 +1393,8 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(MergeTreeTransaction * txn, c
        auto merge_blocker = stopMergesAndWait();
        auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active});
        if (!part)
-            throw Exception("Part " + part_name + " not found, won't try to drop it.", ErrorCodes::NO_SUCH_DATA_PART);
+            throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found, won't try to drop it.", part_name);
+
        removePartsFromWorkingSet(txn, {part}, true);
        return part;
    }
@ -1434,72 +1421,261 @@ MergeTreeDataPartPtr StorageMergeTree::outdatePart(MergeTreeTransaction * txn, c
 void StorageMergeTree::dropPartNoWaitNoThrow(const String & part_name)
 {
    if (auto part = outdatePart(NO_TRANSACTION_RAW, part_name, /*force=*/ false))
-        dropPartsImpl({part}, /*detach=*/ false);
+    {
+        if (deduplication_log)
+        {
+            deduplication_log->dropPart(part->info);
+        }
+
+        /// Need to destroy part objects before clearing them from filesystem.
+        part.reset();
+
+        clearOldPartsFromFilesystem();
+
+        LOG_INFO(log, "Removed 1 part {}.", part_name);
+    }

    /// Else nothing to do, part was removed in some different way
 }

-void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPtr query_context)
+struct FutureNewEmptyPart
 {
-    if (auto part = outdatePart(query_context->getCurrentTransaction().get(), part_name, /*force=*/ true))
-        dropPartsImpl({part}, detach);
+    MergeTreePartInfo part_info;
+    MergeTreePartition partition;
+    std::string part_name;
+
+    scope_guard tmp_dir_guard;
+
+    StorageMergeTree::MutableDataPartPtr data_part;
+
+    std::string getDirName() const { return StorageMergeTree::EMPTY_PART_TMP_PREFIX + part_name; }
+};
+
+using FutureNewEmptyParts = std::vector<FutureNewEmptyPart>;
+
+/// Returns the names of the given future empty parts, in the same order as `parts`.
+Strings getPartsNames(const FutureNewEmptyParts & parts)
+{
+    Strings part_names;
+    /// The final size is known up front, so allocate once instead of growing on every push_back.
+    part_names.reserve(parts.size());
+    for (const auto & future_part : parts)
+        part_names.push_back(future_part.part_name);
+    return part_names;
+}

-void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, ContextPtr local_context)
+FutureNewEmptyParts initCoverageWithNewEmptyParts(const DataPartsVector & old_parts)
 {
-    DataPartsVector parts_to_remove;
-    /// New scope controls lifetime of merge_blocker.
+    FutureNewEmptyParts future_parts;
+
+    for (const auto & old_part : old_parts)
    {
-        /// Asks to complete merges and does not allow them to start.
-        /// This protects against "revival" of data for a removed partition after completion of merge.
-        auto merge_blocker = stopMergesAndWait();
-        auto data_parts_lock = lockParts();
-        const auto * partition_ast = partition->as<ASTPartition>();
-        if (partition_ast && partition_ast->all)
-            parts_to_remove = getVisibleDataPartsVectorUnlocked(local_context, data_parts_lock);
-        else
-        {
-            String partition_id = getPartitionIDFromQuery(partition, local_context, &data_parts_lock);
-            parts_to_remove = getVisibleDataPartsVectorInPartition(local_context, partition_id, data_parts_lock);
-        }
-        /// TODO should we throw an exception if parts_to_remove is empty?
-        removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), parts_to_remove, true, data_parts_lock);
+        future_parts.emplace_back();
+        auto & new_part = future_parts.back();
+
+        new_part.part_info = old_part->info;
+        new_part.part_info.level += 1;
+        new_part.partition = old_part->partition;
+        new_part.part_name = old_part->getNewName(new_part.part_info);
    }

-    dropPartsImpl(std::move(parts_to_remove), detach);
+    return future_parts;
 }

-void StorageMergeTree::dropPartsImpl(DataPartsVector && parts_to_remove, bool detach)
+StorageMergeTree::MutableDataPartsVector createEmptyDataParts(MergeTreeData & data, FutureNewEmptyParts & future_parts, const MergeTreeTransactionPtr & txn)
 {
-    auto metadata_snapshot = getInMemoryMetadataPtr();
+    StorageMergeTree::MutableDataPartsVector data_parts;
+    for (auto & part: future_parts)
+        data_parts.push_back(data.createEmptyPart(part.part_info, part.partition, part.part_name, txn));
+    return data_parts;
+}

-    if (detach)
+/// For every future empty part, obtain a holder for its temporary directory from `data`
+/// and keep it in the part's tmp_dir_guard.
+void captureTmpDirectoryHolders(MergeTreeData & data, FutureNewEmptyParts & future_parts)
+{
+    for (auto it = future_parts.begin(); it != future_parts.end(); ++it)
+    {
+        /// The directory name is derived from the part name (see FutureNewEmptyPart::getDirName).
+        const auto tmp_dir_name = it->getDirName();
+        it->tmp_dir_guard = data.getTemporaryPartDirectoryHolder(tmp_dir_name);
+    }
+}
+
+void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_parts, Transaction & transaction)
+{
+    DataPartsVector covered_parts;
+
+    for (auto & part: new_parts)
    {
-        /// If DETACH clone parts to detached/ directory
-        /// NOTE: no race with background cleanup until we hold pointers to parts
-        for (const auto & part : parts_to_remove)
+        DataPartsVector covered_parts_by_one_part = renameTempPartAndReplace(part, transaction);
+
+        if (covered_parts_by_one_part.size() > 1)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} expected to cover not more then 1 part. {} covered parts have been found. This is a bug.",
+                            part->name, covered_parts_by_one_part.size());
+
+        std::move(covered_parts_by_one_part.begin(), covered_parts_by_one_part.end(), std::back_inserter(covered_parts));
+    }
+
+    LOG_INFO(log, "Remove {} parts by covering them with empty {} parts. With txn {}.",
+             covered_parts.size(), new_parts.size(), transaction.getTID());
+
+    transaction.commit();
+
+    /// Remove covered parts without waiting for old_parts_lifetime seconds.
+    for (auto & part: covered_parts)
+        part->remove_time.store(0, std::memory_order_relaxed);
+
+    if (deduplication_log)
+        for (const auto & part : covered_parts)
+            deduplication_log->dropPart(part->info);
+}
+
+void StorageMergeTree::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr query_context, TableExclusiveLockHolder &)
+{
+    /// Truncate the table by covering every visible part with a new empty part (same part info, level + 1) instead of removing parts directly.
+    /// First ask merges to complete and do not allow them to start: this protects against "revival" of data for a removed partition after completion of merge.
+    auto merge_blocker = stopMergesAndWait();
+
+    Stopwatch watch; /// Its elapsed time is passed to PartLog::addNewParts below
+
+    auto txn = query_context->getCurrentTransaction();
+    MergeTreeData::Transaction transaction(*this, txn.get()); /// Declared before the lock scope below, so it is destroyed after that lock is released
+    {
+        auto operation_data_parts_lock = lockOperationsWithParts(); /// Held until the end of this block
+
+        auto parts = getVisibleDataPartsVector(query_context);
+
+        auto future_parts = initCoverageWithNewEmptyParts(parts); /// One future empty part per visible part
+
+        LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
+                 future_parts.size(), parts.size(),
+                 fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNamesWithStates(parts), ", "),
+                 transaction.getTID());
+
+        captureTmpDirectoryHolders(*this, future_parts); /// Stores a temporary-directory holder in every future part
+
+        auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
+        renameAndCommitEmptyParts(new_data_parts, transaction); /// Commits `transaction` and zeroes remove_time of the covered parts
+
+        PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
+
+        LOG_INFO(log, "Truncated table with {} parts by replacing them with new empty {} parts. With txn {}",
+                 parts.size(), future_parts.size(),
+                 transaction.getTID());
+    }
+
+    /// Old part objects must be destroyed before clearing them from the filesystem; `parts` went out of scope with the block above.
+    clearOldMutations(true);
+    clearOldPartsFromFilesystem();
+    clearEmptyParts();
+}
+
+void StorageMergeTree::dropPart(const String & part_name, bool detach, ContextPtr query_context)
+{
+    /// Asks to complete merges and does not allow them to start.
+    /// This protects against "revival" of data for a removed partition after completion of merge.
+    auto merge_blocker = stopMergesAndWait();
+
+    Stopwatch watch;
+
+    /// It's important to create it outside of lock scope because
+    /// otherwise it can lock parts in destructor and deadlock is possible.
+    auto txn = query_context->getCurrentTransaction();
+    MergeTreeData::Transaction transaction(*this, txn.get());
+    {
+        auto operation_data_parts_lock = lockOperationsWithParts();
+
+        auto part = getPartIfExists(part_name, {MergeTreeDataPartState::Active});
+        if (!part)
+            throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found, won't try to drop it.", part_name);
+
+        if (detach)
        {
+            auto metadata_snapshot = getInMemoryMetadataPtr();
            LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
            part->makeCloneInDetached("", metadata_snapshot);
        }
+
+        {
+            auto future_parts = initCoverageWithNewEmptyParts({part});
+
+            LOG_TEST(log, "Made {} empty parts in order to cover {} part. With txn {}",
+                     fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames({part}), ", "),
+                     transaction.getTID());
+
+            captureTmpDirectoryHolders(*this, future_parts);
+
+            auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
+            renameAndCommitEmptyParts(new_data_parts, transaction);
+
+            PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
+
+            const auto * op = detach ? "Detached" : "Dropped";
+            LOG_INFO(log, "{} {} part by replacing it with new empty {} part. With txn {}",
+                     op, part->name, future_parts[0].part_name,
+                     transaction.getTID());
+        }
    }

-    if (deduplication_log)
-    {
-        for (const auto & part : parts_to_remove)
-            deduplication_log->dropPart(part->info);
-    }
-
-    if (detach)
-        LOG_INFO(log, "Detached {} parts.", parts_to_remove.size());
-    else
-        LOG_INFO(log, "Removed {} parts.", parts_to_remove.size());
-
-    /// Need to destroy part objects before clearing them from filesystem.
-    parts_to_remove.clear();
+    /// Old part objects need to be destroyed before clearing them from filesystem.
+    clearOldMutations(true);
    clearOldPartsFromFilesystem();
+    clearEmptyParts();
 }

+void StorageMergeTree::dropPartition(const ASTPtr & partition, bool detach, ContextPtr query_context)
+{
+    const auto * partition_ast = partition->as<ASTPartition>(); /// Only used below to check the `all` flag
+
+    /// Drop (or detach) a partition by covering its visible parts with new empty parts. First ask merges to complete and do not allow them to start.
+    /// This protects against "revival" of data for a removed partition after completion of merge.
+    auto merge_blocker = stopMergesAndWait();
+
+    Stopwatch watch; /// Its elapsed time is passed to PartLog::addNewParts below
+
+    /// It's important to create the transaction outside of the lock scope because
+    /// otherwise it can lock parts in its destructor and a deadlock is possible.
+    auto txn = query_context->getCurrentTransaction();
+    MergeTreeData::Transaction transaction(*this, txn.get());
+    {
+        auto operation_data_parts_lock = lockOperationsWithParts(); /// Held until the end of this block
+
+        DataPartsVector parts;
+        {
+            if (partition_ast && partition_ast->all)
+                parts = getVisibleDataPartsVector(query_context); /// `all` is set: take every visible part of the table
+            else
+            {
+                String partition_id = getPartitionIDFromQuery(partition, query_context);
+                parts = getVisibleDataPartsVectorInPartition(query_context, partition_id);
+            }
+        }
+
+        if (detach)
+            for (const auto & part : parts)
+            {
+                auto metadata_snapshot = getInMemoryMetadataPtr();
+                LOG_INFO(log, "Detaching {}", part->getDataPartStorage().getPartDirectory());
+                part->makeCloneInDetached("", metadata_snapshot); /// Clone the part into detached before it gets covered below
+            }
+
+        auto future_parts = initCoverageWithNewEmptyParts(parts); /// One future empty part per selected part
+
+        LOG_TEST(log, "Made {} empty parts in order to cover {} parts. Empty parts: {}, covered parts: {}. With txn {}",
+                 future_parts.size(), parts.size(),
+                 fmt::join(getPartsNames(future_parts), ", "), fmt::join(getPartsNames(parts), ", "),
+                 transaction.getTID());
+
+        captureTmpDirectoryHolders(*this, future_parts); /// Stores a temporary-directory holder in every future part
+
+        auto new_data_parts = createEmptyDataParts(*this, future_parts, txn);
+        renameAndCommitEmptyParts(new_data_parts, transaction); /// Commits `transaction` and zeroes remove_time of the covered parts
+
+        PartLog::addNewParts(query_context, new_data_parts, watch.elapsed());
+
+        const auto * op = detach ? "Detached" : "Dropped";
+        LOG_INFO(log, "{} partition with {} parts by replacing them with new empty {} parts. With txn {}",
+                 op, parts.size(), future_parts.size(),
+                 transaction.getTID());
+    }
+
+    /// Old part objects must be destroyed before clearing them from the filesystem; `parts` went out of scope with the block above.
+    clearOldMutations(true);
+    clearOldPartsFromFilesystem();
+    clearEmptyParts();
+}

 PartitionCommandsResultInfo StorageMergeTree::attachPartition(
    const ASTPtr & partition, const StorageMetadataPtr & /* metadata_snapshot */,
--- a/src/Storages/StorageMergeTree.h
+++ b/src/Storages/StorageMergeTree.h
@ -169,6 +169,8 @@ private:
            String * out_disable_reason = nullptr,
            bool optimize_skip_merged_partitions = false);

+    void renameAndCommitEmptyParts(MutableDataPartsVector & new_parts, Transaction & transaction);
+
    /// Make part state outdated and queue it to remove without timeout
    /// If force, then stop merges and block them until part state became outdated. Throw exception if part doesn't exists
    /// If not force, then take merges selector and check that part is not participating in background operations.
@ -217,7 +219,6 @@ private:
    void dropPartNoWaitNoThrow(const String & part_name) override;
    void dropPart(const String & part_name, bool detach, ContextPtr context) override;
    void dropPartition(const ASTPtr & partition, bool detach, ContextPtr context) override;
-    void dropPartsImpl(DataPartsVector && parts_to_remove, bool detach);
    PartitionCommandsResultInfo attachPartition(const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, bool part, ContextPtr context) override;

    void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr context) override;
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@ -67,7 +67,6 @@
 #include <IO/Operators.h>
 #include <IO/ConnectionTimeouts.h>
 #include <IO/ConnectionTimeoutsContext.h>
-#include <Disks/createVolume.h>

 #include <Interpreters/InterpreterAlterQuery.h>
 #include <Interpreters/PartLog.h>
@ -131,7 +130,7 @@ namespace ErrorCodes
    extern const int NO_ZOOKEEPER;
    extern const int INCORRECT_DATA;
    extern const int INCOMPATIBLE_COLUMNS;
-    extern const int REPLICA_IS_ALREADY_EXIST;
+    extern const int REPLICA_ALREADY_EXISTS;
    extern const int NO_REPLICA_HAS_PART;
    extern const int LOGICAL_ERROR;
    extern const int TOO_MANY_UNEXPECTED_DATA_PARTS;
@ -779,7 +778,7 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr
    /// Do not use LOGICAL_ERROR code, because it may happen if user has specified wrong zookeeper_path
    throw Exception("Cannot create table, because it is created concurrently every time "
                    "or because of wrong zookeeper_path "
-                    "or because of logical error", ErrorCodes::REPLICA_IS_ALREADY_EXIST);
+                    "or because of logical error", ErrorCodes::REPLICA_ALREADY_EXISTS);
 }

 void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metadata_snapshot)
@ -843,7 +842,7 @@ void StorageReplicatedMergeTree::createReplica(const StorageMetadataPtr & metada
        switch (code)
        {
            case Coordination::Error::ZNODEEXISTS:
-                throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} already exists", replica_path);
+                throw Exception(ErrorCodes::REPLICA_ALREADY_EXISTS, "Replica {} already exists", replica_path);
            case Coordination::Error::ZBADVERSION:
                LOG_ERROR(log, "Retrying createReplica(), because some other replicas were created at the same time");
                break;
@ -1554,7 +1553,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)

    if (entry.type == LogEntry::ATTACH_PART)
    {
-        if (MutableDataPartPtr part = attachPartHelperFoundValidPart(entry); part)
+        if (MutableDataPartPtr part = attachPartHelperFoundValidPart(entry))
        {
            LOG_TRACE(log, "Found valid local part for {}, preparing the transaction", part->name);

@ -7645,7 +7644,15 @@ void StorageReplicatedMergeTree::createTableSharedID() const
        return;
    }

-    auto zookeeper = getZooKeeper();
+    /// We may call getTableSharedID when the table is shut down. If an exception happens, the restarting thread will
+    /// already be turned off and nobody will reconnect our ZooKeeper connection. In this case we use the ZooKeeper
+    /// connection from the context.
+    ZooKeeperPtr zookeeper;
+    if (shutdown_called.load())
+        zookeeper = getZooKeeperIfTableShutDown();
+    else
+        zookeeper = getZooKeeper();
+
    String zookeeper_table_id_path = fs::path(zookeeper_path) / "table_shared_id";
    String id;
    if (!zookeeper->tryGet(zookeeper_table_id_path, id))
@ -8265,56 +8272,25 @@ bool StorageReplicatedMergeTree::checkIfDetachedPartitionExists(const String & p
 bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name)
 {
    LOG_INFO(log, "Going to replace lost part {} with empty part", lost_part_name);
-    auto metadata_snapshot = getInMemoryMetadataPtr();
-    auto settings = getSettings();
-
-    constexpr static auto TMP_PREFIX = "tmp_empty_";

    auto new_part_info = MergeTreePartInfo::fromPartName(lost_part_name, format_version);
-    auto block = metadata_snapshot->getSampleBlock();

-    DB::IMergeTreeDataPart::TTLInfos move_ttl_infos;
-
-    NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames());
-    ReservationPtr reservation = reserveSpacePreferringTTLRules(metadata_snapshot, 0, move_ttl_infos, time(nullptr), 0, true);
-    VolumePtr volume = getStoragePolicy()->getVolume(0);
-
-    auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
-    minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
-
-    auto new_volume = createVolumeFromReservation(reservation, volume);
-
-    auto data_part_storage = std::make_shared<DataPartStorageOnDisk>(
-        new_volume,
-        relative_data_path,
-        TMP_PREFIX + lost_part_name);
-
-    data_part_storage->beginTransaction();
-
-    auto new_data_part = createPart(
-        lost_part_name,
-        choosePartType(0, block.rows()),
-        new_part_info,
-        data_part_storage);
-
-    if (settings->assign_part_uuids)
-        new_data_part->uuid = UUIDHelpers::generateV4();
-
-    new_data_part->setColumns(columns, {});
-    new_data_part->rows_count = block.rows();
+    auto metadata_snapshot = getInMemoryMetadataPtr();

+    MergeTreePartition partition;
    {
-        auto lock = lockParts();
+        DataPartsLock lock = lockParts();
+
        auto parts_in_partition = getDataPartsPartitionRange(new_part_info.partition_id);
        if (!parts_in_partition.empty())
        {
-            new_data_part->partition = (*parts_in_partition.begin())->partition;
+            partition = (*parts_in_partition.begin())->partition;
        }
        else if (auto parsed_partition = MergeTreePartition::tryParseValueFromID(
                     new_part_info.partition_id,
                     metadata_snapshot->getPartitionKey().sample_block))
        {
-            new_data_part->partition = MergeTreePartition(*parsed_partition);
+            partition = MergeTreePartition(*parsed_partition);
        }
        else
        {
@ -8322,43 +8298,10 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
                             "resolve this manually using DROP/DETACH PARTITION.", lost_part_name, new_part_info.partition_id);
            return false;
        }
-
    }

-    new_data_part->minmax_idx = std::move(minmax_idx);
-    new_data_part->is_temp = true;
-
-    SyncGuardPtr sync_guard;
-    if (new_data_part->isStoredOnDisk())
-    {
-        /// The name could be non-unique in case of stale files from previous runs.
-        if (data_part_storage->exists())
-        {
-            LOG_WARNING(log, "Removing old temporary directory {}", new_data_part->getDataPartStorage().getFullPath());
-            data_part_storage->removeRecursive();
-        }
-
-        data_part_storage->createDirectories();
-
-        if (getSettings()->fsync_part_directory)
-            sync_guard = data_part_storage->getDirectorySyncGuard();
-    }
-
-    /// This effectively chooses minimal compression method:
-    ///  either default lz4 or compression method with zero thresholds on absolute and relative part size.
-    auto compression_codec = getContext()->chooseCompressionCodec(0, 0);
-
-    const auto & index_factory = MergeTreeIndexFactory::instance();
-    MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns,
-        index_factory.getMany(metadata_snapshot->getSecondaryIndices()), compression_codec, NO_TRANSACTION_PTR);
-
-    bool sync_on_insert = settings->fsync_after_insert;
-
-    out.write(block);
-    /// TODO(ab): What projections should we add to the empty part? How can we make sure that it
-    /// won't block future merges? Perhaps we should also check part emptiness when selecting parts
-    /// to merge.
-    out.finalizePart(new_data_part, sync_on_insert);
+    MergeTreeData::MutableDataPartPtr new_data_part = createEmptyPart(new_part_info, partition, lost_part_name, NO_TRANSACTION_PTR);
+    new_data_part->name = lost_part_name;

    try
    {
--- a/src/Storages/System/StorageSystemAsynchronousInserts.cpp
+++ b/src/Storages/System/StorageSystemAsynchronousInserts.cpp
@ -27,8 +27,6 @@ NamesAndTypesList StorageSystemAsynchronousInserts::getNamesAndTypes()
        {"total_bytes", std::make_shared<DataTypeUInt64>()},
        {"entries.query_id", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
        {"entries.bytes", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>())},
-        {"entries.finished", std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt8>())},
-        {"entries.exception", std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>())},
    };
 }

@ -40,78 +38,56 @@ void StorageSystemAsynchronousInserts::fillData(MutableColumns & res_columns, Co
    if (!insert_queue)
        return;

-    auto [queue, queue_lock] = insert_queue->getQueueLocked();
-    for (const auto & [key, elem] : queue)
+    for (size_t shard_num = 0; shard_num < insert_queue->getPoolSize(); ++shard_num)
    {
-        std::lock_guard elem_lock(elem->mutex);
+        auto [queue, queue_lock] = insert_queue->getQueueLocked(shard_num);

-        if (!elem->data)
-            continue;
-
-        auto time_in_microseconds = [](const time_point<steady_clock> & timestamp)
+        for (const auto & [first_update, elem] : queue)
        {
-            auto time_diff = duration_cast<microseconds>(steady_clock::now() - timestamp);
-            auto time_us = (system_clock::now() - time_diff).time_since_epoch().count();
+            const auto & [key, data] = elem;

-            DecimalUtils::DecimalComponents<DateTime64> components{time_us / 1'000'000, time_us % 1'000'000};
-            return DecimalField(DecimalUtils::decimalFromComponents<DateTime64>(components, TIME_SCALE), TIME_SCALE);
-        };
-
-        const auto & insert_query = key.query->as<const ASTInsertQuery &>();
-        size_t i = 0;
-
-        res_columns[i++]->insert(queryToString(insert_query));
-
-        /// If query is "INSERT INTO FUNCTION" then table_id is empty.
-        if (insert_query.table_id)
-        {
-            res_columns[i++]->insert(insert_query.table_id.getDatabaseName());
-            res_columns[i++]->insert(insert_query.table_id.getTableName());
-        }
-        else
-        {
-            res_columns[i++]->insertDefault();
-            res_columns[i++]->insertDefault();
-        }
-
-        res_columns[i++]->insert(insert_query.format);
-        res_columns[i++]->insert(time_in_microseconds(elem->data->first_update));
-        res_columns[i++]->insert(elem->data->size);
-
-        Array arr_query_id;
-        Array arr_bytes;
-        Array arr_finished;
-        Array arr_exception;
-
-        for (const auto & entry : elem->data->entries)
-        {
-            arr_query_id.push_back(entry->query_id);
-            arr_bytes.push_back(entry->bytes.size());
-            arr_finished.push_back(entry->isFinished());
-
-            if (auto exception = entry->getException())
+            auto time_in_microseconds = [](const time_point<steady_clock> & timestamp)
            {
-                try
-                {
-                    std::rethrow_exception(exception);
-                }
-                catch (const Exception & e)
-                {
-                    arr_exception.push_back(e.displayText());
-                }
-                catch (...)
-                {
-                    arr_exception.push_back("Unknown exception");
-                }
+                auto time_diff = duration_cast<microseconds>(steady_clock::now() - timestamp);
+                auto time_us = (system_clock::now() - time_diff).time_since_epoch().count();
+
+                DecimalUtils::DecimalComponents<DateTime64> components{time_us / 1'000'000, time_us % 1'000'000};
+                return DecimalField(DecimalUtils::decimalFromComponents<DateTime64>(components, TIME_SCALE), TIME_SCALE);
+            };
+
+            const auto & insert_query = key.query->as<const ASTInsertQuery &>();
+            size_t i = 0;
+
+            res_columns[i++]->insert(queryToString(insert_query));
+
+            /// If query is "INSERT INTO FUNCTION" then table_id is empty.
+            if (insert_query.table_id)
+            {
+                res_columns[i++]->insert(insert_query.table_id.getDatabaseName());
+                res_columns[i++]->insert(insert_query.table_id.getTableName());
            }
            else
-                arr_exception.push_back("");
-        }
+            {
+                res_columns[i++]->insertDefault();
+                res_columns[i++]->insertDefault();
+            }

-        res_columns[i++]->insert(arr_query_id);
-        res_columns[i++]->insert(arr_bytes);
-        res_columns[i++]->insert(arr_finished);
-        res_columns[i++]->insert(arr_exception);
+            res_columns[i++]->insert(insert_query.format);
+            res_columns[i++]->insert(time_in_microseconds(first_update));
+            res_columns[i++]->insert(data->size_in_bytes);
+
+            Array arr_query_id;
+            Array arr_bytes;
+
+            for (const auto & entry : data->entries)
+            {
+                arr_query_id.push_back(entry->query_id);
+                arr_bytes.push_back(entry->bytes.size());
+            }
+
+            res_columns[i++]->insert(arr_query_id);
+            res_columns[i++]->insert(arr_bytes);
+        }
    }
 }

--- a/src/Storages/System/StorageSystemParts.cpp
+++ b/src/Storages/System/StorageSystemParts.cpp
@ -195,21 +195,22 @@ void StorageSystemParts::processNextStorage(
        if (columns_mask[src_index++])
            columns[res_index++]->insert(info.engine);

-        if (part->isStoredOnDisk())
+        if (columns_mask[src_index++])
        {
-            if (columns_mask[src_index++])
+            if (part->isStoredOnDisk())
                columns[res_index++]->insert(part->getDataPartStorage().getDiskName());
-            if (columns_mask[src_index++])
-                columns[res_index++]->insert(part->getDataPartStorage().getFullPath());
-        }
-        else
-        {
-            if (columns_mask[src_index++])
-                columns[res_index++]->insertDefault();
-            if (columns_mask[src_index++])
+            else
                columns[res_index++]->insertDefault();
        }

+        if (columns_mask[src_index++])
+        {
+            // The cleanup thread changes the full path while the part is in the Deleting state; do not read it, to avoid a race
+            if (part->isStoredOnDisk() && part_state != State::Deleting)
+                columns[res_index++]->insert(part->getDataPartStorage().getFullPath());
+            else
+                columns[res_index++]->insertDefault();
+        }

        {
            MinimalisticDataPartChecksums helper;
--- a/src/Storages/System/StorageSystemPartsColumns.cpp
+++ b/src/Storages/System/StorageSystemPartsColumns.cpp
@ -192,7 +192,13 @@ void StorageSystemPartsColumns::processNextStorage(
            if (columns_mask[src_index++])
                columns[res_index++]->insert(part->getDataPartStorage().getDiskName());
            if (columns_mask[src_index++])
-                columns[res_index++]->insert(part->getDataPartStorage().getFullPath());
+            {
+                // The cleanup thread changes the full path while the part is in the Deleting state; do not read it, to avoid a race
+                if (part_state != State::Deleting)
+                    columns[res_index++]->insert(part->getDataPartStorage().getFullPath());
+                else
+                    columns[res_index++]->insertDefault();
+            }

            if (columns_mask[src_index++])
                columns[res_index++]->insert(column.name);
--- a/tests/ci/ci_runners_metrics_lambda/app.py
+++ b/tests/ci/ci_runners_metrics_lambda/app.py
@ -11,6 +11,17 @@ import requests
 import boto3
 from botocore.exceptions import ClientError

+UNIVERSAL_LABEL = "universal"
+RUNNER_TYPE_LABELS = [
+    "builder",
+    "func-tester",
+    "func-tester-aarch64",
+    "fuzzer-unit-tester",
+    "stress-tester",
+    "style-checker",
+    "style-checker-aarch64",
+]
+

 def get_dead_runners_in_ec2(runners):
    ids = {
@ -170,26 +181,23 @@ def list_runners(access_token):
 def group_runners_by_tag(listed_runners):
    result = {}

-    RUNNER_TYPE_LABELS = [
-        "builder",
-        "func-tester",
-        "func-tester-aarch64",
-        "fuzzer-unit-tester",
-        "stress-tester",
-        "style-checker",
-        "style-checker-aarch64",
-    ]
+    def add_to_result(tag, runner):
+        if tag not in result:
+            result[tag] = []
+        result[tag].append(runner)
+
    for runner in listed_runners:
+        if UNIVERSAL_LABEL in runner.tags:
+            # Do not process other labels if UNIVERSAL_LABEL is included
+            add_to_result(UNIVERSAL_LABEL, runner)
+            continue
+
        for tag in runner.tags:
            if tag in RUNNER_TYPE_LABELS:
-                if tag not in result:
-                    result[tag] = []
-                result[tag].append(runner)
+                add_to_result(tag, runner)
                break
        else:
-            if "unlabeled" not in result:
-                result["unlabeled"] = []
-            result["unlabeled"].append(runner)
+            add_to_result("unlabeled", runner)
    return result


--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				`Subproject commit 3078dc6039f8c0bffcb1904f81cfe6b2c3209435`