Merge remote-tracking branch 'blessed/master' into parallel_replicas_row_estimation

Raúl Marín 2023-10-19 15:20:50 +00:00
commit 4a53943926
138 changed files with 4733 additions and 2195 deletions

View File

@ -77,6 +77,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:
@ -185,6 +186,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # For a proper version and performance artifacts
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -227,6 +229,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # For a proper version and performance artifacts
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -399,6 +402,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -448,6 +452,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -487,6 +492,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
filter: tree:0
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"

View File

@ -18,6 +18,7 @@ on: # yamllint disable-line rule:truthy
- 'docs/**'
- 'utils/check-style/aspell-ignore/**'
- 'tests/ci/docs_check.py'
- '.github/workflows/docs_check.yml'
jobs:
CheckLabels:
runs-on: [self-hosted, style-checker]
@ -73,6 +74,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:

View File

@ -24,6 +24,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0
filter: tree:0
- name: Jepsen Test
run: |
sudo rm -fr "$TEMP_PATH"
@ -53,6 +54,7 @@ jobs:
# with:
# clear-repository: true
# fetch-depth: 0
# filter: tree:0
# - name: Jepsen Test
# run: |
# sudo rm -fr "$TEMP_PATH"

View File

@ -61,6 +61,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:
@ -200,6 +201,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # For a proper version and performance artifacts
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -242,6 +244,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # For a proper version and performance artifacts
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -283,6 +286,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -581,6 +585,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -630,6 +635,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -672,6 +678,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -714,6 +721,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -763,6 +771,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -805,6 +814,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -847,6 +857,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -889,6 +900,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -931,6 +943,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -963,6 +976,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
filter: tree:0
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"

View File

@ -54,6 +54,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:
@ -90,6 +91,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
filter: tree:0
submodules: true
- name: Set up JDK 11
uses: actions/setup-java@v1

View File

@ -18,6 +18,7 @@ on: # yamllint disable-line rule:truthy
- 'docs/**'
- 'utils/check-style/aspell-ignore/**'
- 'tests/ci/docs_check.py'
- '.github/workflows/docs_check.yml'
##########################################################################################
##################################### SMALL CHECKS #######################################
##########################################################################################
@ -94,6 +95,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:
@ -266,6 +268,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # for performance artifact
filter: tree:0
submodules: true
- name: Build
run: |
@ -350,6 +353,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # for performance artifact
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -1021,6 +1025,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
filter: tree:0
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"

View File

@ -49,6 +49,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # otherwise we will have no version info
filter: tree:0
ref: ${{ env.GITHUB_TAG }}
- name: Check docker clickhouse/clickhouse-server building
run: |

View File

@ -53,6 +53,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # to find ancestor merge commits necessary for finding proper docker tags
filter: tree:0
- name: Download changed aarch64 images
uses: actions/download-artifact@v3
with:
@ -161,6 +162,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -203,6 +205,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # For a proper version and performance artifacts
filter: tree:0
- name: Build
run: |
sudo rm -fr "$TEMP_PATH"
@ -456,6 +459,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -505,6 +509,7 @@ jobs:
clear-repository: true
submodules: true
fetch-depth: 0 # otherwise we will have no info about contributors
filter: tree:0
- name: Apply sparse checkout for contrib # in order to check that it doesn't break build
run: |
rm -rf "$GITHUB_WORKSPACE/contrib" && echo 'removed'
@ -544,6 +549,7 @@ jobs:
with:
clear-repository: true
fetch-depth: 0 # It MUST BE THE SAME for all dependencies and the job itself
filter: tree:0
- name: Check docker clickhouse/clickhouse-server building
run: |
cd "$GITHUB_WORKSPACE/tests/ci"

View File

@ -38,6 +38,7 @@ jobs:
with:
ref: master
fetch-depth: 0
filter: tree:0
- name: Update versions, docker version, changelog, security
env:
GITHUB_TOKEN: ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }}

contrib/grpc vendored

@ -1 +1 @@
Subproject commit 3f975ecab377cd5f739af780566596128f17bb74
Subproject commit c52656e2bfcda3450bd6a7c247d2d9eeb8498524

View File

@ -24,6 +24,12 @@ else ()
set (SNAPPY_HAVE_SSSE3 0)
endif ()
if (ARCH_AMD64 AND ENABLE_SSE42)
set (SNAPPY_HAVE_X86_CRC32 1)
else ()
set (SNAPPY_HAVE_X86_CRC32 0)
endif ()
configure_file(
"${SOURCE_DIR}/cmake/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/config.h")

View File

@ -2,8 +2,8 @@
# If the image is built from Dockerfile.alpine, then the `-alpine` suffix is added automatically,
# so the only purpose of Dockerfile.ubuntu is to push `latest`, `head` and so on w/o suffixes
FROM ubuntu:20.04 AS glibc-donor
ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \
&& case $arch in \
amd64) rarch=x86_64 ;; \
@ -31,7 +31,9 @@ RUN arch=${TARGETARCH:-amd64} \
arm64) ln -sf /lib/ld-2.31.so /lib/ld-linux-aarch64.so.1 ;; \
esac
ARG REPOSITORY="https://s3.amazonaws.com/clickhouse-builds/22.4/31c367d3cd3aefd316778601ff6565119fe36682/package_release"
# lts / testing / prestable / etc
ARG REPO_CHANNEL="stable"
ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}"
ARG VERSION="23.9.1.1854"
ARG PACKAGES="clickhouse-keeper"
@ -46,16 +48,14 @@ ARG PACKAGES="clickhouse-keeper"
ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \
&& for package in ${PACKAGES}; do \
{ \
{ echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" -O "/tmp/${package}-${VERSION}-${arch}.tgz" \
&& tar xvzf "/tmp/${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / ; \
} || \
{ echo "Fallback to ${REPOSITORY}/${package}-${VERSION}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}.tgz" -O "/tmp/${package}-${VERSION}.tgz" \
&& tar xvzf "/tmp/${package}-${VERSION}.tgz" --strip-components=2 -C / ; \
} ; \
} || exit 1 \
( \
cd /tmp \
&& echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz.sha512" \
&& sed 's:/output/:/tmp/:' < "${package}-${VERSION}-${arch}.tgz.sha512" | sha512sum -c \
&& tar xvzf "${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / \
) \
; done \
&& rm /tmp/*.tgz /install -r \
&& addgroup -S -g 101 clickhouse \

View File

@ -172,10 +172,15 @@ then
# This is why we add this repository snapshot from CI to the performance test
# package.
mkdir "$PERF_OUTPUT"/ch
git -C "$PERF_OUTPUT"/ch init --bare
git -C "$PERF_OUTPUT"/ch remote add origin /build
git -C "$PERF_OUTPUT"/ch fetch --no-tags --depth 50 origin HEAD:pr
git -C "$PERF_OUTPUT"/ch fetch --no-tags --depth 50 origin master:master
# Copy .git only, but skip modules, using tar
tar c -C /build/ --exclude='.git/modules/**' .git | tar x -C "$PERF_OUTPUT"/ch
# Create branch pr and origin/master to have them for the following performance comparison
git -C "$PERF_OUTPUT"/ch branch pr
git -C "$PERF_OUTPUT"/ch fetch --no-tags --depth 50 origin master:origin/master
# Clean up the remote so it does not become stale
git -C "$PERF_OUTPUT"/ch remote | xargs -n1 git -C "$PERF_OUTPUT"/ch remote remove
# And clean all tags
git -C "$PERF_OUTPUT"/ch tag | xargs git -C "$PERF_OUTPUT"/ch tag -d
git -C "$PERF_OUTPUT"/ch reset --soft pr
git -C "$PERF_OUTPUT"/ch log -5
(

View File

@ -23,7 +23,6 @@ COPY docker_related_config.xml /etc/clickhouse-server/config.d/
COPY entrypoint.sh /entrypoint.sh
ARG TARGETARCH
RUN arch=${TARGETARCH:-amd64} \
&& case $arch in \
amd64) mkdir -p /lib64 && ln -sf /lib/ld-2.31.so /lib64/ld-linux-x86-64.so.2 ;; \
@ -45,16 +44,14 @@ ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static"
RUN arch=${TARGETARCH:-amd64} \
&& for package in ${PACKAGES}; do \
{ \
{ echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" -O "/tmp/${package}-${VERSION}-${arch}.tgz" \
&& tar xvzf "/tmp/${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / ; \
} || \
{ echo "Fallback to ${REPOSITORY}/${package}-${VERSION}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}.tgz" -O "/tmp/${package}-${VERSION}.tgz" \
&& tar xvzf "/tmp/${package}-${VERSION}.tgz" --strip-components=2 -C / ; \
} ; \
} || exit 1 \
( \
cd /tmp \
&& echo "Get ${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz" \
&& wget -c -q "${REPOSITORY}/${package}-${VERSION}-${arch}.tgz.sha512" \
&& sed 's:/output/:/tmp/:' < "${package}-${VERSION}-${arch}.tgz.sha512" | sha512sum -c \
&& tar xvzf "${package}-${VERSION}-${arch}.tgz" --strip-components=1 -C / \
) \
; done \
&& rm /tmp/*.tgz /install -r \
&& addgroup -S -g 101 clickhouse \

View File

@ -5,6 +5,13 @@ ARG DEBIAN_FRONTEND=noninteractive
# ARG for quick switch to a given ubuntu mirror
ARG apt_archive="http://archive.ubuntu.com"
# user/group precreated explicitly with fixed uid/gid on purpose.
# It is especially important for rootless containers: in that case the entrypoint
# can't do chown, and the owners of mounted volumes should be configured externally.
# We do that in advance, at the beginning of the Dockerfile, before any packages are
# installed, to prevent some unrelated software from picking those uid / gid.
# The same uid / gid (101) is used both for alpine and ubuntu.
RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list \
&& groupadd -r clickhouse --gid=101 \
&& useradd -r -g clickhouse --uid=101 --home-dir=/var/lib/clickhouse --shell=/bin/bash clickhouse \
@ -35,13 +42,6 @@ ARG deb_location_url=""
# from a single binary url (useful for non-standard builds - with sanitizers, for arm64).
ARG single_binary_location_url=""
# user/group precreated explicitly with fixed uid/gid on purpose.
# It is especially important for rootless containers: in that case entrypoint
# can't do chown and owners of mounted volumes should be configured externally.
# We do that in advance at the begining of Dockerfile before any packages will be
# installed to prevent picking those uid / gid by some unrelated software.
# The same uid / gid (101) is used both for alpine and ubuntu.
ARG TARGETARCH
# install from a web location with deb packages

View File

@ -337,8 +337,8 @@ quit
# which is confusing.
task_exit_code=$fuzzer_exit_code
echo "failure" > status.txt
{ rg --text -o "Found error:.*" fuzzer.log \
|| rg --text -ao "Exception:.*" fuzzer.log \
{ rg -ao "Found error:.*" fuzzer.log \
|| rg -ao "Exception:.*" fuzzer.log \
|| echo "Fuzzer failed ($fuzzer_exit_code). See the logs." ; } \
| tail -1 > description.txt
fi

View File

@ -61,11 +61,13 @@ function configure()
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
}
# Randomize all Keeper feature flags
randomize_config_boolean_value filtered_list
randomize_config_boolean_value multi_read
randomize_config_boolean_value check_not_exists
randomize_config_boolean_value create_if_not_exists
if [[ -n "$RANDOMIZE_KEEPER_FEATURE_FLAGS" ]] && [[ "$RANDOMIZE_KEEPER_FEATURE_FLAGS" -eq 1 ]]; then
# Randomize all Keeper feature flags
randomize_config_boolean_value filtered_list
randomize_config_boolean_value multi_read
randomize_config_boolean_value check_not_exists
randomize_config_boolean_value create_if_not_exists
fi
sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

View File

@ -67,6 +67,48 @@ This check means that the CI system started to process the pull request. When it
Performs some simple regex-based checks of code style, using the [`utils/check-style/check-style`](https://github.com/ClickHouse/ClickHouse/blob/master/utils/check-style/check-style) binary (note that it can be run locally).
If it fails, fix the style errors following the [code style guide](style.md).
#### Running style check locally:
```sh
mkdir -p /tmp/test_output
# running all checks
docker run --rm --volume=.:/ClickHouse --volume=/tmp/test_output:/test_output -u $(id -u ${USER}):$(id -g ${USER}) --cap-add=SYS_PTRACE clickhouse/style-test
# run specified check script (e.g.: ./check-mypy)
docker run --rm --volume=.:/ClickHouse --volume=/tmp/test_output:/test_output -u $(id -u ${USER}):$(id -g ${USER}) --cap-add=SYS_PTRACE --entrypoint= -w/ClickHouse/utils/check-style clickhouse/style-test ./check-mypy
# find all style check scripts under the directory:
cd ./utils/check-style
# Check duplicate includes
./check-duplicate-includes.sh
# Check C++ formatting
./check-style
# Check python formatting with black
./check-black
# Check python type hinting with mypy
./check-mypy
# Check code with codespell
./check-typos
# Check docs spelling
./check-doc-aspell
# Check whitespaces
./check-whitespaces
# Check github actions workflows
./check-workflows
# Check submodules
./check-submodules
# Check shell scripts with shellcheck
./shellcheck-run.sh
```
## Fast Test
Normally this is the first check that is run for a PR. It builds ClickHouse and
@ -75,6 +117,15 @@ some. If it fails, further checks are not started until it is fixed. Look at
the report to see which tests fail, then reproduce the failure locally as
described [here](tests.md#functional-test-locally).
#### Running Fast Test locally:
```sh
mkdir -p /tmp/test_output
mkdir -p /tmp/fasttest-workspace
cd ClickHouse
# this docker command performs a minimal ClickHouse build and runs Fast Tests against it
docker run --rm --cap-add=SYS_PTRACE -u $(id -u ${USER}):$(id -g ${USER}) --network=host -e FASTTEST_WORKSPACE=/fasttest-workspace -e FASTTEST_OUTPUT=/test_output -e FASTTEST_SOURCE=/ClickHouse --cap-add=SYS_PTRACE -e stage=clone_submodules --volume=/tmp/fasttest-workspace:/fasttest-workspace --volume=.:/ClickHouse --volume=/tmp/test_output:/test_output clickhouse/fasttest
```
#### Status Page Files
- `runlog.out.log` is the general log that includes all other logs.
@ -122,6 +173,13 @@ Builds ClickHouse in various configurations for use in further steps. You have t
## Special Build Check
Performs static analysis and code style checks using `clang-tidy`. The report is similar to the [build check](#build-check). Fix the errors found in the build log.
#### Running clang-tidy locally:
There is a convenience `packager` script that runs the clang-tidy build in Docker:
```sh
mkdir build_tidy
./docker/packager/packager --output-dir=./build_tidy --package-type=binary --compiler=clang-17 --debug-build --clang-tidy
```
## Functional Stateless Tests
Runs [stateless functional tests](tests.md#functional-tests) for ClickHouse

View File

@ -197,6 +197,11 @@ Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.htm
ALTER DATABASE postgres_database MODIFY SETTING materialized_postgresql_max_block_size = <new_size>;
```
### `materialized_postgresql_use_unique_replication_consumer_identifier` {#materialized_postgresql_use_unique_replication_consumer_identifier}
Use a unique replication consumer identifier for replication. Default: `0`.
If set to `1`, allows setting up several `MaterializedPostgreSQL` tables that point to the same `PostgreSQL` table.
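As a hedged sketch (the connection parameters, database names, and table list below are placeholders, not taken from this change), the setting can be passed together with the other `MaterializedPostgreSQL` settings so that two databases replicate the same table:

``` sql
-- Assumption for illustration: replicating the single PostgreSQL table 'orders'
-- into two ClickHouse databases on the same server.
SET allow_experimental_database_materialized_postgresql = 1;

CREATE DATABASE pg_replica_1
ENGINE = MaterializedPostgreSQL('postgres-host:5432', 'postgres_db', 'postgres_user', 'postgres_password')
SETTINGS materialized_postgresql_tables_list = 'orders',
         materialized_postgresql_use_unique_replication_consumer_identifier = 1;

CREATE DATABASE pg_replica_2
ENGINE = MaterializedPostgreSQL('postgres-host:5432', 'postgres_db', 'postgres_user', 'postgres_password')
SETTINGS materialized_postgresql_tables_list = 'orders',
         materialized_postgresql_use_unique_replication_consumer_identifier = 1;
```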
## Notes {#notes}
### Failover of the logical replication slot {#logical-replication-slot-failover}

View File

@ -24,12 +24,15 @@ CREATE TABLE s3_queue_engine_table (name String, value UInt32)
[after_processing = 'keep',]
[keeper_path = '',]
[s3queue_loading_retries = 0,]
[s3queue_processing_threads_num = 1,]
[s3queue_enable_logging_to_s3queue_log = 0,]
[s3queue_polling_min_timeout_ms = 1000,]
[s3queue_polling_max_timeout_ms = 10000,]
[s3queue_polling_backoff_ms = 0,]
[s3queue_tracked_files_limit = 1000,]
[s3queue_tracked_file_ttl_sec = 0,]
[s3queue_polling_size = 50,]
[s3queue_tracked_files_limit = 1000,]
[s3queue_cleanup_interval_min_ms = 10000,]
[s3queue_cleanup_interval_max_ms = 30000,]
```
**Engine parameters**
@ -46,7 +49,7 @@ CREATE TABLE s3_queue_engine_table (name String, value UInt32)
CREATE TABLE s3queue_engine_table (name String, value UInt32)
ENGINE=S3Queue('https://clickhouse-public-datasets.s3.amazonaws.com/my-test-bucket-768/*', 'CSV', 'gzip')
SETTINGS
mode = 'ordered';
mode = 'unordered';
```
Using named collections:
@ -109,6 +112,18 @@ Possible values:
Default value: `0`.
### s3queue_processing_threads_num {#processing_threads_num}
Number of threads used to perform processing. Applies only to `Unordered` mode.
Default value: `1`.
### s3queue_enable_logging_to_s3queue_log {#enable_logging_to_s3queue_log}
Enable logging to `system.s3queue_log`.
Default value: `0`.
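For illustration only (the bucket URL and table name are placeholders, not from this change), the two settings above can be combined in an `Unordered` mode table following the engine syntax shown earlier:

``` sql
CREATE TABLE s3queue_unordered (name String, value UInt32)
ENGINE = S3Queue('https://example-bucket.s3.amazonaws.com/data/*', 'CSV')
SETTINGS
    mode = 'unordered',
    s3queue_processing_threads_num = 4,
    s3queue_enable_logging_to_s3queue_log = 1;
```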
### s3queue_polling_min_timeout_ms {#polling_min_timeout_ms}
Minimal timeout before next polling (in milliseconds).
@ -161,18 +176,17 @@ Possible values:
Default value: `0`.
### s3queue_polling_size {#polling_size}
### s3queue_cleanup_interval_min_ms {#cleanup_interval_min_ms}
Maximum files to fetch from S3 with SELECT or in background task.
Engine takes files for processing from S3 in batches.
We limit the batch size to increase concurrency if multiple table engines with the same `keeper_path` consume files from the same path.
For 'Ordered' mode. Defines the minimum boundary of the reschedule interval for the background task that maintains the tracked file TTL and the maximum size of the tracked file set.
Possible values:
Default value: `10000`.
- Positive integer.
### s3queue_cleanup_interval_max_ms {#cleanup_interval_max_ms}
Default value: `50`.
For 'Ordered' mode. Defines the maximum boundary of the reschedule interval for the background task that maintains the tracked file TTL and the maximum size of the tracked file set.
Default value: `30000`.
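A hedged sketch of an `Ordered` mode table that uses the cleanup interval settings (the URL, table name, and values are illustrative):

``` sql
CREATE TABLE s3queue_ordered (name String, value UInt32)
ENGINE = S3Queue('https://example-bucket.s3.amazonaws.com/data/*', 'CSV')
SETTINGS
    mode = 'ordered',
    s3queue_tracked_files_limit = 1000,
    s3queue_tracked_file_ttl_sec = 3600,
    s3queue_cleanup_interval_min_ms = 10000,
    s3queue_cleanup_interval_max_ms = 30000;
```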
## S3-related Settings {#s3-settings}
@ -227,6 +241,118 @@ For more information about virtual columns see [here](../../../engines/table-eng
Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function.
:::note
If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`.
:::
## Limitations {#limitations}
1. Duplicated rows can appear as a result of:
- an exception occurs during parsing in the middle of file processing and retries are enabled via `s3queue_loading_retries`;
- `S3Queue` is configured on multiple servers pointing to the same path in ZooKeeper and the Keeper session expires before one server manages to commit a processed file, so another server may take over processing of a file that was already partially or fully processed by the first server;
- abnormal server termination.
2. If `S3Queue` is configured on multiple servers pointing to the same path in ZooKeeper and `Ordered` mode is used, `s3queue_loading_retries` will not work. This will be fixed soon.
## Introspection {#introspection}
For introspection, use the stateless `system.s3queue` table and the persistent `system.s3queue_log` table.
1. `system.s3queue`. This table is not persistent and shows the in-memory state of `S3Queue`: which files are currently being processed, and which files have been processed or have failed.
``` sql
┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ CREATE TABLE system.s3queue
(
`database` String,
`table` String,
`file_name` String,
`rows_processed` UInt64,
`status` String,
`processing_start_time` Nullable(DateTime),
`processing_end_time` Nullable(DateTime),
`ProfileEvents` Map(String, UInt64),
`exception` String
)
ENGINE = SystemS3Queue
COMMENT 'SYSTEM TABLE is built on the fly.' │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
Example:
``` sql
SELECT *
FROM system.s3queue
Row 1:
──────
zookeeper_path: /clickhouse/s3queue/25ea5621-ae8c-40c7-96d0-cec959c5ab88/3b3f66a1-9866-4c2e-ba78-b6bfa154207e
file_name: wikistat/original/pageviews-20150501-030000.gz
rows_processed: 5068534
status: Processed
processing_start_time: 2023-10-13 13:09:48
processing_end_time: 2023-10-13 13:10:31
ProfileEvents: {'ZooKeeperTransactions':3,'ZooKeeperGet':2,'ZooKeeperMulti':1,'SelectedRows':5068534,'SelectedBytes':198132283,'ContextLock':1,'S3QueueSetFileProcessingMicroseconds':2480,'S3QueueSetFileProcessedMicroseconds':9985,'S3QueuePullMicroseconds':273776,'LogTest':17}
exception:
```
2. `system.s3queue_log`. Persistent table. Has the same information as `system.s3queue`, but for `processed` and `failed` files.
The table has the following structure:
``` sql
SHOW CREATE TABLE system.s3queue_log
Query id: 0ad619c3-0f2a-4ee4-8b40-c73d86e04314
┌─statement──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ CREATE TABLE system.s3queue_log
(
`event_date` Date,
`event_time` DateTime,
`table_uuid` String,
`file_name` String,
`rows_processed` UInt64,
`status` Enum8('Processed' = 0, 'Failed' = 1),
`processing_start_time` Nullable(DateTime),
`processing_end_time` Nullable(DateTime),
`ProfileEvents` Map(String, UInt64),
`exception` String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(event_date)
ORDER BY (event_date, event_time)
SETTINGS index_granularity = 8192 │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
To use `system.s3queue_log`, define its configuration in the server config file:
``` xml
<s3queue_log>
<database>system</database>
<table>s3queue_log</table>
</s3queue_log>
```
Example:
``` sql
SELECT *
FROM system.s3queue_log
Row 1:
──────
event_date: 2023-10-13
event_time: 2023-10-13 13:10:12
table_uuid:
file_name: wikistat/original/pageviews-20150501-020000.gz
rows_processed: 5112621
status: Processed
processing_start_time: 2023-10-13 13:09:48
processing_end_time: 2023-10-13 13:10:12
ProfileEvents: {'ZooKeeperTransactions':3,'ZooKeeperGet':2,'ZooKeeperMulti':1,'SelectedRows':5112621,'SelectedBytes':198577687,'ContextLock':1,'S3QueueSetFileProcessingMicroseconds':1934,'S3QueueSetFileProcessedMicroseconds':17063,'S3QueuePullMicroseconds':5841972,'LogTest':17}
exception:
```

View File

@ -58,6 +58,12 @@ where `N` specifies the tokenizer:
- `inverted(0)` (or shorter: `inverted()`) set the tokenizer to "tokens", i.e. split strings along spaces,
- `inverted(N)` with `N` between 2 and 8 sets the tokenizer to "ngrams(N)"
The maximum number of rows per postings list can be specified as the second parameter. This parameter can be used to control postings list sizes and avoid generating huge postings list files. The following variants exist (see the sketch after this list):
- `inverted(ngrams, max_rows_per_postings_list)`: Use the given max_rows_per_postings_list (assuming it is not 0)
- `inverted(ngrams, 0)`: No limitation of the maximum rows per postings list
- `inverted(ngrams)`: Use the default maximum of 64K rows per postings list
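A minimal sketch, assuming a hypothetical table `tab` with a `String` column `str`, of passing the postings list limit as the second index parameter:

``` sql
SET allow_experimental_inverted_index = true;

CREATE TABLE tab
(
    key UInt64,
    str String,
    -- ngrams(2) tokenizer, at most 100000 rows per postings list
    INDEX inv_idx(str) TYPE inverted(2, 100000) GRANULARITY 1
)
ENGINE = MergeTree
ORDER BY key;
```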
Being a type of skipping index, inverted indexes can be dropped or added to a column after table creation:
``` sql

View File

@ -3310,6 +3310,28 @@ Possible values:
Default value: `0`.
## mysql_map_string_to_text_in_show_columns {#mysql_map_string_to_text_in_show_columns}
When enabled, the ClickHouse [String](../../sql-reference/data-types/string.md) data type is displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns).
Only takes effect when [use_mysql_types_in_show_columns](#use_mysql_types_in_show_columns) is enabled.
- 0 - Use `BLOB`.
- 1 - Use `TEXT`.
Default value: `0`.
## mysql_map_fixed_string_to_text_in_show_columns {#mysql_map_fixed_string_to_text_in_show_columns}
When enabled, the ClickHouse [FixedString](../../sql-reference/data-types/fixedstring.md) data type is displayed as `TEXT` in [SHOW COLUMNS](../../sql-reference/statements/show.md#show_columns).
Only takes effect when [use_mysql_types_in_show_columns](#use_mysql_types_in_show_columns) is enabled.
- 0 - Use `BLOB`.
- 1 - Use `TEXT`.
Default value: `0`.
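A hedged usage sketch (the table name is a placeholder); both settings only take effect together with `use_mysql_types_in_show_columns`:

``` sql
SET use_mysql_types_in_show_columns = 1;
SET mysql_map_string_to_text_in_show_columns = 1;
SET mysql_map_fixed_string_to_text_in_show_columns = 1;

-- String and FixedString columns are now reported as TEXT instead of BLOB.
SHOW COLUMNS FROM my_table;
```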
## execute_merges_on_single_replica_time_threshold {#execute-merges-on-single-replica-time-threshold}
Enables special logic to perform merges on replicas.

View File

@ -103,4 +103,5 @@ ClickHouse-specific aggregate functions:
- [quantileInterpolatedWeighted](./quantileinterpolatedweighted.md)
- [sparkBar](./sparkbar.md)
- [sumCount](./sumcount.md)
- [largestTriangleThreeBuckets](./largestTriangleThreeBuckets.md)

View File

@ -0,0 +1,67 @@
---
slug: /en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets
sidebar_position: 312
sidebar_label: largestTriangleThreeBuckets
---
# largestTriangleThreeBuckets
Applies the [Largest-Triangle-Three-Buckets](https://skemman.is/bitstream/1946/15343/3/SS_MSthesis.pdf) algorithm to the input data.
The algorithm is used for downsampling time series data for visualization. It is designed to operate on series sorted by x coordinate.
It works by dividing the sorted series into buckets and then finding the largest triangle in each bucket. The number of buckets is equal to the number of points in the resulting series.
The function sorts the data by `x` and then applies the downsampling algorithm to the sorted data.
**Syntax**
``` sql
largestTriangleThreeBuckets(n)(x, y)
```
Alias: `lttb`.
**Arguments**
- `x` — x coordinate. [Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md).
- `y` — y coordinate. [Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md).
**Parameters**
- `n` — number of points in the resulting series. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
[Array](../../../sql-reference/data-types/array.md) of [Tuple](../../../sql-reference/data-types/tuple.md) with two elements:
**Example**
Input table:
``` text
┌─────x───────┬───────y──────┐
│ 1.000000000 │ 10.000000000 │
│ 2.000000000 │ 20.000000000 │
│ 3.000000000 │ 15.000000000 │
│ 8.000000000 │ 60.000000000 │
│ 9.000000000 │ 55.000000000 │
│ 10.00000000 │ 70.000000000 │
│ 4.000000000 │ 30.000000000 │
│ 5.000000000 │ 40.000000000 │
│ 6.000000000 │ 35.000000000 │
│ 7.000000000 │ 50.000000000 │
└─────────────┴──────────────┘
```
Query:
``` sql
SELECT largestTriangleThreeBuckets(4)(x, y) FROM largestTriangleThreeBuckets_test;
```
Result:
``` text
┌────────largestTriangleThreeBuckets(4)(x, y)───────────┐
│ [(1,10),(3,15),(5,40),(10,70)] │
└───────────────────────────────────────────────────────┘
```

View File

@ -112,7 +112,8 @@ EOF
tar -czf "$TARBALL" -C "$OUTPUT_DIR" "$PKG_DIR"
fi
sha512sum "$TARBALL" > "$TARBALL".sha512
# Cut the $OUTPUT_DIR/ from the sha512sum output to make it universal
sha512sum "$TARBALL" | sed "s|$OUTPUT_DIR/||" > "$TARBALL".sha512
rm -r "$PKG_PATH"
}

View File

@ -391,7 +391,7 @@ zkutil::EphemeralNodeHolder::Ptr ClusterCopier::createTaskWorkerNodeAndWaitIfNee
auto code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS)
return std::make_shared<zkutil::EphemeralNodeHolder>(current_worker_path, *zookeeper, false, false, description);
return zkutil::EphemeralNodeHolder::existing(current_worker_path, *zookeeper);
if (code == Coordination::Error::ZBADVERSION)
{

View File

@ -145,6 +145,10 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ
if (tables.next())
{
catalog_name = tables.table_catalog();
/// `tables.next()` call is mandatory to drain the iterator before next operation and avoid "Invalid cursor state"
if (tables.next())
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Driver returned more than one table for '{}': '{}' and '{}'",
table_name, catalog_name, tables.table_schema());
LOG_TRACE(log, "Will fetch info for table '{}.{}'", catalog_name, table_name);
return catalog.find_columns(/* column = */ "", table_name, /* schema = */ "", catalog_name);
}
@ -153,6 +157,10 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ
if (tables.next())
{
catalog_name = tables.table_catalog();
/// `tables.next()` call is mandatory to drain the iterator before next operation and avoid "Invalid cursor state"
if (tables.next())
throw Exception(ErrorCodes::UNKNOWN_TABLE, "Driver returned more than one table for '{}': '{}' and '{}'",
table_name, catalog_name, tables.table_schema());
LOG_TRACE(log, "Will fetch info for table '{}.{}.{}'", catalog_name, schema_name, table_name);
return catalog.find_columns(/* column = */ "", table_name, schema_name, catalog_name);
}

View File

@ -91,16 +91,17 @@ T execute(nanodbc::ConnectionHolderPtr connection_holder, std::function<T(nanodb
}
catch (const nanodbc::database_error & e)
{
LOG_ERROR(
&Poco::Logger::get("ODBCConnection"),
"ODBC query failed with error: {}, state: {}, native code: {}",
e.what(), e.state(), e.native());
/// SQLState, connection related errors start with 08 (main: 08S01), cursor invalid state is 24000.
/// Invalid cursor state is a retriable error.
/// Invalid transaction state 25000. Truncate to 2 letters on purpose.
/// https://docs.microsoft.com/ru-ru/sql/odbc/reference/appendixes/appendix-a-odbc-error-codes?view=sql-server-ver15
if (e.state().starts_with("08") || e.state().starts_with("24") || e.state().starts_with("25"))
bool is_retriable = e.state().starts_with("08") || e.state().starts_with("24") || e.state().starts_with("25");
LOG_ERROR(
&Poco::Logger::get("ODBCConnection"),
"ODBC query failed with error: {}, state: {}, native code: {}{}",
e.what(), e.state(), e.native(), is_retriable ? ", will retry" : "");
if (is_retriable)
{
connection_holder->updateConnection();
return query_func(connection_holder->get());

View File

@ -821,77 +821,85 @@ function insertChart(i) {
let move_text = document.createTextNode('✥');
move.appendChild(move_text);
let is_dragging = false;
move.addEventListener('mousedown', e => {
const idx = getCurrentIndex();
is_dragging = true;
let drag_state = {
is_dragging: false,
idx: null,
offset_x: null,
offset_y: null,
displace_idx: null,
displace_chart: null
};
function dragStop(e) {
drag_state.is_dragging = false;
chart.className = 'chart';
chart.style.left = null;
chart.style.top = null;
if (drag_state.displace_idx !== null) {
const elem = queries[drag_state.idx];
queries.splice(drag_state.idx, 1);
queries.splice(drag_state.displace_idx, 0, elem);
drag_state.displace_chart.className = 'chart';
drawAll();
}
}
function dragMove(e) {
if (!drag_state.is_dragging) return;
let x = e.clientX - drag_state.offset_x;
let y = e.clientY - drag_state.offset_y;
chart.style.left = `${x}px`;
chart.style.top = `${y}px`;
drag_state.displace_idx = null;
drag_state.displace_chart = null;
let current_idx = -1;
for (const elem of charts.querySelectorAll('.chart')) {
++current_idx;
if (current_idx == drag_state.idx) {
continue;
}
const this_rect = chart.getBoundingClientRect();
const this_center_x = this_rect.left + this_rect.width / 2;
const this_center_y = this_rect.top + this_rect.height / 2;
const elem_rect = elem.getBoundingClientRect();
if (this_center_x >= elem_rect.left && this_center_x <= elem_rect.right
&& this_center_y >= elem_rect.top && this_center_y <= elem_rect.bottom) {
elem.className = 'chart chart-displaced';
drag_state.displace_idx = current_idx;
drag_state.displace_chart = elem;
} else {
elem.className = 'chart';
}
}
}
function dragStart(e) {
if (e.button !== 0) return; /// left button only
move.setPointerCapture(e.pointerId);
drag_state.is_dragging = true;
drag_state.idx = getCurrentIndex();
chart.className = 'chart chart-moving';
let offset_x = e.clientX;
let offset_y = e.clientY;
drag_state.offset_x = e.clientX;
drag_state.offset_y = e.clientY;
}
let displace_idx = null;
let displace_chart = null;
function mouseup(e) {
is_dragging = false;
chart.className = 'chart';
chart.style.left = null;
chart.style.top = null;
if (displace_idx !== null) {
const elem = queries[idx];
queries.splice(idx, 1);
queries.splice(displace_idx, 0, elem);
displace_chart.className = 'chart';
drawAll();
}
}
function mousemove(e) {
if (!is_dragging) {
document.body.removeEventListener('mousemove', mousemove);
document.body.removeEventListener('mouseup', mouseup);
return;
}
let x = e.clientX - offset_x;
let y = e.clientY - offset_y;
chart.style.left = `${x}px`;
chart.style.top = `${y}px`;
displace_idx = null;
displace_chart = null;
let current_idx = -1;
for (const elem of charts.querySelectorAll('.chart')) {
++current_idx;
if (current_idx == idx) {
continue;
}
const this_rect = chart.getBoundingClientRect();
const this_center_x = this_rect.left + this_rect.width / 2;
const this_center_y = this_rect.top + this_rect.height / 2;
const elem_rect = elem.getBoundingClientRect();
if (this_center_x >= elem_rect.left && this_center_x <= elem_rect.right
&& this_center_y >= elem_rect.top && this_center_y <= elem_rect.bottom) {
elem.className = 'chart chart-displaced';
displace_idx = current_idx;
displace_chart = elem;
} else {
elem.className = 'chart';
}
}
}
document.body.addEventListener('mouseup', mouseup);
document.body.addEventListener('mousemove', mousemove);
});
/// Read https://www.redblobgames.com/making-of/draggable/
move.addEventListener('pointerdown', dragStart);
move.addEventListener('pointermove', dragMove);
move.addEventListener('pointerup', dragStop);
move.addEventListener('pointercancel', dragStop);
move.addEventListener('touchstart', (e) => e.preventDefault());
let maximize = document.createElement('a');
let maximize_text = document.createTextNode('🗖');

View File

@ -0,0 +1,52 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionLargestTriangleThreeBuckets.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
namespace DB
{
struct Settings;
namespace
{
AggregateFunctionPtr
createAggregateFunctionLargestTriangleThreeBuckets(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertBinary(name, argument_types);
if (!(isNumber(argument_types[0]) || isDateOrDate32(argument_types[0]) || isDateTime(argument_types[0])
|| isDateTime64(argument_types[0])))
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Aggregate function {} only supports Date, Date32, DateTime, DateTime64 and Number as the first argument",
name);
if (!(isNumber(argument_types[1]) || isDateOrDate32(argument_types[1]) || isDateTime(argument_types[1])
|| isDateTime64(argument_types[1])))
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Aggregate function {} only supports Date, Date32, DateTime, DateTime64 and Number as the second argument",
name);
return std::make_shared<AggregateFunctionLargestTriangleThreeBuckets>(argument_types, parameters);
}
}
void registerAggregateFunctionLargestTriangleThreeBuckets(AggregateFunctionFactory & factory)
{
factory.registerFunction(AggregateFunctionLargestTriangleThreeBuckets::name, createAggregateFunctionLargestTriangleThreeBuckets);
factory.registerAlias("lttb", AggregateFunctionLargestTriangleThreeBuckets::name);
}
}

View File

@ -0,0 +1,327 @@
#pragma once
#include <iostream>
#include <limits>
#include <numeric>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsDateTime.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <base/types.h>
#include <Common/PODArray_fwd.h>
#include <Common/assert_cast.h>
#include <boost/math/distributions/normal.hpp>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
struct LargestTriangleThreeBucketsData : public StatisticalSample<Float64, Float64>
{
void add(const Float64 xval, const Float64 yval, Arena * arena)
{
this->addX(xval, arena);
this->addY(yval, arena);
}
void sort(Arena * arena)
{
// sort the this->x and this->y in ascending order of this->x using index
std::vector<size_t> index(this->x.size());
std::iota(index.begin(), index.end(), 0);
::sort(index.begin(), index.end(), [&](size_t i1, size_t i2) { return this->x[i1] < this->x[i2]; });
SampleX temp_x{};
SampleY temp_y{};
for (size_t i = 0; i < this->x.size(); ++i)
{
temp_x.push_back(this->x[index[i]], arena);
temp_y.push_back(this->y[index[i]], arena);
}
for (size_t i = 0; i < this->x.size(); ++i)
{
this->x[i] = temp_x[i];
this->y[i] = temp_y[i];
}
}
PODArray<std::pair<Float64, Float64>> getResult(size_t total_buckets, Arena * arena)
{
// Sort the data
this->sort(arena);
PODArray<std::pair<Float64, Float64>> result;
// Handle special cases for small data list
if (this->x.size() <= total_buckets)
{
for (size_t i = 0; i < this->x.size(); ++i)
{
result.emplace_back(std::make_pair(this->x[i], this->y[i]));
}
return result;
}
// Handle special cases for 0 or 1 or 2 buckets
if (total_buckets == 0)
return result;
if (total_buckets == 1)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
return result;
}
if (total_buckets == 2)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
// Find the size of each bucket
size_t single_bucket_size = this->x.size() / total_buckets;
// Include the first data point
result.emplace_back(std::make_pair(this->x[0], this->y[0]));
for (size_t i = 1; i < total_buckets - 1; ++i) // Skip the first and last bucket
{
size_t start_index = i * single_bucket_size;
size_t end_index = (i + 1) * single_bucket_size;
// Compute the average point in the next bucket
Float64 avg_x = 0;
Float64 avg_y = 0;
for (size_t j = end_index; j < (i + 2) * single_bucket_size; ++j)
{
avg_x += this->x[j];
avg_y += this->y[j];
}
avg_x /= single_bucket_size;
avg_y /= single_bucket_size;
// Find the point in the current bucket that forms the largest triangle
size_t max_index = start_index;
Float64 max_area = 0.0;
for (size_t j = start_index; j < end_index; ++j)
{
Float64 area = std::abs(
0.5
* (result.back().first * this->y[j] + this->x[j] * avg_y + avg_x * result.back().second - result.back().first * avg_y
- this->x[j] * result.back().second - avg_x * this->y[j]));
if (area > max_area)
{
max_area = area;
max_index = j;
}
}
// Include the selected point
result.emplace_back(std::make_pair(this->x[max_index], this->y[max_index]));
}
// Include the last data point
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
};
class AggregateFunctionLargestTriangleThreeBuckets final : public IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>
{
private:
UInt64 total_buckets{0};
TypeIndex x_type;
TypeIndex y_type;
public:
explicit AggregateFunctionLargestTriangleThreeBuckets(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>({arguments}, {}, createResultType(arguments))
{
if (params.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires one parameter", getName());
if (params[0].getType() != Field::Types::UInt64)
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires the first parameter to be a UInt64", getName());
total_buckets = params[0].get<UInt64>();
this->x_type = WhichDataType(arguments[0]).idx;
this->y_type = WhichDataType(arguments[1]).idx;
}
static constexpr auto name = "largestTriangleThreeBuckets";
String getName() const override { return name; }
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType(const DataTypes & arguments)
{
TypeIndex x_type = arguments[0]->getTypeId();
TypeIndex y_type = arguments[1]->getTypeId();
UInt32 x_scale = 0;
UInt32 y_scale = 0;
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[0].get()))
{
x_scale = datetime64_type->getScale();
}
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[1].get()))
{
y_scale = datetime64_type->getScale();
}
DataTypes types = {getDataTypeFromTypeIndex(x_type, x_scale), getDataTypeFromTypeIndex(y_type, y_scale)};
auto tuple = std::make_shared<DataTypeTuple>(std::move(types));
return std::make_shared<DataTypeArray>(tuple);
}
static DataTypePtr getDataTypeFromTypeIndex(TypeIndex type_index, UInt32 scale)
{
DataTypePtr data_type;
switch (type_index)
{
case TypeIndex::Date:
data_type = std::make_shared<DataTypeDate>();
break;
case TypeIndex::Date32:
data_type = std::make_shared<DataTypeDate32>();
break;
case TypeIndex::DateTime:
data_type = std::make_shared<DataTypeDateTime>();
break;
case TypeIndex::DateTime64:
data_type = std::make_shared<DataTypeDateTime64>(scale);
break;
default:
data_type = std::make_shared<DataTypeNumber<Float64>>();
}
return data_type;
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 x = getFloat64DataFromColumn(columns[0], row_num, this->x_type);
Float64 y = getFloat64DataFromColumn(columns[1], row_num, this->y_type);
this->data(place).add(x, y, arena);
}
Float64 getFloat64DataFromColumn(const IColumn * column, size_t row_num, TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return static_cast<const ColumnDate &>(*column).getData()[row_num];
case TypeIndex::Date32:
return static_cast<const ColumnDate32 &>(*column).getData()[row_num];
case TypeIndex::DateTime:
return static_cast<const ColumnDateTime &>(*column).getData()[row_num];
case TypeIndex::DateTime64:
return static_cast<const ColumnDateTime64 &>(*column).getData()[row_num];
default:
return column->getFloat64(row_num);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * arena) const override
{
auto res = this->data(place).getResult(total_buckets, arena);
auto & col = assert_cast<ColumnArray &>(to);
auto & col_offsets = assert_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());
auto column_x_adder_func = getColumnAdderFunc(x_type);
auto column_y_adder_func = getColumnAdderFunc(y_type);
for (size_t i = 0; i < res.size(); ++i)
{
auto & column_tuple = assert_cast<ColumnTuple &>(col.getData());
column_x_adder_func(column_tuple.getColumn(0), res[i].first);
column_y_adder_func(column_tuple.getColumn(1), res[i].second);
}
col_offsets.getData().push_back(col.getData().size());
}
std::function<void(IColumn &, Float64)> getColumnAdderFunc(TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate &>(column);
col.getData().push_back(static_cast<UInt16>(value));
};
case TypeIndex::Date32:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate32 &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime64:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime64 &>(column);
col.getData().push_back(static_cast<UInt64>(value));
};
default:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnFloat64 &>(column);
col.getData().push_back(value);
};
}
}
};
}

View File

@ -129,7 +129,10 @@ public:
{
writePODBinary(value[i].first, buf);
writePODBinary(zero_padding, buf);
writePODBinary(value[i].second, buf);
if constexpr (std::endian::native == std::endian::little)
writePODBinary(value[i].second, buf);
else
writePODBinary(std::byteswap(value[i].second), buf);
}
}

View File

@ -82,6 +82,7 @@ void registerAggregateFunctionIntervalLengthSum(AggregateFunctionFactory &);
void registerAggregateFunctionAnalysisOfVariance(AggregateFunctionFactory &);
void registerAggregateFunctionFlameGraph(AggregateFunctionFactory &);
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory);
void registerAggregateFunctionLargestTriangleThreeBuckets(AggregateFunctionFactory & factory);
class AggregateFunctionCombinatorFactory;
void registerAggregateFunctionCombinatorIf(AggregateFunctionCombinatorFactory &);
@ -176,6 +177,7 @@ void registerAggregateFunctions()
registerAggregateFunctionAnalysisOfVariance(factory);
registerAggregateFunctionFlameGraph(factory);
registerAggregateFunctionKolmogorovSmirnovTest(factory);
registerAggregateFunctionLargestTriangleThreeBuckets(factory);
registerWindowFunctions(factory);
}

View File

@ -1,7 +1,6 @@
#include <Client/MultiplexedConnections.h>
#include <Common/thread_local_rng.h>
#include <Common/logger_useful.h>
#include <Core/Protocol.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/Operators.h>
@ -24,14 +23,6 @@ namespace ErrorCodes
}
#define MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION \
mutex_last_locked_by.store((getThreadId() << 32) | __LINE__); \
memcpy(mutex_memory_dump.data(), &cancel_mutex, mutex_memory_dump.size()); \
mutex_locked += 1; \
SCOPE_EXIT({ mutex_locked -= 1; });
/// When you remove this macro, please also remove the clang-tidy suppressions at the beginning + end of this file.
MultiplexedConnections::MultiplexedConnections(Connection & connection, const Settings & settings_, const ThrottlerPtr & throttler)
: settings(settings_)
{
@ -86,7 +77,6 @@ MultiplexedConnections::MultiplexedConnections(
void MultiplexedConnections::sendScalarsData(Scalars & data)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (!sent_query)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send scalars data: query not yet sent.");
@ -102,7 +92,6 @@ void MultiplexedConnections::sendScalarsData(Scalars & data)
void MultiplexedConnections::sendExternalTablesData(std::vector<ExternalTablesData> & data)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (!sent_query)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send external tables data: query not yet sent.");
@ -131,7 +120,6 @@ void MultiplexedConnections::sendQuery(
bool with_pending_data)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (sent_query)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Query already sent.");
@ -189,7 +177,6 @@ void MultiplexedConnections::sendQuery(
void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuids)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (sent_query)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send uuids after query is sent.");
@ -206,7 +193,6 @@ void MultiplexedConnections::sendIgnoredPartUUIDs(const std::vector<UUID> & uuid
void MultiplexedConnections::sendReadTaskResponse(const String & response)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (cancelled)
return;
current_connection->sendReadTaskResponse(response);
@ -216,7 +202,6 @@ void MultiplexedConnections::sendReadTaskResponse(const String & response)
void MultiplexedConnections::sendMergeTreeReadTaskResponse(const ParallelReadResponse & response)
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (cancelled)
return;
current_connection->sendMergeTreeReadTaskResponse(response);
@ -226,29 +211,13 @@ void MultiplexedConnections::sendMergeTreeReadTaskResponse(const ParallelReadRes
Packet MultiplexedConnections::receivePacket()
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
Packet packet = receivePacketUnlocked({});
return packet;
}
void MultiplexedConnections::disconnect()
{
/// We've seen this lock mysteriously get stuck forever, without any other thread seeming to
/// hold the mutex. This is temporary code to print some extra information next time it happens.
/// std::lock_guard lock(cancel_mutex);
if (!cancel_mutex.try_lock_for(std::chrono::hours(1)))
{
UInt64 last_locked = mutex_last_locked_by.load();
std::array<UInt8, sizeof(std::timed_mutex)> new_memory_dump;
memcpy(new_memory_dump.data(), &cancel_mutex, new_memory_dump.size());
LOG_ERROR(&Poco::Logger::get("MultiplexedConnections"), "Deadlock in MultiplexedConnections::disconnect()! Mutex was last (instrumentedly) locked by thread {} on line {}, lock balance: {}, mutex memory when last locked: {}, mutex memory now: {}", last_locked >> 32, last_locked & 0xffffffff, mutex_locked.load(), hexString(mutex_memory_dump.data(), mutex_memory_dump.size()), hexString(new_memory_dump.data(), new_memory_dump.size()));
throw Exception(ErrorCodes::LOGICAL_ERROR, "Deadlock in MultiplexedConnections::disconnect()");
}
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wthread-safety-analysis"
std::lock_guard lock(cancel_mutex, std::adopt_lock);
#pragma clang diagnostic pop
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
std::lock_guard lock(cancel_mutex);
for (ReplicaState & state : replica_states)
{
@ -264,7 +233,6 @@ void MultiplexedConnections::disconnect()
void MultiplexedConnections::sendCancel()
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (!sent_query || cancelled)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot cancel. Either no query sent or already cancelled.");
@ -282,7 +250,6 @@ void MultiplexedConnections::sendCancel()
Packet MultiplexedConnections::drain()
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
if (!cancelled)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot drain connections: cancel first.");
@ -323,7 +290,6 @@ Packet MultiplexedConnections::drain()
std::string MultiplexedConnections::dumpAddresses() const
{
std::lock_guard lock(cancel_mutex);
MUTEX_LOCK_TEMPORARY_DEBUG_INSTRUMENTATION
return dumpAddressesUnlocked();
}

View File

@ -106,14 +106,7 @@ private:
std::optional<ReplicaInfo> replica_info;
/// A mutex for the sendCancel function to execute safely in separate thread.
mutable std::timed_mutex cancel_mutex;
/// Temporary instrumentation to debug a weird deadlock on cancel_mutex.
/// TODO: Once the investigation is done, get rid of these, and of INSTRUMENTED_LOCK_MUTEX, and
/// change cancel_mutex to std::mutex.
mutable std::atomic<UInt64> mutex_last_locked_by{0};
mutable std::atomic<Int64> mutex_locked{0};
mutable std::array<UInt8, sizeof(std::timed_mutex)> mutex_memory_dump;
mutable std::mutex cancel_mutex;
friend struct RemoteQueryExecutorRoutine;
};

View File

@ -530,6 +530,13 @@ The server successfully detected this situation and will download merged part fr
M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.") \
M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.") \
\
M(S3QueueSetFileProcessingMicroseconds, "Time spent to set file as processing")\
M(S3QueueSetFileProcessedMicroseconds, "Time spent to set file as processed")\
M(S3QueueSetFileFailedMicroseconds, "Time spent to set file as failed")\
M(S3QueueCleanupMaxSetSizeOrTTLMicroseconds, "Time spent to clean up tracked files by max set size or TTL")\
M(S3QueuePullMicroseconds, "Time spent to read file data")\
M(S3QueueLockLocalFileStatusesMicroseconds, "Time spent to lock local file statuses")\
\
M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\
M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \
M(IOUringSQEsResubmits, "Total number of io_uring SQE resubmits performed") \
@ -589,9 +596,14 @@ Timer::Timer(Counters & counters_, Event timer_event_, Event counter_event, Reso
counters.increment(counter_event);
}
UInt64 Timer::get()
{
return watch.elapsedNanoseconds() / static_cast<UInt64>(resolution);
}
void Timer::end()
{
counters.increment(timer_event, watch.elapsedNanoseconds() / static_cast<UInt64>(resolution));
counters.increment(timer_event, get());
watch.reset();
}
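/// A minimal usage sketch of the new get() accessor (not part of this diff; the function and
/// `budget` are hypothetical). It relies only on the interface visible above: get() peeks at the
/// elapsed time, cancel() resets the watch, and the Timer destructor calls end() in the caller's scope.
static void discardMeasurementIfOverBudget(ProfileEvents::Timer & timer, UInt64 budget)
{
    if (timer.get() > budget)   /// peek at the elapsed time without incrementing any counter
        timer.cancel();         /// reset the watch so the eventual end() records (almost) nothing
}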

View File

@ -41,6 +41,7 @@ namespace ProfileEvents
~Timer() { end(); }
void cancel() { watch.reset(); }
void end();
UInt64 get();
private:
Counters & counters;

View File

@ -10,6 +10,7 @@
#include <Interpreters/TextLog.h>
#include <Interpreters/TraceLog.h>
#include <Interpreters/FilesystemCacheLog.h>
#include <Interpreters/S3QueueLog.h>
#include <Interpreters/FilesystemReadPrefetchesLog.h>
#include <Interpreters/ProcessorsProfileLog.h>
#include <Interpreters/ZooKeeperLog.h>

View File

@ -27,6 +27,7 @@
M(ZooKeeperLogElement) \
M(ProcessorProfileLogElement) \
M(TextLogElement) \
M(S3QueueLogElement) \
M(FilesystemCacheLogElement) \
M(FilesystemReadPrefetchesLogElement) \
M(AsynchronousInsertLogElement) \

View File

@ -644,11 +644,18 @@ class EphemeralNodeHolder
public:
using Ptr = std::shared_ptr<EphemeralNodeHolder>;
EphemeralNodeHolder(const std::string & path_, ZooKeeper & zookeeper_, bool create, bool sequential, const std::string & data)
EphemeralNodeHolder(const std::string & path_, ZooKeeper & zookeeper_, bool create, bool try_create, bool sequential, const std::string & data)
: path(path_), zookeeper(zookeeper_)
{
if (create)
{
path = zookeeper.create(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
need_remove = created = true;
}
else if (try_create)
{
need_remove = created = Coordination::Error::ZOK == zookeeper.tryCreate(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
}
}
std::string getPath() const
@ -656,19 +663,32 @@ public:
return path;
}
bool isCreated() const
{
return created;
}
static Ptr create(const std::string & path, ZooKeeper & zookeeper, const std::string & data = "")
{
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, true, false, data);
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, true, false, false, data);
}
static Ptr tryCreate(const std::string & path, ZooKeeper & zookeeper, const std::string & data = "")
{
auto node = std::make_shared<EphemeralNodeHolder>(path, zookeeper, false, true, false, data);
if (node->isCreated())
return node;
return nullptr;
}
static Ptr createSequential(const std::string & path, ZooKeeper & zookeeper, const std::string & data = "")
{
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, true, true, data);
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, true, false, true, data);
}
static Ptr existing(const std::string & path, ZooKeeper & zookeeper)
{
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, false, false, "");
return std::make_shared<EphemeralNodeHolder>(path, zookeeper, false, false, false, "");
}
void setAlreadyRemoved()
@ -702,6 +722,7 @@ private:
ZooKeeper & zookeeper;
CurrentMetrics::Increment metric_increment{CurrentMetrics::EphemeralNode};
bool need_remove = true;
bool created = false;
};
using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;
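/// A minimal usage sketch of the new tryCreate() factory (not part of this diff; the function,
/// `zk`, `path` and `payload` are hypothetical). Unlike create(), tryCreate() returns nullptr
/// instead of throwing when the ephemeral node cannot be created (e.g. it already exists).
inline void tryBecomeOwner(zkutil::ZooKeeper & zk, const std::string & path, const std::string & payload)
{
    if (auto holder = zkutil::EphemeralNodeHolder::tryCreate(path, zk, payload))
    {
        /// We own the ephemeral node; it is removed automatically when `holder` goes out of scope.
    }
    else
    {
        /// Another owner already created the node; a caller would typically retry later.
    }
}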

View File

@ -103,11 +103,12 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(UInt64, s3_retry_attempts, 10, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(UInt64, s3_request_timeout_ms, 3000, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \
M(UInt64, s3_http_connection_pool_size, 1000, "How many reusable open connections to keep per S3 endpoint. Only applies to the S3 table engine and table function, not to S3 disks (for disks, use disk config instead). Global setting, can only be set in config, overriding it per session or per query has no effect.", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \
M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \
M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \
@ -205,7 +206,9 @@ class IColumn;
M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \
\
M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \
M(Bool, use_mysql_types_in_show_columns, false, "Show MySQL types in SHOW COLUMNS and system.columns", 0) \
M(Bool, use_mysql_types_in_show_columns, false, "Show native MySQL types in SHOW [FULL] COLUMNS", 0) \
M(Bool, mysql_map_string_to_text_in_show_columns, false, "If enabled, String type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Will only take effect if use_mysql_types_in_show_columns is enabled too", 0) \
M(Bool, mysql_map_fixed_string_to_text_in_show_columns, false, "If enabled, FixedString type will be mapped to TEXT in SHOW [FULL] COLUMNS, BLOB otherwise. Will only take effect if use_mysql_types_in_show_columns is enabled too", 0) \
\
M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \
\

View File

@ -52,10 +52,29 @@ DatabaseMaterializedPostgreSQL::DatabaseMaterializedPostgreSQL(
, remote_database_name(postgres_database_name)
, connection_info(connection_info_)
, settings(std::move(settings_))
, startup_task(getContext()->getSchedulePool().createTask("MaterializedPostgreSQLDatabaseStartup", [this]{ startSynchronization(); }))
, startup_task(getContext()->getSchedulePool().createTask("MaterializedPostgreSQLDatabaseStartup", [this]{ tryStartSynchronization(); }))
{
}
void DatabaseMaterializedPostgreSQL::tryStartSynchronization()
{
if (shutdown_called)
return;
try
{
startSynchronization();
LOG_INFO(log, "Successfully loaded tables from PostgreSQL and started replication");
}
catch (...)
{
LOG_ERROR(log, "Failed to start replication from PostgreSQL, "
"will retry. Error: {}", getCurrentExceptionMessage(true));
if (!shutdown_called)
startup_task->scheduleAfter(5000);
}
}
void DatabaseMaterializedPostgreSQL::startSynchronization()
{
@ -64,9 +83,10 @@ void DatabaseMaterializedPostgreSQL::startSynchronization()
return;
replication_handler = std::make_unique<PostgreSQLReplicationHandler>(
/* replication_identifier */ TSA_SUPPRESS_WARNING_FOR_READ(database_name), /// FIXME
remote_database_name,
/* table_name */"",
TSA_SUPPRESS_WARNING_FOR_READ(database_name), /// FIXME
toString(getUUID()),
connection_info,
getContext(),
is_attach,
@ -114,15 +134,7 @@ void DatabaseMaterializedPostgreSQL::startSynchronization()
LOG_TRACE(log, "Loaded {} tables. Starting synchronization", materialized_tables.size());
try
{
replication_handler->startup(/* delayed */false);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
replication_handler->startup(/* delayed */false);
}
@ -401,6 +413,7 @@ void DatabaseMaterializedPostgreSQL::detachTablePermanently(ContextPtr, const St
void DatabaseMaterializedPostgreSQL::shutdown()
{
shutdown_called = true;
startup_task->deactivate();
stopReplication();
DatabaseAtomic::shutdown();
@ -413,7 +426,6 @@ void DatabaseMaterializedPostgreSQL::stopReplication()
if (replication_handler)
replication_handler->shutdown();
shutdown_called = true;
/// Clear wrappers over nested, all access is not done to nested tables directly.
materialized_tables.clear();
}

View File

@ -73,6 +73,7 @@ protected:
ASTPtr getCreateTableQueryImpl(const String & table_name, ContextPtr local_context, bool throw_on_error) const override;
private:
void tryStartSynchronization();
void startSynchronization();
ASTPtr createAlterSettingsQuery(const SettingChange & new_setting);

View File

@ -322,8 +322,19 @@ void DatabasePostgreSQL::loadStoredObjects(ContextMutablePtr /* context */, Load
void DatabasePostgreSQL::removeOutdatedTables()
{
std::lock_guard lock{mutex};
auto connection_holder = pool->get();
auto actual_tables = fetchPostgreSQLTablesList(connection_holder->get(), configuration.schema);
std::set<std::string> actual_tables;
try
{
auto connection_holder = pool->get();
actual_tables = fetchPostgreSQLTablesList(connection_holder->get(), configuration.schema);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
cleaner_task->scheduleAfter(cleaner_reschedule_ms);
return;
}
if (cache_tables)
{

View File

@ -410,7 +410,7 @@ std::unique_ptr<ReadBuffer> FormatFactory::wrapReadBufferIfNeeded(
static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context)
{
auto element_id = context->getProcessListElement();
auto element_id = context->getProcessListElementSafe();
if (element_id)
{
/// While preparing the query there might have been progress (for example in subscalar subqueries) so add it here

View File

@ -324,6 +324,14 @@ namespace
auto retry_timeout = timeouts.connection_timeout.totalMilliseconds();
auto session = pool_ptr->second->get(retry_timeout);
const auto & session_data = session->sessionData();
if (session_data.empty() || !Poco::AnyCast<HTTPSessionReuseTag>(&session_data))
{
/// Reset session if it is not reusable. See comment for HTTPSessionReuseTag.
session->reset();
}
session->attachSessionData({});
setTimeouts(*session, timeouts);
return session;

View File

@ -74,8 +74,17 @@ void resetSessionIfNeeded(bool read_all_range_successfully, std::optional<Aws::S
}
else if (auto session = getSession(*read_result); !session.isNull())
{
DB::markSessionForReuse(session);
ProfileEvents::increment(ProfileEvents::ReadBufferFromS3PreservedSessions);
if (!session->getProxyHost().empty())
{
/// Reset proxied sessions because the proxy can change for every request. See ProxyConfigurationResolver.
resetSession(*read_result);
ProfileEvents::increment(ProfileEvents::ReadBufferFromS3ResetSessions);
}
else
{
DB::markSessionForReuse(session);
ProfileEvents::increment(ProfileEvents::ReadBufferFromS3PreservedSessions);
}
}
}
}

View File

@ -276,7 +276,7 @@ void PocoHTTPClient::makeRequestInternal(
{
/// Most sessions in pool are already connected and it is not possible to set proxy host/port to a connected session.
const auto request_configuration = per_request_configuration();
if (http_connection_pool_size && request_configuration.host.empty())
if (http_connection_pool_size)
makeRequestInternalImpl<true>(request, request_configuration, response, readLimiter, writeLimiter);
else
makeRequestInternalImpl<false>(request, request_configuration, response, readLimiter, writeLimiter);

View File

@ -3588,6 +3588,15 @@ std::shared_ptr<FilesystemCacheLog> Context::getFilesystemCacheLog() const
return shared->system_logs->filesystem_cache_log;
}
std::shared_ptr<S3QueueLog> Context::getS3QueueLog() const
{
auto lock = getGlobalSharedLock();
if (!shared->system_logs)
return {};
return shared->system_logs->s3_queue_log;
}
std::shared_ptr<FilesystemReadPrefetchesLog> Context::getFilesystemReadPrefetchesLog() const
{
auto lock = getGlobalSharedLock();

View File

@ -105,6 +105,7 @@ class TransactionsInfoLog;
class ProcessorsProfileLog;
class FilesystemCacheLog;
class FilesystemReadPrefetchesLog;
class S3QueueLog;
class AsynchronousInsertLog;
class BackupLog;
class IAsynchronousReader;
@ -1041,6 +1042,7 @@ public:
std::shared_ptr<TransactionsInfoLog> getTransactionsInfoLog() const;
std::shared_ptr<ProcessorsProfileLog> getProcessorsProfileLog() const;
std::shared_ptr<FilesystemCacheLog> getFilesystemCacheLog() const;
std::shared_ptr<S3QueueLog> getS3QueueLog() const;
std::shared_ptr<FilesystemReadPrefetchesLog> getFilesystemReadPrefetchesLog() const;
std::shared_ptr<AsynchronousInsertLog> getAsynchronousInsertLog() const;
std::shared_ptr<BackupLog> getBackupLog() const;

View File

@ -21,14 +21,15 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
GinFilterParameters::GinFilterParameters(size_t ngrams_, Float64 density_)
GinFilterParameters::GinFilterParameters(size_t ngrams_, UInt64 max_rows_per_postings_list_)
: ngrams(ngrams_)
, density(density_)
, max_rows_per_postings_list(max_rows_per_postings_list_)
{
if (max_rows_per_postings_list == UNLIMITED_ROWS_PER_POSTINGS_LIST)
max_rows_per_postings_list = std::numeric_limits<UInt64>::max();
if (ngrams > 8)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The size of inverted index filter cannot be greater than 8");
if (density <= 0 || density > 1)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "The density inverted index gin filter must be between 0 and 1");
}
GinFilter::GinFilter(const GinFilterParameters & params_)
@ -36,7 +37,7 @@ GinFilter::GinFilter(const GinFilterParameters & params_)
{
}
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const
void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const
{
if (len > FST::MAX_TERM_LENGTH)
return;
@ -51,8 +52,7 @@ void GinFilter::add(const char * data, size_t len, UInt32 rowID, GinIndexStorePt
}
else
{
UInt64 size_limit = std::lround(limit * params.density);
auto builder = std::make_shared<GinIndexPostingsBuilder>(size_limit);
auto builder = std::make_shared<GinIndexPostingsBuilder>(params.max_rows_per_postings_list);
builder->add(rowID);
store->setPostingsBuilder(term, builder);

View File

@ -8,13 +8,16 @@ namespace DB
{
static inline constexpr auto INVERTED_INDEX_NAME = "inverted";
static inline constexpr UInt64 UNLIMITED_ROWS_PER_POSTINGS_LIST = 0;
static inline constexpr UInt64 MIN_ROWS_PER_POSTINGS_LIST = 8 * 1024;
static inline constexpr UInt64 DEFAULT_MAX_ROWS_PER_POSTINGS_LIST = 64 * 1024;
struct GinFilterParameters
{
GinFilterParameters(size_t ngrams_, Float64 density_);
GinFilterParameters(size_t ngrams_, UInt64 max_rows_per_postings_list_);
size_t ngrams;
Float64 density;
UInt64 max_rows_per_postings_list;
};
struct GinSegmentWithRowIdRange
@ -42,7 +45,7 @@ public:
/// Add term (located at 'data' with length 'len') and its row ID to the postings list builder
/// for building inverted index for the given store.
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store, UInt64 limit) const;
void add(const char * data, size_t len, UInt32 rowID, GinIndexStorePtr & store) const;
/// Accumulate (segmentID, RowIDStart, RowIDEnd) for building skipping index
void addRowRangeToGinFilter(UInt32 segmentID, UInt32 rowIDStart, UInt32 rowIDEnd);

View File

@ -24,7 +24,10 @@ String InterpreterShowColumnsQuery::getRewrittenQuery()
{
const auto & query = query_ptr->as<ASTShowColumnsQuery &>();
const bool use_mysql_types = getContext()->getSettingsRef().use_mysql_types_in_show_columns;
const auto & settings = getContext()->getSettingsRef();
const bool use_mysql_types = settings.use_mysql_types_in_show_columns;
const bool remap_string_as_text = settings.mysql_map_string_to_text_in_show_columns;
const bool remap_fixed_string_as_text = settings.mysql_map_fixed_string_to_text_in_show_columns;
WriteBufferFromOwnString buf_database;
String resolved_database = getContext()->resolveDatabase(query.database);
@ -37,42 +40,51 @@ String InterpreterShowColumnsQuery::getRewrittenQuery()
String rewritten_query;
if (use_mysql_types)
{
/// Cheapskate SQL-based mapping from native types to MySQL types, see https://dev.mysql.com/doc/refman/8.0/en/data-types.html
/// Only used with setting 'use_mysql_types_in_show_columns = 1'
/// Known issues:
/// - Enums are translated to TEXT
rewritten_query += R"(
rewritten_query += fmt::format(
R"(
WITH map(
'Int8', 'TINYINT',
'Int16', 'SMALLINT',
'Int32', 'INTEGER',
'Int64', 'BIGINT',
'UInt8', 'TINYINT UNSIGNED',
'UInt16', 'SMALLINT UNSIGNED',
'UInt32', 'INTEGER UNSIGNED',
'UInt64', 'BIGINT UNSIGNED',
'Float32', 'FLOAT',
'Float64', 'DOUBLE',
'String', 'BLOB',
'UUID', 'CHAR',
'Bool', 'TINYINT',
'Date', 'DATE',
'Date32', 'DATE',
'DateTime', 'DATETIME',
'DateTime64', 'DATETIME',
'Map', 'JSON',
'Tuple', 'JSON',
'Object', 'JSON') AS native_to_mysql_mapping,
splitByRegexp('\(|\)', type_) AS split,
multiIf(startsWith(type_, 'LowCardinality(Nullable'), split[3],
startsWith(type_, 'LowCardinality'), split[2],
startsWith(type_, 'Nullable'), split[2],
split[1]) AS inner_type,
if (length(split) > 1, splitByString(', ', split[2]), []) AS decimal_scale_and_precision,
multiIf(inner_type = 'Decimal' AND toInt8(decimal_scale_and_precision[1]) <= 65 AND toInt8(decimal_scale_and_precision[2]) <= 30, concat('DECIMAL(', decimal_scale_and_precision[1], ', ', decimal_scale_and_precision[2], ')'),
mapContains(native_to_mysql_mapping, inner_type) = true, native_to_mysql_mapping[inner_type],
'TEXT') AS mysql_type
)";
'Int8', 'TINYINT',
'Int16', 'SMALLINT',
'Int32', 'INTEGER',
'Int64', 'BIGINT',
'UInt8', 'TINYINT UNSIGNED',
'UInt16', 'SMALLINT UNSIGNED',
'UInt32', 'INTEGER UNSIGNED',
'UInt64', 'BIGINT UNSIGNED',
'Float32', 'FLOAT',
'Float64', 'DOUBLE',
'UUID', 'CHAR',
'Bool', 'TINYINT',
'Date', 'DATE',
'Date32', 'DATE',
'DateTime', 'DATETIME',
'DateTime64', 'DATETIME',
'Map', 'JSON',
'Tuple', 'JSON',
'Object', 'JSON',
'String', '{}',
'FixedString', '{}') AS native_to_mysql_mapping,
)",
remap_string_as_text ? "TEXT" : "BLOB",
remap_fixed_string_as_text ? "TEXT" : "BLOB");
rewritten_query += R"(
splitByRegexp('\(|\)', type_) AS split,
multiIf(startsWith(type_, 'LowCardinality(Nullable'), split[3],
startsWith(type_, 'LowCardinality'), split[2],
startsWith(type_, 'Nullable'), split[2],
split[1]) AS inner_type,
if (length(split) > 1, splitByString(', ', split[2]), []) AS decimal_scale_and_precision,
multiIf(inner_type = 'Decimal' AND toInt8(decimal_scale_and_precision[1]) <= 65 AND toInt8(decimal_scale_and_precision[2]) <= 30, concat('DECIMAL(', decimal_scale_and_precision[1], ', ', decimal_scale_and_precision[2], ')'),
mapContains(native_to_mysql_mapping, inner_type) = true, native_to_mysql_mapping[inner_type],
'TEXT') AS mysql_type
)";
}
rewritten_query += R"(
SELECT

View File

@ -68,6 +68,10 @@ private:
void QueryNormalizer::visit(ASTIdentifier & node, ASTPtr & ast, Data & data)
{
/// We do handle cycles via tracking current_asts,
/// but in case of a bug in that tricky logic we need to prevent stack overflow
checkStackSize();
auto & current_asts = data.current_asts;
String & current_alias = data.current_alias;

View File

@ -0,0 +1,62 @@
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeMap.h>
#include <Interpreters/ProfileEventsExt.h>
#include <DataTypes/DataTypeEnum.h>
#include <Interpreters/S3QueueLog.h>
namespace DB
{
NamesAndTypesList S3QueueLogElement::getNamesAndTypes()
{
auto status_datatype = std::make_shared<DataTypeEnum8>(
DataTypeEnum8::Values
{
{"Processed", static_cast<Int8>(S3QueueLogElement::S3QueueStatus::Processed)},
{"Failed", static_cast<Int8>(S3QueueLogElement::S3QueueStatus::Failed)},
});
return {
{"event_date", std::make_shared<DataTypeDate>()},
{"event_time", std::make_shared<DataTypeDateTime>()},
{"table_uuid", std::make_shared<DataTypeString>()},
{"file_name", std::make_shared<DataTypeString>()},
{"rows_processed", std::make_shared<DataTypeUInt64>()},
{"status", status_datatype},
{"processing_start_time", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeDateTime>())},
{"processing_end_time", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeDateTime>())},
{"ProfileEvents", std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeUInt64>())},
{"exception", std::make_shared<DataTypeString>()},
};
}
void S3QueueLogElement::appendToBlock(MutableColumns & columns) const
{
size_t i = 0;
columns[i++]->insert(DateLUT::instance().toDayNum(event_time).toUnderType());
columns[i++]->insert(event_time);
columns[i++]->insert(table_uuid);
columns[i++]->insert(file_name);
columns[i++]->insert(rows_processed);
columns[i++]->insert(status);
if (processing_start_time)
columns[i++]->insert(processing_start_time);
else
columns[i++]->insertDefault();
if (processing_end_time)
columns[i++]->insert(processing_end_time);
else
columns[i++]->insertDefault();
ProfileEvents::dumpToMapColumn(counters_snapshot, columns[i++].get(), true);
columns[i++]->insert(exception);
}
}

View File

@ -0,0 +1,43 @@
#pragma once
#include <Common/ProfileEvents.h>
#include <Core/NamesAndAliases.h>
#include <Core/NamesAndTypes.h>
#include <Interpreters/SystemLog.h>
namespace DB
{
struct S3QueueLogElement
{
time_t event_time{};
std::string table_uuid;
std::string file_name;
size_t rows_processed = 0;
enum class S3QueueStatus
{
Processed,
Failed,
};
S3QueueStatus status;
ProfileEvents::Counters::Snapshot counters_snapshot;
time_t processing_start_time;
time_t processing_end_time;
std::string exception;
static std::string name() { return "S3QueueLog"; }
static NamesAndTypesList getNamesAndTypes();
static NamesAndAliases getNamesAndAliases() { return {}; }
void appendToBlock(MutableColumns & columns) const;
static const char * getCustomColumnList() { return nullptr; }
};
class S3QueueLog : public SystemLog<S3QueueLogElement>
{
using SystemLog<S3QueueLogElement>::SystemLog;
};
}

View File

@ -19,6 +19,7 @@
#include <Interpreters/TransactionsInfoLog.h>
#include <Interpreters/FilesystemCacheLog.h>
#include <Interpreters/FilesystemReadPrefetchesLog.h>
#include <Interpreters/S3QueueLog.h>
#include <Interpreters/ZooKeeperLog.h>
#include <Interpreters/BackupLog.h>
#include <Parsers/ASTCreateQuery.h>
@ -289,6 +290,7 @@ SystemLogs::SystemLogs(ContextPtr global_context, const Poco::Util::AbstractConf
processors_profile_log = createSystemLog<ProcessorsProfileLog>(global_context, "system", "processors_profile_log", config, "processors_profile_log");
asynchronous_insert_log = createSystemLog<AsynchronousInsertLog>(global_context, "system", "asynchronous_insert_log", config, "asynchronous_insert_log");
backup_log = createSystemLog<BackupLog>(global_context, "system", "backup_log", config, "backup_log");
s3_queue_log = createSystemLog<S3QueueLog>(global_context, "system", "s3queue_log", config, "s3queue_log");
if (query_log)
logs.emplace_back(query_log.get());
@ -329,6 +331,8 @@ SystemLogs::SystemLogs(ContextPtr global_context, const Poco::Util::AbstractConf
logs.emplace_back(asynchronous_insert_log.get());
if (backup_log)
logs.emplace_back(backup_log.get());
if (s3_queue_log)
logs.emplace_back(s3_queue_log.get());
try
{

View File

@ -50,6 +50,7 @@ class FilesystemCacheLog;
class FilesystemReadPrefetchesLog;
class AsynchronousInsertLog;
class BackupLog;
class S3QueueLog;
/// System logs should be destroyed in destructor of the last Context and before tables,
/// because SystemLog destruction makes insert query while flushing data into underlying tables
@ -70,6 +71,7 @@ struct SystemLogs
std::shared_ptr<MetricLog> metric_log; /// Used to log all metrics.
std::shared_ptr<FilesystemCacheLog> filesystem_cache_log;
std::shared_ptr<FilesystemReadPrefetchesLog> filesystem_read_prefetches_log;
std::shared_ptr<S3QueueLog> s3_queue_log;
/// Metrics from system.asynchronous_metrics.
std::shared_ptr<AsynchronousMetricLog> asynchronous_metric_log;
/// OpenTelemetry trace spans.

View File

@ -988,7 +988,11 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
}
QueryCachePtr query_cache = context->getQueryCache();
const bool can_use_query_cache = query_cache != nullptr && settings.use_query_cache && !internal && (ast->as<ASTSelectQuery>() || ast->as<ASTSelectWithUnionQuery>());
const bool can_use_query_cache = query_cache != nullptr
&& settings.use_query_cache
&& !internal
&& client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY
&& (ast->as<ASTSelectQuery>() || ast->as<ASTSelectWithUnionQuery>());
QueryCache::Usage query_cache_usage = QueryCache::Usage::None;
if (!async_insert)
@ -1332,7 +1336,46 @@ void executeQuery(
BlockIO streams;
OutputFormatPtr output_format;
std::tie(ast, streams) = executeQueryImpl(begin, end, context, false, QueryProcessingStage::Complete, &istr);
auto update_format_for_exception_if_needed = [&]()
{
if (!output_format)
{
try
{
String format_name = context->getDefaultFormat();
output_format = FormatFactory::instance().getOutputFormat(format_name, ostr, {}, context, output_format_settings);
if (output_format && output_format->supportsWritingException())
{
/// Force an update of the headers before we start writing
result_details.content_type = output_format->getContentType();
result_details.format = format_name;
set_result_details(result_details);
set_result_details = nullptr;
}
}
catch (const DB::Exception & e)
{
/// Ignore this exception and report the original one
LOG_WARNING(&Poco::Logger::get("executeQuery"), getExceptionMessageAndPattern(e, true));
}
}
};
try
{
std::tie(ast, streams) = executeQueryImpl(begin, end, context, false, QueryProcessingStage::Complete, &istr);
}
catch (...)
{
if (handle_exception_in_output_format)
{
update_format_for_exception_if_needed();
if (output_format)
handle_exception_in_output_format(*output_format);
}
throw;
}
auto & pipeline = streams.pipeline;
std::unique_ptr<WriteBuffer> compressed_buffer;
@ -1426,8 +1469,12 @@ void executeQuery(
}
catch (...)
{
if (handle_exception_in_output_format && output_format)
handle_exception_in_output_format(*output_format);
if (handle_exception_in_output_format)
{
update_format_for_exception_if_needed();
if (output_format)
handle_exception_in_output_format(*output_format);
}
streams.onException();
throw;
}

View File

@ -36,8 +36,9 @@ String ASTPartition::getID(char delim) const
{
if (value)
return "Partition";
else
return "Partition_ID" + (delim + id->getID());
std::string id_string = id ? id->getID() : "";
return "Partition_ID" + (delim + id_string);
}
ASTPtr ASTPartition::clone() const

View File

@ -574,7 +574,7 @@ void RemoteQueryExecutor::processMergeTreeInitialReadAnnouncement(InitialAllRang
if (!extension || !extension->parallel_reading_coordinator)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Coordinator for parallel reading from replicas is not initialized");
extension->parallel_reading_coordinator->handleInitialAllRangesAnnouncement(announcement);
extension->parallel_reading_coordinator->handleInitialAllRangesAnnouncement(std::move(announcement));
}
void RemoteQueryExecutor::finish()

View File

@ -21,18 +21,46 @@ ActiveDataPartSet::ActiveDataPartSet(MergeTreeDataFormatVersion format_version_,
add(name);
}
bool ActiveDataPartSet::add(const String & name, Strings * out_replaced_parts)
ActiveDataPartSet::AddPartOutcome ActiveDataPartSet::tryAddPart(const MergeTreePartInfo & part_info, String * out_reason)
{
auto part_info = MergeTreePartInfo::fromPartName(name, format_version);
return add(part_info, name, out_replaced_parts);
return addImpl(part_info, part_info.getPartNameAndCheckFormat(format_version), nullptr, out_reason);
}
bool ActiveDataPartSet::add(const MergeTreePartInfo & part_info, const String & name, Strings * out_replaced_parts)
{
String out_reason;
AddPartOutcome outcome = addImpl(part_info, name, out_replaced_parts, &out_reason);
if (outcome == AddPartOutcome::HasIntersectingPart)
{
chassert(!out_reason.empty());
throw Exception(ErrorCodes::LOGICAL_ERROR, fmt::runtime(out_reason));
}
return outcome == AddPartOutcome::Added;
}
bool ActiveDataPartSet::add(const String & name, Strings * out_replaced_parts)
{
auto part_info = MergeTreePartInfo::fromPartName(name, format_version);
String out_reason;
AddPartOutcome outcome = addImpl(part_info, name, out_replaced_parts, &out_reason);
if (outcome == AddPartOutcome::HasIntersectingPart)
{
chassert(!out_reason.empty());
throw Exception(ErrorCodes::LOGICAL_ERROR, fmt::runtime(out_reason));
}
return outcome == AddPartOutcome::Added;
}
ActiveDataPartSet::AddPartOutcome ActiveDataPartSet::addImpl(const MergeTreePartInfo & part_info, const String & name, Strings * out_replaced_parts, String * out_reason)
{
/// TODO make it exception safe (out_replaced_parts->push_back(...) may throw)
if (getContainingPartImpl(part_info) != part_info_to_name.end())
return false;
return AddPartOutcome::HasCovering;
/// Parts contained in `part` are located contiguously in `part_info_to_name`, overlapping with the place where the part itself would be inserted.
auto it = part_info_to_name.lower_bound(part_info);
@ -47,10 +75,15 @@ bool ActiveDataPartSet::add(const MergeTreePartInfo & part_info, const String &
if (!part_info.contains(it->first))
{
if (!part_info.isDisjoint(it->first))
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Part {} intersects previous part {}. "
"It is a bug or a result of manual intervention in the ZooKeeper data.",
part_info.getPartNameForLogs(), it->first.getPartNameForLogs());
{
if (out_reason != nullptr)
*out_reason = fmt::format(
"Part {} intersects previous part {}. "
"It is a bug or a result of manual intervention in the ZooKeeper data.",
part_info.getPartNameForLogs(),
it->first.getPartNameForLogs());
return AddPartOutcome::HasIntersectingPart;
}
++it;
break;
}
@ -73,18 +106,33 @@ bool ActiveDataPartSet::add(const MergeTreePartInfo & part_info, const String &
}
if (it != part_info_to_name.end() && !part_info.isDisjoint(it->first))
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Part {} intersects part {}. It is a bug or a result of manual intervention "
"in the ZooKeeper data.", name, it->first.getPartNameForLogs());
{
if (out_reason != nullptr)
*out_reason = fmt::format(
"Part {} intersects part {}. It is a bug or a result of manual intervention "
"in the ZooKeeper data.",
name,
it->first.getPartNameForLogs());
return AddPartOutcome::HasIntersectingPart;
}
part_info_to_name.emplace(part_info, name);
return true;
return AddPartOutcome::Added;
}
bool ActiveDataPartSet::add(const MergeTreePartInfo & part_info, Strings * out_replaced_parts)
{
return add(part_info, part_info.getPartNameAndCheckFormat(format_version), out_replaced_parts);
String out_reason;
AddPartOutcome outcome = addImpl(part_info, part_info.getPartNameAndCheckFormat(format_version), out_replaced_parts, &out_reason);
if (outcome == AddPartOutcome::HasIntersectingPart)
{
chassert(!out_reason.empty());
throw Exception(ErrorCodes::LOGICAL_ERROR, fmt::runtime(out_reason));
}
return outcome == AddPartOutcome::Added;
}

View File

@ -22,6 +22,13 @@ using Strings = std::vector<String>;
class ActiveDataPartSet
{
public:
enum class AddPartOutcome
{
Added,
HasCovering,
HasIntersectingPart,
};
explicit ActiveDataPartSet(MergeTreeDataFormatVersion format_version_) : format_version(format_version_) {}
ActiveDataPartSet(MergeTreeDataFormatVersion format_version_, const Strings & names);
@ -43,6 +50,8 @@ public:
bool add(const MergeTreePartInfo & part_info, const String & name, Strings * out_replaced_parts = nullptr);
bool add(const MergeTreePartInfo & part_info, Strings * out_replaced_parts = nullptr);
AddPartOutcome tryAddPart(const MergeTreePartInfo & part_info, String * out_reason = nullptr);
bool remove(const MergeTreePartInfo & part_info)
{
return part_info_to_name.erase(part_info) > 0;
@ -97,6 +106,8 @@ public:
MergeTreeDataFormatVersion getFormatVersion() const { return format_version; }
private:
AddPartOutcome addImpl(const MergeTreePartInfo & part_info, const String & name, Strings * out_replaced_parts = nullptr, String * out_reason = nullptr);
MergeTreeDataFormatVersion format_version;
std::map<MergeTreePartInfo, String> part_info_to_name;

View File

@ -20,6 +20,7 @@
#include <Common/ThreadFuzzer.h>
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <Common/Config/ConfigHelper.h>
#include <Storages/MergeTree/RangesInDataPart.h>
#include <Compression/CompressedReadBuffer.h>
#include <Core/QueryProcessingStage.h>
#include <DataTypes/DataTypeEnum.h>
@ -76,6 +77,7 @@
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergeTreeDataPartWide.h>
#include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
#include <Storages/MergeTree/ActiveDataPartSet.h>
#include <Storages/StorageMergeTree.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/VirtualColumnUtils.h>
@ -96,6 +98,7 @@
#include <iomanip>
#include <limits>
#include <optional>
#include <ranges>
#include <set>
#include <thread>
#include <typeinfo>
@ -3915,25 +3918,17 @@ void MergeTreeData::forcefullyMovePartToDetachedAndRemoveFromMemory(const MergeT
return;
}
/// Let's restore some parts covered by the unexpected part to avoid partial data
if (restore_covered)
{
Strings restored;
bool error = false;
String error_parts;
Int64 pos = part->info.min_block;
Strings error_parts;
auto is_appropriate_state = [] (DataPartState state)
{
return state == DataPartState::Active || state == DataPartState::Outdated;
};
auto update_error = [&] (DataPartIteratorByInfo it)
{
error = true;
error_parts += (*it)->getNameWithState() + " ";
};
auto activate_part = [this, &restored_active_part](auto it)
{
/// It's not clear what to do if we try to activate part that was removed in transaction.
@ -3951,68 +3946,90 @@ void MergeTreeData::forcefullyMovePartToDetachedAndRemoveFromMemory(const MergeT
restored_active_part = true;
};
auto it_middle = data_parts_by_info.lower_bound(part->info);
/// ActiveDataPartSet allows restoring the most top-level parts in place of the unexpected one.
/// This can be important in case of assigned merges: if the unexpected part is the result of some
/// finished but not committed merge, then we should restore (or at least try to restore)
/// the closest ancestors of the unexpected part to be able to execute it.
/// However, this is not guaranteed because outdated parts can intersect.
ActiveDataPartSet parts_for_replacement(format_version);
auto range = getDataPartsPartitionRange(part->info.partition_id);
DataPartsVector parts_candidates(range.begin(), range.end());
/// Restore the leftmost part covered by the part
if (it_middle != data_parts_by_info.begin())
/// In case of intersecting outdated parts we want to add bigger parts (with higher level) first
auto comparator = [] (const DataPartPtr left, const DataPartPtr right) -> bool
{
auto it = std::prev(it_middle);
if (part->contains(**it) && is_appropriate_state((*it)->getState()))
{
/// Maybe, we must consider part level somehow
if ((*it)->info.min_block != part->info.min_block)
update_error(it);
if ((*it)->getState() != DataPartState::Active)
activate_part(it);
pos = (*it)->info.max_block + 1;
restored.push_back((*it)->name);
}
else if ((*it)->info.partition_id == part->info.partition_id)
update_error(it);
if (left->info.level < right->info.level)
return true;
else if (left->info.level > right->info.level)
return false;
else
error = true;
return left->info.mutation < right->info.mutation;
};
std::sort(parts_candidates.begin(), parts_candidates.end(), comparator);
/// From larger to smaller parts
for (const auto & part_candidate_in_partition : parts_candidates | std::views::reverse)
{
if (part->info.contains(part_candidate_in_partition->info)
&& is_appropriate_state(part_candidate_in_partition->getState()))
{
String out_reason;
/// Outdated parts can intersect legally (because of DROP_PART); here it's okay, we
/// are trying to do our best to restore covered parts.
auto outcome = parts_for_replacement.tryAddPart(part_candidate_in_partition->info, &out_reason);
if (outcome == ActiveDataPartSet::AddPartOutcome::HasIntersectingPart)
{
error_parts.push_back(part->name);
LOG_ERROR(log, "Failed to restore part {}, because of intersection reason '{}'", part->name, out_reason);
}
}
}
if (parts_for_replacement.size() > 0)
{
std::vector<std::pair<uint64_t, uint64_t>> holes_list;
/// Most of the code below is just to write a pretty message
auto part_infos = parts_for_replacement.getPartInfos();
int64_t current_right_block = part_infos[0].min_block;
for (const auto & top_level_part_to_replace : part_infos)
{
auto data_part_it = data_parts_by_info.find(top_level_part_to_replace);
if (data_part_it == data_parts_by_info.end())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find part {} in own set", top_level_part_to_replace.getPartNameForLogs());
activate_part(data_part_it);
restored.push_back((*data_part_it)->name);
if (top_level_part_to_replace.min_block - current_right_block > 1)
holes_list.emplace_back(current_right_block, top_level_part_to_replace.min_block);
current_right_block = top_level_part_to_replace.max_block;
}
if (part->info.max_block != current_right_block)
holes_list.emplace_back(current_right_block, part->info.max_block);
for (const String & name : restored)
LOG_INFO(log, "Activated part {} in place of unexpected {}", name, part->name);
if (!error_parts.empty() || !holes_list.empty())
{
std::string error_parts_message, holes_list_message;
if (!error_parts.empty())
error_parts_message = fmt::format(" Parts failed to restore because of intersection: [{}]", fmt::join(error_parts, ", "));
if (!holes_list.empty())
{
if (!error_parts.empty())
holes_list_message = ".";
Strings holes_list_pairs;
for (const auto & [left_side, right_side] : holes_list)
holes_list_pairs.push_back(fmt::format("({}, {})", left_side + 1, right_side - 1));
holes_list_message += fmt::format(" Block ranges failed to restore: [{}]", fmt::join(holes_list_pairs, ", "));
}
LOG_WARNING(log, "The set of parts restored in place of {} looks incomplete. "
"SELECT queries may observe gaps in data until this replica is synchronized with other replicas.{}{}",
part->name, error_parts_message, holes_list_message);
}
}
else
error = true;
/// Restore "right" parts
for (auto it = it_middle; it != data_parts_by_info.end() && part->contains(**it); ++it)
{
if ((*it)->info.min_block < pos)
continue;
if (!is_appropriate_state((*it)->getState()))
{
update_error(it);
continue;
}
if ((*it)->info.min_block > pos)
update_error(it);
if ((*it)->getState() != DataPartState::Active)
activate_part(it);
pos = (*it)->info.max_block + 1;
restored.push_back((*it)->name);
}
if (pos != part->info.max_block + 1)
error = true;
for (const String & name : restored)
{
LOG_INFO(log, "Activated part {}", name);
}
if (error)
{
LOG_WARNING(log, "The set of parts restored in place of {} looks incomplete. "
"SELECT queries may observe gaps in data until this replica is synchronized with other replicas.{}",
part->name, (error_parts.empty() ? "" : " Suspicious parts: " + error_parts));
LOG_INFO(log, "Don't find any parts for replacement instead of unexpected {}", part->name);
}
}

View File

@ -109,14 +109,14 @@ MergeTreeIndexGranulePtr MergeTreeIndexAggregatorInverted::getGranuleAndReset()
return new_granule;
}
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit)
void MergeTreeIndexAggregatorInverted::addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter)
{
size_t cur = 0;
size_t token_start = 0;
size_t token_len = 0;
while (cur < length && token_extractor->nextInStringPadded(data, length, &cur, &token_start, &token_len))
gin_filter.add(data + token_start, token_len, rowID, store, limit);
gin_filter.add(data + token_start, token_len, rowID, store);
}
void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos, size_t limit)
@ -150,7 +150,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t row_num = 0; row_num < elements_size; ++row_num)
{
auto ref = column_key.getDataAt(element_start_row + row_num);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size);
}
current_position += 1;
@ -165,7 +165,7 @@ void MergeTreeIndexAggregatorInverted::update(const Block & block, size_t * pos,
for (size_t i = 0; i < rows_read; ++i)
{
auto ref = column->getDataAt(current_position + i);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col], rows_read);
addToGinFilter(row_id, ref.data, ref.size, granule->gin_filters[col]);
store->incrementCurrentSizeBy(ref.size);
row_id++;
if (store->needToWrite())
@ -735,8 +735,8 @@ MergeTreeIndexPtr invertedIndexCreator(
const IndexDescription & index)
{
size_t n = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
GinFilterParameters params(n, density);
UInt64 max_rows = index.arguments.size() < 2 ? DEFAULT_MAX_ROWS_PER_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(n, max_rows);
/// Use SplitTokenExtractor when n is 0, otherwise use NgramTokenExtractor
if (n > 0)
@ -780,13 +780,16 @@ void invertedIndexValidator(const IndexDescription & index, bool /*attach*/)
if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "The first Inverted index argument must be positive integer.");
if (index.arguments.size() == 2 && (index.arguments[1].getType() != Field::Types::Float64 || index.arguments[1].get<Float64>() <= 0 || index.arguments[1].get<Float64>() > 1))
throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be a float between 0 and 1.");
if (index.arguments.size() == 2)
{
if (index.arguments[1].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::INCORRECT_QUERY, "The second Inverted index argument must be UInt64");
if (index.arguments[1].get<UInt64>() != UNLIMITED_ROWS_PER_POSTINGS_LIST && index.arguments[1].get<UInt64>() < MIN_ROWS_PER_POSTINGS_LIST)
throw Exception(ErrorCodes::INCORRECT_QUERY, "The maximum rows per postings list must be no less than {}", MIN_ROWS_PER_POSTINGS_LIST);
}
/// Just validate
size_t ngrams = index.arguments.empty() ? 0 : index.arguments[0].get<size_t>();
Float64 density = index.arguments.size() < 2 ? 1.0 : index.arguments[1].get<Float64>();
GinFilterParameters params(ngrams, density);
UInt64 max_rows_per_postings_list = index.arguments.size() < 2 ? DEFAULT_MAX_ROWS_PER_POSTINGS_LIST : index.arguments[1].get<UInt64>();
GinFilterParameters params(ngrams, max_rows_per_postings_list);
}
}

View File

@ -48,7 +48,7 @@ struct MergeTreeIndexAggregatorInverted final : IMergeTreeIndexAggregator
void update(const Block & block, size_t * pos, size_t limit) override;
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter, UInt64 limit);
void addToGinFilter(UInt32 rowID, const char * data, size_t length, GinFilter & gin_filter);
GinIndexStorePtr store;
Names index_columns;

View File

@ -134,7 +134,7 @@ public:
void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) override;
void markReplicaAsUnavailable(size_t replica_number) override;
void updateReadingState(const InitialAllRangesAnnouncement & announcement);
void updateReadingState(InitialAllRangesAnnouncement announcement);
void finalizeReadingState();
size_t computeConsistentHash(const MergeTreePartInfo & info) const
@ -152,12 +152,12 @@ DefaultCoordinator::~DefaultCoordinator()
LOG_DEBUG(log, "Coordination done: {}", toString(stats));
}
void DefaultCoordinator::updateReadingState(const InitialAllRangesAnnouncement & announcement)
void DefaultCoordinator::updateReadingState(InitialAllRangesAnnouncement announcement)
{
PartRefs parts_diff;
/// To get rid of duplicates
for (const auto & part: announcement.description)
for (auto && part: announcement.description)
{
auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info.getPartNameV1() == part.info.getPartNameV1(); });
@ -176,12 +176,7 @@ void DefaultCoordinator::updateReadingState(const InitialAllRangesAnnouncement &
if (covering_or_the_same_it != all_parts_to_read.end())
continue;
auto new_part = Part{
.description = part,
.replicas = {announcement.replica_num}
};
auto [insert_it, _] = all_parts_to_read.insert(new_part);
auto [insert_it, _] = all_parts_to_read.emplace(Part{.description = std::move(part), .replicas = {announcement.replica_num}});
parts_diff.push_back(insert_it);
}
@ -242,12 +237,14 @@ void DefaultCoordinator::finalizeReadingState()
void DefaultCoordinator::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
updateReadingState(announcement);
const auto replica_num = announcement.replica_num;
if (announcement.replica_num >= stats.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Replica number ({}) is bigger than total replicas count ({})", announcement.replica_num, stats.size());
updateReadingState(std::move(announcement));
stats[announcement.replica_num].number_of_requests +=1;
if (replica_num >= stats.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Replica number ({}) is bigger than total replicas count ({})", replica_num, stats.size());
++stats[replica_num].number_of_requests;
++sent_initial_requests;
LOG_DEBUG(log, "Sent initial requests: {} Replicas count: {}", sent_initial_requests, replicas_count);
@ -385,7 +382,7 @@ void InOrderCoordinator<mode>::handleInitialAllRangesAnnouncement(InitialAllRang
LOG_TRACE(log, "Received an announcement {}", announcement.describe());
/// To get rid of duplicates
for (const auto & part: announcement.description)
for (auto && part: announcement.description)
{
auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info == part.info; });
@ -404,13 +401,8 @@ void InOrderCoordinator<mode>::handleInitialAllRangesAnnouncement(InitialAllRang
if (covering_or_the_same_it != all_parts_to_read.end())
continue;
auto new_part = Part{
.description = part,
.replicas = {announcement.replica_num}
};
auto insert_it = all_parts_to_read.insert(new_part);
auto & ranges = insert_it.first->description.ranges;
auto [inserted_it, _] = all_parts_to_read.emplace(Part{.description = std::move(part), .replicas = {announcement.replica_num}});
auto & ranges = inserted_it->description.ranges;
std::sort(ranges.begin(), ranges.end());
}
}
@ -517,7 +509,7 @@ void ParallelReplicasReadingCoordinator::handleInitialAllRangesAnnouncement(Init
}
return pimpl->handleInitialAllRangesAnnouncement(announcement);
return pimpl->handleInitialAllRangesAnnouncement(std::move(announcement));
}
ParallelReadResponse ParallelReplicasReadingCoordinator::handleRequest(ParallelReadRequest request)

View File

@ -566,6 +566,7 @@ void MaterializedPostgreSQLConsumer::processReplicationMessage(const char * repl
void MaterializedPostgreSQLConsumer::syncTables()
{
size_t synced_tables = 0;
while (!tables_to_sync.empty())
{
auto table_name = *tables_to_sync.begin();
@ -596,6 +597,7 @@ void MaterializedPostgreSQLConsumer::syncTables()
CompletedPipelineExecutor executor(io.pipeline);
executor.execute();
++synced_tables;
}
}
catch (...)
@ -608,7 +610,8 @@ void MaterializedPostgreSQLConsumer::syncTables()
tables_to_sync.erase(tables_to_sync.begin());
}
LOG_DEBUG(log, "Table sync end for {} tables, last lsn: {} = {}, (attempted lsn {})", tables_to_sync.size(), current_lsn, getLSNValue(current_lsn), getLSNValue(final_lsn));
LOG_DEBUG(log, "Table sync end for {} tables, last lsn: {} = {}, (attempted lsn {})",
synced_tables, current_lsn, getLSNValue(current_lsn), getLSNValue(final_lsn));
updateLsn();
}

View File

@ -24,6 +24,7 @@ namespace DB
M(UInt64, materialized_postgresql_backoff_min_ms, 200, "Poll backoff start point", 0) \
M(UInt64, materialized_postgresql_backoff_max_ms, 10000, "Poll backoff max point", 0) \
M(UInt64, materialized_postgresql_backoff_factor, 2, "Poll backoff factor", 0) \
M(Bool, materialized_postgresql_use_unique_replication_consumer_identifier, false, "Should a unique consumer be registered for table replication", 0) \
DECLARE_SETTINGS_TRAITS(MaterializedPostgreSQLSettingsTraits, LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS)

View File

@ -17,12 +17,14 @@
#include <Interpreters/Context.h>
#include <Databases/DatabaseOnDisk.h>
#include <boost/algorithm/string/trim.hpp>
#include <Poco/String.h>
namespace DB
{
static const auto CLEANUP_RESCHEDULE_MS = 600000 * 3; /// 30 min
static constexpr size_t replication_slot_name_max_size = 64;
namespace ErrorCodes
{
@ -56,10 +58,70 @@ private:
};
namespace
{
/// There can be several replication slots per publication, but only one publication per table/database replication.
/// The replication slot might be unique (contain a uuid) to allow having multiple replicas for the same PostgreSQL table/database.
String getPublicationName(const String & postgres_database, const String & postgres_table)
{
return fmt::format(
"{}_ch_publication",
postgres_table.empty() ? postgres_database : fmt::format("{}_{}", postgres_database, postgres_table));
}
void checkReplicationSlot(String name)
{
for (const auto & c : name)
{
const bool ok = (std::isalpha(c) && std::islower(c)) || std::isdigit(c) || c == '_';
if (!ok)
{
throw Exception(
ErrorCodes::BAD_ARGUMENTS,
"Replication slot can contain lower-case letters, numbers, and the underscore character. "
"Got: {}", name);
}
}
if (name.size() > replication_slot_name_max_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Too big replication slot size: {}", name);
}
String normalizeReplicationSlot(String name)
{
name = Poco::toLower(name);
for (auto & c : name)
if (c == '-')
c = '_';
return name;
}
String getReplicationSlotName(
const String & postgres_database,
const String & postgres_table,
const String & clickhouse_uuid,
const MaterializedPostgreSQLSettings & replication_settings)
{
String slot_name = replication_settings.materialized_postgresql_replication_slot;
if (slot_name.empty())
{
if (replication_settings.materialized_postgresql_use_unique_replication_consumer_identifier)
slot_name = clickhouse_uuid;
else
slot_name = postgres_table.empty() ? postgres_database : fmt::format("{}_{}_ch_replication_slot", postgres_database, postgres_table);
slot_name = normalizeReplicationSlot(slot_name);
}
return slot_name;
}
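/// For illustration (an assumption-based sketch, not part of the diff): with postgres database
/// "pg_db", postgres table "events", no user-provided slot name and
/// materialized_postgresql_use_unique_replication_consumer_identifier = 0, the helpers above yield
///     replication slot: "pg_db_events_ch_replication_slot"
///     publication:      "pg_db_events_ch_publication"
/// With materialized_postgresql_use_unique_replication_consumer_identifier = 1, the slot name is the
/// ClickHouse table/database UUID, lower-cased and with '-' replaced by '_', while the publication
/// name stays the same (one publication per replicated table/database).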
}
PostgreSQLReplicationHandler::PostgreSQLReplicationHandler(
const String & replication_identifier,
const String & postgres_database_,
const String & current_database_name_,
const String & postgres_table_,
const String & clickhouse_database_,
const String & clickhouse_uuid_,
const postgres::ConnectionInfo & connection_info_,
ContextPtr context_,
bool is_attach_,
@ -70,14 +132,18 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler(
, is_attach(is_attach_)
, postgres_database(postgres_database_)
, postgres_schema(replication_settings.materialized_postgresql_schema)
, current_database_name(current_database_name_)
, current_database_name(clickhouse_database_)
, connection_info(connection_info_)
, max_block_size(replication_settings.materialized_postgresql_max_block_size)
, is_materialized_postgresql_database(is_materialized_postgresql_database_)
, tables_list(replication_settings.materialized_postgresql_tables_list)
, schema_list(replication_settings.materialized_postgresql_schema_list)
, schema_as_a_part_of_table_name(!schema_list.empty() || replication_settings.materialized_postgresql_tables_list_with_schema)
, user_managed_slot(!replication_settings.materialized_postgresql_replication_slot.value.empty())
, user_provided_snapshot(replication_settings.materialized_postgresql_snapshot)
, replication_slot(getReplicationSlotName(postgres_database_, postgres_table_, clickhouse_uuid_, replication_settings))
, tmp_replication_slot(replication_slot + "_tmp")
, publication_name(getPublicationName(postgres_database_, postgres_table_))
, reschedule_backoff_min_ms(replication_settings.materialized_postgresql_backoff_min_ms)
, reschedule_backoff_max_ms(replication_settings.materialized_postgresql_backoff_max_ms)
, reschedule_backoff_factor(replication_settings.materialized_postgresql_backoff_factor)
@ -89,13 +155,9 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler(
if (!schema_list.empty() && !postgres_schema.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot have schema list and common schema at the same time");
replication_slot = replication_settings.materialized_postgresql_replication_slot;
if (replication_slot.empty())
{
user_managed_slot = false;
replication_slot = fmt::format("{}_ch_replication_slot", replication_identifier);
}
publication_name = fmt::format("{}_ch_publication", replication_identifier);
checkReplicationSlot(replication_slot);
LOG_INFO(log, "Using replication slot {} and publication {}", replication_slot, publication_name);
startup_task = getContext()->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ checkConnectionAndStart(); });
consumer_task = getContext()->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ consumerFunc(); });
@ -496,7 +558,7 @@ void PostgreSQLReplicationHandler::createPublicationIfNeeded(pqxx::nontransactio
throw Exception(ErrorCodes::LOGICAL_ERROR, "No table found to be replicated");
/// 'ONLY' means just a table, without descendants.
std::string query_str = fmt::format("CREATE PUBLICATION {} FOR TABLE ONLY {}", publication_name, tables_list);
std::string query_str = fmt::format("CREATE PUBLICATION {} FOR TABLE ONLY {}", doubleQuoteString(publication_name), tables_list);
try
{
tx.exec(query_str);
@ -519,7 +581,7 @@ bool PostgreSQLReplicationHandler::isReplicationSlotExist(pqxx::nontransaction &
{
String slot_name;
if (temporary)
slot_name = replication_slot + "_tmp";
slot_name = tmp_replication_slot;
else
slot_name = replication_slot;
@ -546,11 +608,11 @@ void PostgreSQLReplicationHandler::createReplicationSlot(
String query_str, slot_name;
if (temporary)
slot_name = replication_slot + "_tmp";
slot_name = tmp_replication_slot;
else
slot_name = replication_slot;
query_str = fmt::format("CREATE_REPLICATION_SLOT {} LOGICAL pgoutput EXPORT_SNAPSHOT", slot_name);
query_str = fmt::format("CREATE_REPLICATION_SLOT {} LOGICAL pgoutput EXPORT_SNAPSHOT", doubleQuoteString(slot_name));
try
{
@ -573,7 +635,7 @@ void PostgreSQLReplicationHandler::dropReplicationSlot(pqxx::nontransaction & tx
std::string slot_name;
if (temporary)
slot_name = replication_slot + "_tmp";
slot_name = tmp_replication_slot;
else
slot_name = replication_slot;

View File

@ -21,9 +21,10 @@ public:
using ConsumerPtr = std::shared_ptr<MaterializedPostgreSQLConsumer>;
PostgreSQLReplicationHandler(
const String & replication_identifier,
const String & postgres_database_,
const String & current_database_name_,
const String & postgres_table_,
const String & clickhouse_database_,
const String & clickhouse_uuid_,
const postgres::ConnectionInfo & connection_info_,
ContextPtr context_,
bool is_attach_,
@ -128,10 +129,11 @@ private:
/// This makes it possible to replicate tables from multiple schemas in the same MaterializedPostgreSQL database engine.
mutable bool schema_as_a_part_of_table_name = false;
bool user_managed_slot = true;
String user_provided_snapshot;
String replication_slot, publication_name;
const bool user_managed_slot;
const String user_provided_snapshot;
const String replication_slot;
const String tmp_replication_slot;
const String publication_name;
/// Replication consumer. Manages decoding of replication stream and syncing into tables.
ConsumerPtr consumer;

View File

@ -74,13 +74,13 @@ StorageMaterializedPostgreSQL::StorageMaterializedPostgreSQL(
setInMemoryMetadata(storage_metadata);
String replication_identifier = remote_database_name + "_" + remote_table_name_;
replication_settings->materialized_postgresql_tables_list = remote_table_name_;
replication_handler = std::make_unique<PostgreSQLReplicationHandler>(
replication_identifier,
remote_database_name,
remote_table_name_,
table_id_.database_name,
toString(table_id_.uuid),
connection_info,
getContext(),
is_attach,

View File

@ -26,12 +26,13 @@ EmbeddedRocksDBSink::EmbeddedRocksDBSink(
break;
++primary_key_pos;
}
serializations = getHeader().getSerializations();
}
void EmbeddedRocksDBSink::consume(Chunk chunk)
{
auto rows = chunk.getNumRows();
auto block = getHeader().cloneWithColumns(chunk.detachColumns());
const auto & columns = chunk.getColumns();
WriteBufferFromOwnString wb_key;
WriteBufferFromOwnString wb_value;
@ -43,12 +44,9 @@ void EmbeddedRocksDBSink::consume(Chunk chunk)
wb_key.restart();
wb_value.restart();
size_t idx = 0;
for (const auto & elem : block)
{
elem.type->getDefaultSerialization()->serializeBinary(*elem.column, i, idx == primary_key_pos ? wb_key : wb_value, {});
++idx;
}
for (size_t idx = 0; idx < columns.size(); ++idx)
serializations[idx]->serializeBinary(*columns[idx], i, idx == primary_key_pos ? wb_key : wb_value, {});
status = batch.Put(wb_key.str(), wb_value.str());
if (!status.ok())
throw Exception(ErrorCodes::ROCKSDB_ERROR, "RocksDB write error: {}", status.ToString());

View File

@ -24,6 +24,7 @@ private:
StorageEmbeddedRocksDB & storage;
StorageMetadataPtr metadata_snapshot;
size_t primary_key_pos = 0;
Serializations serializations;
};
}

File diff suppressed because it is too large

View File

@ -1,127 +1,171 @@
#pragma once
#include "config.h"
#if USE_AWS_S3
#include <filesystem>
#include <Core/Types.h>
#include <Core/SettingsEnums.h>
#include <Core/BackgroundSchedulePool.h>
#include <Common/ZooKeeper/ZooKeeper.h>
# include <Core/UUID.h>
# include <Interpreters/Context.h>
# include <Storages/StorageS3Settings.h>
# include <Common/ZooKeeper/ZooKeeper.h>
namespace fs = std::filesystem;
namespace Poco { class Logger; }
namespace DB
{
class StorageS3Queue;
struct S3QueueSettings;
class StorageS3Queue;
/**
* A class for managing S3Queue metadata in zookeeper, e.g.
* the following folders:
* - <path_to_metadata>/processing
* - <path_to_metadata>/processed
* - <path_to_metadata>/failed
*
* Depending on the S3Queue processing mode (ordered or unordered),
* metadata is stored differently in the /processed node.
*
* Implements caching of zookeeper metadata for faster responses.
* Cached part is located in LocalFileStatuses.
*
* In case of Unordered mode, if a files TTL is enabled or a maximum tracked files limit is set,
* a background cleanup thread is started, which is responsible for maintaining them.
*/
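/// A minimal usage sketch (illustrative only; the zookeeper path, the settings object and the file name below
/// are placeholders, and error handling is omitted). Only declarations from this class are used:
///
///     S3QueueFilesMetadata metadata(fs::path("/clickhouse/s3queue/my_table"), settings);
///     if (auto [holder, file_status] = metadata.trySetFileAsProcessing("data/file1.csv"); holder)
///     {
///         /// ... read and stream the file ...
///         metadata.setFileProcessed(holder);                     /// on success
///         /// metadata.setFileFailed(holder, exception_message); /// on failure
///     }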
class S3QueueFilesMetadata
{
public:
struct TrackedCollectionItem
class ProcessingNodeHolder;
using ProcessingNodeHolderPtr = std::shared_ptr<ProcessingNodeHolder>;
S3QueueFilesMetadata(const fs::path & zookeeper_path_, const S3QueueSettings & settings_);
~S3QueueFilesMetadata();
void setFileProcessed(ProcessingNodeHolderPtr holder);
void setFileFailed(ProcessingNodeHolderPtr holder, const std::string & exception_message);
struct FileStatus
{
TrackedCollectionItem() = default;
TrackedCollectionItem(const String & file_path_, UInt64 timestamp_, UInt64 retries_count_, const String & last_exception_)
: file_path(file_path_), timestamp(timestamp_), retries_count(retries_count_), last_exception(last_exception_) {}
String file_path;
UInt64 timestamp = 0;
UInt64 retries_count = 0;
String last_exception;
enum class State
{
Processing,
Processed,
Failed,
None
};
State state = State::None;
std::atomic<size_t> processed_rows = 0;
time_t processing_start_time = 0;
time_t processing_end_time = 0;
size_t retries = 0;
std::string last_exception;
ProfileEvents::Counters profile_counters;
std::mutex processing_lock;
std::mutex metadata_lock;
};
using FileStatusPtr = std::shared_ptr<FileStatus>;
using FileStatuses = std::unordered_map<std::string, FileStatusPtr>;
using S3FilesCollection = std::unordered_set<String>;
using TrackedFiles = std::deque<TrackedCollectionItem>;
/// Set the file as processing, if it is not already processed, failed or being processed.
std::pair<ProcessingNodeHolderPtr, FileStatusPtr> trySetFileAsProcessing(const std::string & path);
S3QueueFilesMetadata(const StorageS3Queue * storage_, const S3QueueSettings & settings_);
FileStatusPtr getFileStatus(const std::string & path);
void setFilesProcessing(const Strings & file_paths);
void setFileProcessed(const String & file_path);
bool setFileFailed(const String & file_path, const String & exception_message);
FileStatuses getFileStateses() const { return local_file_statuses.getAll(); }
S3FilesCollection getProcessedFailedAndProcessingFiles();
String getMaxProcessedFile();
std::shared_ptr<zkutil::EphemeralNodeHolder> acquireLock(zkutil::ZooKeeperPtr zookeeper);
bool checkSettings(const S3QueueSettings & settings) const;
struct S3QueueCollection
{
public:
virtual ~S3QueueCollection() = default;
virtual String toString() const;
S3FilesCollection getFileNames();
virtual void parse(const String & collection_str) = 0;
protected:
TrackedFiles files;
void read(ReadBuffer & in);
void write(WriteBuffer & out) const;
};
struct S3QueueProcessedCollection : public S3QueueCollection
{
public:
S3QueueProcessedCollection(const UInt64 & max_size_, const UInt64 & max_age_);
void parse(const String & collection_str) override;
void add(const String & file_name);
private:
const UInt64 max_size;
const UInt64 max_age;
};
struct S3QueueFailedCollection : S3QueueCollection
{
public:
S3QueueFailedCollection(const UInt64 & max_retries_count_);
void parse(const String & collection_str) override;
bool add(const String & file_name, const String & exception_message);
S3FilesCollection getFileNames();
private:
UInt64 max_retries_count;
};
struct S3QueueProcessingCollection
{
public:
S3QueueProcessingCollection() = default;
void parse(const String & collection_str);
void add(const Strings & file_names);
void remove(const String & file_name);
String toString() const;
const S3FilesCollection & getFileNames() const { return files; }
private:
S3FilesCollection files;
};
void deactivateCleanupTask();
private:
const StorageS3Queue * storage;
const S3QueueMode mode;
const UInt64 max_set_size;
const UInt64 max_set_age_sec;
const UInt64 max_loading_retries;
const size_t min_cleanup_interval_ms;
const size_t max_cleanup_interval_ms;
const String zookeeper_processing_path;
const String zookeeper_processed_path;
const String zookeeper_failed_path;
const String zookeeper_lock_path;
const fs::path zookeeper_processing_path;
const fs::path zookeeper_processed_path;
const fs::path zookeeper_failed_path;
const fs::path zookeeper_cleanup_lock_path;
mutable std::mutex mutex;
Poco::Logger * log;
S3FilesCollection getFailedFiles();
S3FilesCollection getProcessingFiles();
S3FilesCollection getUnorderedProcessedFiles();
std::atomic_bool shutdown = false;
BackgroundSchedulePool::TaskHolder task;
void removeProcessingFile(const String & file_path);
std::string getNodeName(const std::string & path);
zkutil::ZooKeeperPtr getZooKeeper() const;
void setFileProcessedForOrderedMode(ProcessingNodeHolderPtr holder);
void setFileProcessedForUnorderedMode(ProcessingNodeHolderPtr holder);
enum class SetFileProcessingResult
{
Success,
ProcessingByOtherNode,
AlreadyProcessed,
AlreadyFailed,
};
std::pair<SetFileProcessingResult, ProcessingNodeHolderPtr> trySetFileAsProcessingForOrderedMode(const std::string & path);
std::pair<SetFileProcessingResult, ProcessingNodeHolderPtr> trySetFileAsProcessingForUnorderedMode(const std::string & path);
struct NodeMetadata
{
std::string file_path;
UInt64 last_processed_timestamp = 0;
std::string last_exception;
UInt64 retries = 0;
std::string processing_id; /// For ephemeral processing node.
std::string toString() const;
static NodeMetadata fromString(const std::string & metadata_str);
};
NodeMetadata createNodeMetadata(const std::string & path, const std::string & exception = "", size_t retries = 0);
void cleanupThreadFunc();
void cleanupThreadFuncImpl();
struct LocalFileStatuses
{
FileStatuses file_statuses;
mutable std::mutex mutex;
FileStatuses getAll() const;
FileStatusPtr get(const std::string & filename, bool create);
bool remove(const std::string & filename, bool if_exists);
std::unique_lock<std::mutex> lock() const;
};
LocalFileStatuses local_file_statuses;
};
class S3QueueFilesMetadata::ProcessingNodeHolder
{
friend class S3QueueFilesMetadata;
public:
ProcessingNodeHolder(
const std::string & processing_id_,
const std::string & path_,
const std::string & zk_node_path_,
zkutil::ZooKeeperPtr zk_client_);
~ProcessingNodeHolder();
private:
bool remove(Coordination::Requests * requests = nullptr, Coordination::Responses * responses = nullptr);
zkutil::ZooKeeperPtr zk_client;
std::string path;
std::string zk_node_path;
std::string processing_id;
bool removed = false;
Poco::Logger * log;
};
}
#endif

View File

@ -0,0 +1,70 @@
#include <Storages/S3Queue/S3QueueMetadataFactory.h>
#include <Interpreters/Context.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
S3QueueMetadataFactory & S3QueueMetadataFactory::instance()
{
static S3QueueMetadataFactory ret;
return ret;
}
S3QueueMetadataFactory::FilesMetadataPtr
S3QueueMetadataFactory::getOrCreate(const std::string & zookeeper_path, const S3QueueSettings & settings)
{
std::lock_guard lock(mutex);
auto it = metadata_by_path.find(zookeeper_path);
if (it == metadata_by_path.end())
{
it = metadata_by_path.emplace(zookeeper_path, std::make_shared<S3QueueFilesMetadata>(fs::path(zookeeper_path), settings)).first;
}
else if (it->second.metadata->checkSettings(settings))
{
it->second.ref_count += 1;
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Metadata with the same `s3queue_zookeeper_path` "
"was already created but with different settings");
}
return it->second.metadata;
}
void S3QueueMetadataFactory::remove(const std::string & zookeeper_path)
{
std::lock_guard lock(mutex);
auto it = metadata_by_path.find(zookeeper_path);
if (it == metadata_by_path.end())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Metadata with zookeeper path {} does not exist", zookeeper_path);
if (--it->second.ref_count == 0)
{
try
{
auto zk_client = Context::getGlobalContextInstance()->getZooKeeper();
zk_client->tryRemove(it->first);
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
metadata_by_path.erase(it);
}
}
std::unordered_map<std::string, S3QueueMetadataFactory::FilesMetadataPtr> S3QueueMetadataFactory::getAll()
{
std::unordered_map<std::string, S3QueueMetadataFactory::FilesMetadataPtr> result;
for (const auto & [zk_path, metadata_and_ref_count] : metadata_by_path)
result.emplace(zk_path, metadata_and_ref_count.metadata);
return result;
}
}

View File

@ -0,0 +1,36 @@
#pragma once
#include <boost/noncopyable.hpp>
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Storages/S3Queue/S3QueueFilesMetadata.h>
namespace DB
{
class S3QueueMetadataFactory final : private boost::noncopyable
{
public:
using FilesMetadataPtr = std::shared_ptr<S3QueueFilesMetadata>;
static S3QueueMetadataFactory & instance();
FilesMetadataPtr getOrCreate(const std::string & zookeeper_path, const S3QueueSettings & settings);
void remove(const std::string & zookeeper_path);
std::unordered_map<std::string, FilesMetadataPtr> getAll();
private:
struct Metadata
{
explicit Metadata(std::shared_ptr<S3QueueFilesMetadata> metadata_) : metadata(metadata_), ref_count(1) {}
std::shared_ptr<S3QueueFilesMetadata> metadata;
size_t ref_count = 0;
};
using MetadataByPath = std::unordered_map<std::string, Metadata>;
MetadataByPath metadata_by_path;
std::mutex mutex;
};
}

View File

@ -1,8 +1,8 @@
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Common/Exception.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTSetQuery.h>
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Common/Exception.h>
namespace DB

View File

@ -19,17 +19,16 @@ class ASTStorage;
0) \
M(S3QueueAction, after_processing, S3QueueAction::KEEP, "Delete or keep file in S3 after successful processing", 0) \
M(String, keeper_path, "", "Zookeeper node path", 0) \
M(UInt64, s3queue_loading_retries, 0, "Retry loading up to specified number of times", 0) \
M(UInt64, s3queue_polling_min_timeout_ms, 1000, "Minimal timeout before next polling", 0) \
M(UInt64, s3queue_polling_max_timeout_ms, 10000, "Maximum timeout before next polling", 0) \
M(UInt64, s3queue_polling_backoff_ms, 0, "Polling backoff", 0) \
M(UInt64, s3queue_tracked_files_limit, 1000, "Max set size for tracking processed files in unordered mode in ZooKeeper", 0) \
M(UInt64, \
s3queue_tracked_file_ttl_sec, \
0, \
"Maximum number of seconds to store processed files in ZooKeeper node (store forever by default)", \
0) \
M(UInt64, s3queue_polling_size, 50, "Maximum files to fetch from S3 with SELECT", 0)
M(UInt32, s3queue_loading_retries, 0, "Retry loading up to specified number of times", 0) \
M(UInt32, s3queue_processing_threads_num, 1, "Number of processing threads", 0) \
M(UInt32, s3queue_enable_logging_to_s3queue_log, 1, "Enable logging to system table system.s3queue_log", 0) \
M(UInt32, s3queue_tracked_file_ttl_sec, 0, "Maximum number of seconds to store processed files in ZooKeeper node (store forever by default)", 0) \
M(UInt32, s3queue_polling_min_timeout_ms, 1000, "Minimal timeout before next polling", 0) \
M(UInt32, s3queue_polling_max_timeout_ms, 10000, "Maximum timeout before next polling", 0) \
M(UInt32, s3queue_polling_backoff_ms, 1000, "Polling backoff", 0) \
M(UInt32, s3queue_tracked_files_limit, 1000, "For unordered mode. Max set size for tracking processed files in ZooKeeper", 0) \
M(UInt32, s3queue_cleanup_interval_min_ms, 60000, "For unordered mode. Polling backoff min for cleanup", 0) \
M(UInt32, s3queue_cleanup_interval_max_ms, 60000, "For unordered mode. Polling backoff max for cleanup", 0) \
#define LIST_OF_S3QUEUE_SETTINGS(M, ALIAS) \
S3QUEUE_RELATED_SETTINGS(M, ALIAS) \

View File

@ -1,59 +1,24 @@
#include <algorithm>
#include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include "IO/ParallelReadBuffer.h"
#include "Parsers/ASTCreateQuery.h"
#include "config.h"
#if USE_AWS_S3
# include <Common/isValidUTF8.h>
# include <Functions/FunctionsConversion.h>
# include <IO/S3/Requests.h>
# include <IO/S3Common.h>
# include <Interpreters/TreeRewriter.h>
# include <Parsers/ASTFunction.h>
# include <Parsers/ASTInsertQuery.h>
# include <Storages/NamedCollectionsHelpers.h>
# include <Storages/PartitionedSink.h>
# include <Storages/S3Queue/S3QueueSource.h>
# include <Storages/StorageS3.h>
# include <Storages/StorageS3Settings.h>
# include <Storages/VirtualColumnUtils.h>
# include <Formats/FormatFactory.h>
# include <Processors/Formats/IInputFormat.h>
# include <Processors/Formats/IOutputFormat.h>
# include <Processors/Transforms/AddingDefaultsTransform.h>
# include <QueryPipeline/QueryPipelineBuilder.h>
# include <DataTypes/DataTypeString.h>
# include <Common/CurrentMetrics.h>
# include <Common/NamedCollections/NamedCollections.h>
# include <Common/parseGlobs.h>
# include <Processors/ISource.h>
# include <Processors/Sinks/SinkToStorage.h>
#include <Common/ProfileEvents.h>
#include <Common/CurrentMetrics.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/logger_useful.h>
#include <Common/getRandomASCIIString.h>
#include <Storages/S3Queue/S3QueueSource.h>
#include <Storages/VirtualColumnUtils.h>
namespace CurrentMetrics
{
extern const Metric StorageS3Threads;
extern const Metric StorageS3ThreadsActive;
extern const Metric StorageS3Threads;
extern const Metric StorageS3ThreadsActive;
}
namespace ProfileEvents
{
extern const Event S3DeleteObjects;
extern const Event S3ListObjects;
extern const Event S3QueuePullMicroseconds;
}
namespace DB
@ -62,148 +27,83 @@ namespace DB
namespace ErrorCodes
{
extern const int S3_ERROR;
extern const int NOT_IMPLEMENTED;
}
StorageS3QueueSource::QueueGlobIterator::QueueGlobIterator(
const S3::Client & client_,
const S3::URI & globbed_uri_,
ASTPtr query,
const NamesAndTypesList & virtual_columns,
ContextPtr context,
UInt64 & max_poll_size_,
const S3Settings::RequestSettings & request_settings_)
: max_poll_size(max_poll_size_)
, glob_iterator(std::make_unique<StorageS3QueueSource::DisclosedGlobIterator>(
client_, globbed_uri_, query, virtual_columns, context, nullptr, request_settings_))
StorageS3QueueSource::S3QueueKeyWithInfo::S3QueueKeyWithInfo(
const std::string & key_,
std::optional<S3::ObjectInfo> info_,
Metadata::ProcessingNodeHolderPtr processing_holder_,
FileStatusPtr file_status_)
: StorageS3Source::KeyWithInfo(key_, info_)
, processing_holder(processing_holder_)
, file_status(file_status_)
{
/// todo(kssenii): remove this loop, it should not be here
while (true)
{
KeyWithInfo val = glob_iterator->next();
if (val.key.empty())
break;
keys_buf.push_back(val);
}
}
Strings StorageS3QueueSource::QueueGlobIterator::filterProcessingFiles(
const S3QueueMode & engine_mode, std::unordered_set<String> & exclude_keys, const String & max_file)
StorageS3QueueSource::FileIterator::FileIterator(
std::shared_ptr<S3QueueFilesMetadata> metadata_,
std::unique_ptr<GlobIterator> glob_iterator_,
std::atomic<bool> & shutdown_called_)
: metadata(metadata_)
, glob_iterator(std::move(glob_iterator_))
, shutdown_called(shutdown_called_)
{
for (const KeyWithInfo & val : keys_buf)
{
auto full_path = val.key;
if (exclude_keys.find(full_path) != exclude_keys.end())
{
LOG_TEST(log, "File {} will be skipped, because it was found in exclude files list "
"(either already processed or failed to be processed)", val.key);
continue;
}
if ((engine_mode == S3QueueMode::ORDERED) && (full_path.compare(max_file) <= 0))
continue;
if ((processing_keys.size() < max_poll_size) || (engine_mode == S3QueueMode::ORDERED))
{
processing_keys.push_back(val);
}
else
{
break;
}
}
if (engine_mode == S3QueueMode::ORDERED)
{
std::sort(
processing_keys.begin(),
processing_keys.end(),
[](const KeyWithInfo & lhs, const KeyWithInfo & rhs) { return lhs.key.compare(rhs.key) < 0; });
if (processing_keys.size() > max_poll_size)
{
processing_keys.erase(processing_keys.begin() + max_poll_size, processing_keys.end());
}
}
Strings keys;
for (const auto & key_info : processing_keys)
keys.push_back(key_info.key);
processing_keys.push_back(KeyWithInfo());
processing_iterator = processing_keys.begin();
return keys;
}
StorageS3QueueSource::KeyWithInfo StorageS3QueueSource::QueueGlobIterator::next()
StorageS3QueueSource::KeyWithInfoPtr StorageS3QueueSource::FileIterator::next()
{
std::lock_guard lock(mutex);
if (processing_iterator != processing_keys.end())
while (!shutdown_called)
{
return *processing_iterator++;
}
KeyWithInfoPtr val = glob_iterator->next();
return KeyWithInfo();
if (!val || shutdown_called)
return {};
if (auto [processing_holder, processing_file_status] = metadata->trySetFileAsProcessing(val->key);
processing_holder && !shutdown_called)
{
return std::make_shared<S3QueueKeyWithInfo>(val->key, val->info, processing_holder, processing_file_status);
}
}
return {};
}
size_t StorageS3QueueSource::QueueGlobIterator::estimatedKeysCount()
size_t StorageS3QueueSource::FileIterator::estimatedKeysCount()
{
return keys_buf.size();
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method estimatedKeysCount is not implemented");
}
StorageS3QueueSource::StorageS3QueueSource(
const ReadFromFormatInfo & info,
const String & format_,
String name_,
ContextPtr context_,
std::optional<FormatSettings> format_settings_,
UInt64 max_block_size_,
const S3Settings::RequestSettings & request_settings_,
String compression_hint_,
const std::shared_ptr<const S3::Client> & client_,
const String & bucket_,
const String & version_id_,
const String & url_host_and_port,
std::shared_ptr<IIterator> file_iterator_,
const Block & header_,
std::unique_ptr<StorageS3Source> internal_source_,
std::shared_ptr<S3QueueFilesMetadata> files_metadata_,
const S3QueueAction & action_,
const size_t download_thread_num_)
: ISource(info.source_header)
RemoveFileFunc remove_file_func_,
const NamesAndTypesList & requested_virtual_columns_,
ContextPtr context_,
const std::atomic<bool> & shutdown_called_,
std::shared_ptr<S3QueueLog> s3_queue_log_,
const StorageID & storage_id_)
: ISource(header_)
, WithContext(context_)
, name(std::move(name_))
, bucket(bucket_)
, version_id(version_id_)
, format(format_)
, columns_desc(info.columns_description)
, request_settings(request_settings_)
, client(client_)
, files_metadata(files_metadata_)
, requested_virtual_columns(info.requested_virtual_columns)
, requested_columns(info.requested_columns)
, file_iterator(file_iterator_)
, action(action_)
, files_metadata(files_metadata_)
, internal_source(std::move(internal_source_))
, requested_virtual_columns(requested_virtual_columns_)
, shutdown_called(shutdown_called_)
, s3_queue_log(s3_queue_log_)
, storage_id(storage_id_)
, remove_file_func(remove_file_func_)
, log(&Poco::Logger::get("StorageS3QueueSource"))
{
internal_source = std::make_shared<StorageS3Source>(
info,
format_,
name_,
context_,
format_settings_,
max_block_size_,
request_settings_,
compression_hint_,
client_,
bucket_,
version_id_,
url_host_and_port,
file_iterator,
download_thread_num_,
false,
/* query_info */ std::nullopt);
reader = std::move(internal_source->reader);
if (reader)
{
reader_future = std::move(internal_source->reader_future);
}
}
StorageS3QueueSource::~StorageS3QueueSource()
@ -218,61 +118,87 @@ String StorageS3QueueSource::getName() const
Chunk StorageS3QueueSource::generate()
{
auto file_progress = getContext()->getFileProgressCallback();
while (true)
{
if (isCancelled() || !reader)
if (!reader)
break;
if (isCancelled())
{
if (reader)
reader->cancel();
reader->cancel();
break;
}
Chunk chunk;
bool success_in_pulling = false;
if (shutdown_called)
{
if (processed_rows_from_file)
{
/// We could delay the shutdown until the files which already started processing before the shutdown are finished.
/// But if the files are big and `s3queue_processing_threads_num` is not small, it can take a significant time.
/// Anyway we cannot do anything in case of SIGTERM, so the destination table must support deduplication,
/// and here we rely on it as well.
LOG_WARNING(
log, "Shutdown called, {} rows are already processed, but file is not fully processed",
processed_rows_from_file);
}
break;
}
const auto * key_with_info = dynamic_cast<const S3QueueKeyWithInfo *>(&reader.getKeyWithInfo());
auto file_status = key_with_info->file_status;
auto * prev_scope = CurrentThread::get().attachProfileCountersScope(&file_status->profile_counters);
SCOPE_EXIT({ CurrentThread::get().attachProfileCountersScope(prev_scope); });
/// FIXME: if files are compressed, profile counters update does not work fully (s3 related counters are not saved). Why?
try
{
auto timer = DB::CurrentThread::getProfileEvents().timer(ProfileEvents::S3QueuePullMicroseconds);
Chunk chunk;
if (reader->pull(chunk))
{
UInt64 num_rows = chunk.getNumRows();
auto file_path = reader.getPath();
LOG_TEST(log, "Read {} rows from file: {}", chunk.getNumRows(), reader.getPath());
for (const auto & virtual_column : requested_virtual_columns)
{
if (virtual_column.name == "_path")
{
chunk.addColumn(virtual_column.type->createColumnConst(num_rows, file_path)->convertToFullColumnIfConst());
}
else if (virtual_column.name == "_file")
{
size_t last_slash_pos = file_path.find_last_of('/');
auto column = virtual_column.type->createColumnConst(num_rows, file_path.substr(last_slash_pos + 1));
chunk.addColumn(column->convertToFullColumnIfConst());
}
}
success_in_pulling = true;
file_status->processed_rows += chunk.getNumRows();
processed_rows_from_file += chunk.getNumRows();
VirtualColumnUtils::addRequestedPathAndFileVirtualsToChunk(chunk, requested_virtual_columns, reader.getPath());
return chunk;
}
}
catch (const Exception & e)
catch (...)
{
LOG_ERROR(log, "Exception in chunk pulling: {} ", e.displayText());
files_metadata->setFileFailed(reader.getFile(), e.message());
success_in_pulling = false;
}
if (success_in_pulling)
{
applyActionAfterProcessing(reader.getFile());
files_metadata->setFileProcessed(reader.getFile());
return chunk;
const auto message = getCurrentExceptionMessage(true);
LOG_ERROR(log, "Got an error while pulling chunk. Will set file {} as failed. Error: {} ", reader.getFile(), message);
files_metadata->setFileFailed(key_with_info->processing_holder, message);
appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, false);
throw;
}
files_metadata->setFileProcessed(key_with_info->processing_holder);
applyActionAfterProcessing(reader.getFile());
assert(reader_future.valid());
appendLogElement(reader.getFile(), *file_status, processed_rows_from_file, true);
file_status.reset();
processed_rows_from_file = 0;
if (shutdown_called)
{
LOG_INFO(log, "Shutdown was called, stopping sync");
break;
}
chassert(reader_future.valid());
reader = reader_future.get();
if (!reader)
break;
file_status = files_metadata->getFileStatus(reader.getFile());
/// Even if the task is finished, the thread may not be freed in the pool yet.
/// So wait until it is freed before scheduling a new task.
internal_source->create_reader_pool.wait();
@ -282,35 +208,42 @@ Chunk StorageS3QueueSource::generate()
return {};
}
void StorageS3QueueSource::applyActionAfterProcessing(const String & file_path)
void StorageS3QueueSource::applyActionAfterProcessing(const String & path)
{
switch (action)
{
case S3QueueAction::DELETE:
deleteProcessedObject(file_path);
{
assert(remove_file_func);
remove_file_func(path);
break;
}
case S3QueueAction::KEEP:
break;
}
}
void StorageS3QueueSource::deleteProcessedObject(const String & file_path)
void StorageS3QueueSource::appendLogElement(const std::string & filename, S3QueueFilesMetadata::FileStatus & file_status_, size_t processed_rows, bool processed)
{
LOG_INFO(log, "Delete processed file {} from bucket {}", file_path, bucket);
if (!s3_queue_log)
return;
S3::DeleteObjectRequest request;
request.WithKey(file_path).WithBucket(bucket);
auto outcome = client->DeleteObject(request);
if (!outcome.IsSuccess())
S3QueueLogElement elem{};
{
const auto & err = outcome.GetError();
LOG_ERROR(log, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
}
else
{
LOG_TRACE(log, "Object with path {} was removed from S3", file_path);
std::lock_guard lock(file_status_.metadata_lock);
elem = S3QueueLogElement
{
.event_time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()),
.file_name = filename,
.rows_processed = processed_rows,
.status = processed ? S3QueueLogElement::S3QueueStatus::Processed : S3QueueLogElement::S3QueueStatus::Failed,
.counters_snapshot = file_status_.profile_counters.getPartiallyAtomicSnapshot(),
.processing_start_time = file_status_.processing_start_time,
.processing_end_time = file_status_.processing_end_time,
.exception = file_status_.last_exception,
};
}
s3_queue_log->add(std::move(elem));
}
}

View File

@ -2,125 +2,101 @@
#include "config.h"
#if USE_AWS_S3
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Processors/ISource.h>
#include <Storages/S3Queue/S3QueueFilesMetadata.h>
#include <Storages/StorageS3.h>
#include <Interpreters/S3QueueLog.h>
# include <Core/Types.h>
# include <Compression/CompressionInfo.h>
# include <Storages/IStorage.h>
# include <Storages/S3Queue/S3QueueFilesMetadata.h>
# include <Storages/StorageS3.h>
# include <Storages/StorageS3Settings.h>
# include <Storages/prepareReadingFromFormat.h>
# include <IO/CompressionMethod.h>
# include <IO/S3/getObjectInfo.h>
# include <Interpreters/Context.h>
# include <Interpreters/threadPoolCallbackRunner.h>
# include <Processors/Executors/PullingPipelineExecutor.h>
# include <Processors/ISource.h>
# include <Storages/Cache/SchemaCache.h>
# include <Storages/StorageConfiguration.h>
# include <Poco/URI.h>
# include <Common/ZooKeeper/ZooKeeper.h>
# include <Common/logger_useful.h>
namespace Poco { class Logger; }
namespace DB
{
class StorageS3QueueSource : public ISource, WithContext
{
public:
using IIterator = StorageS3Source::IIterator;
using DisclosedGlobIterator = StorageS3Source::DisclosedGlobIterator;
using KeysWithInfo = StorageS3Source::KeysWithInfo;
using KeyWithInfo = StorageS3Source::KeyWithInfo;
class QueueGlobIterator : public IIterator
using KeyWithInfoPtr = StorageS3Source::KeyWithInfoPtr;
using GlobIterator = StorageS3Source::DisclosedGlobIterator;
using ZooKeeperGetter = std::function<zkutil::ZooKeeperPtr()>;
using RemoveFileFunc = std::function<void(std::string)>;
using FileStatusPtr = S3QueueFilesMetadata::FileStatusPtr;
using Metadata = S3QueueFilesMetadata;
struct S3QueueKeyWithInfo : public StorageS3Source::KeyWithInfo
{
S3QueueKeyWithInfo(
const std::string & key_,
std::optional<S3::ObjectInfo> info_,
Metadata::ProcessingNodeHolderPtr processing_holder_,
FileStatusPtr file_status_);
Metadata::ProcessingNodeHolderPtr processing_holder;
FileStatusPtr file_status;
};
class FileIterator : public IIterator
{
public:
QueueGlobIterator(
const S3::Client & client_,
const S3::URI & globbed_uri_,
ASTPtr query,
const NamesAndTypesList & virtual_columns,
ContextPtr context,
UInt64 & max_poll_size_,
const S3Settings::RequestSettings & request_settings_ = {});
FileIterator(std::shared_ptr<S3QueueFilesMetadata> metadata_, std::unique_ptr<GlobIterator> glob_iterator_, std::atomic<bool> & shutdown_called_);
KeyWithInfo next() override;
Strings
filterProcessingFiles(const S3QueueMode & engine_mode, std::unordered_set<String> & exclude_keys, const String & max_file = "");
/// Note:
/// List results in S3 are always returned in UTF-8 binary order.
/// (https://docs.aws.amazon.com/AmazonS3/latest/userguide/ListingKeysUsingAPIs.html)
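/// For example, in this order "file_10" sorts before "file_2" (byte '1' < byte '2'),
/// which is what the string comparison of keys in Ordered mode assumes.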
KeyWithInfoPtr next() override;
size_t estimatedKeysCount() override;
private:
UInt64 max_poll_size;
KeysWithInfo keys_buf;
KeysWithInfo processing_keys;
mutable std::mutex mutex;
std::unique_ptr<DisclosedGlobIterator> glob_iterator;
std::vector<KeyWithInfo>::iterator processing_iterator;
Poco::Logger * log = &Poco::Logger::get("StorageS3QueueSourceIterator");
const std::shared_ptr<S3QueueFilesMetadata> metadata;
const std::unique_ptr<GlobIterator> glob_iterator;
std::atomic<bool> & shutdown_called;
std::mutex mutex;
};
static Block getHeader(Block sample_block, const std::vector<NameAndTypePair> & requested_virtual_columns);
StorageS3QueueSource(
const ReadFromFormatInfo & info,
const String & format,
String name_,
ContextPtr context_,
std::optional<FormatSettings> format_settings_,
UInt64 max_block_size_,
const S3Settings::RequestSettings & request_settings_,
String compression_hint_,
const std::shared_ptr<const S3::Client> & client_,
const String & bucket,
const String & version_id,
const String & url_host_and_port,
std::shared_ptr<IIterator> file_iterator_,
const Block & header_,
std::unique_ptr<StorageS3Source> internal_source_,
std::shared_ptr<S3QueueFilesMetadata> files_metadata_,
const S3QueueAction & action_,
size_t download_thread_num);
RemoveFileFunc remove_file_func_,
const NamesAndTypesList & requested_virtual_columns_,
ContextPtr context_,
const std::atomic<bool> & shutdown_called_,
std::shared_ptr<S3QueueLog> s3_queue_log_,
const StorageID & storage_id_);
~StorageS3QueueSource() override;
static Block getHeader(Block sample_block, const std::vector<NameAndTypePair> & requested_virtual_columns);
String getName() const override;
Chunk generate() override;
private:
String name;
String bucket;
String version_id;
String format;
ColumnsDescription columns_desc;
S3Settings::RequestSettings request_settings;
std::shared_ptr<const S3::Client> client;
const String name;
const S3QueueAction action;
const std::shared_ptr<S3QueueFilesMetadata> files_metadata;
const std::shared_ptr<StorageS3Source> internal_source;
const NamesAndTypesList requested_virtual_columns;
const std::atomic<bool> & shutdown_called;
const std::shared_ptr<S3QueueLog> s3_queue_log;
const StorageID storage_id;
RemoveFileFunc remove_file_func;
Poco::Logger * log;
std::shared_ptr<S3QueueFilesMetadata> files_metadata;
using ReaderHolder = StorageS3Source::ReaderHolder;
ReaderHolder reader;
NamesAndTypesList requested_virtual_columns;
NamesAndTypesList requested_columns;
std::shared_ptr<IIterator> file_iterator;
const S3QueueAction action;
Poco::Logger * log = &Poco::Logger::get("StorageS3QueueSource");
std::future<ReaderHolder> reader_future;
size_t processed_rows_from_file = 0;
mutable std::mutex mutex;
std::shared_ptr<StorageS3Source> internal_source;
void deleteProcessedObject(const String & file_path);
void applyActionAfterProcessing(const String & file_path);
void applyActionAfterProcessing(const String & path);
void appendLogElement(const std::string & filename, S3QueueFilesMetadata::FileStatus & file_status_, size_t processed_rows, bool processed);
};
}

View File

@ -2,12 +2,12 @@
#if USE_AWS_S3
# include <Poco/JSON/JSON.h>
# include <Poco/JSON/Object.h>
# include <Poco/JSON/Parser.h>
# include <Storages/S3Queue/S3QueueSettings.h>
# include <Storages/S3Queue/S3QueueTableMetadata.h>
# include <Storages/StorageS3.h>
#include <Poco/JSON/JSON.h>
#include <Poco/JSON/Object.h>
#include <Poco/JSON/Parser.h>
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Storages/S3Queue/S3QueueTableMetadata.h>
#include <Storages/StorageS3.h>
namespace DB
@ -18,13 +18,17 @@ namespace ErrorCodes
extern const int METADATA_MISMATCH;
}
S3QueueTableMetadata::S3QueueTableMetadata(const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings)
S3QueueTableMetadata::S3QueueTableMetadata(
const StorageS3::Configuration & configuration,
const S3QueueSettings & engine_settings,
const StorageInMemoryMetadata & storage_metadata)
{
format_name = configuration.format;
after_processing = engine_settings.after_processing.toString();
mode = engine_settings.mode.toString();
s3queue_tracked_files_limit = engine_settings.s3queue_tracked_files_limit;
s3queue_tracked_file_ttl_sec = engine_settings.s3queue_tracked_file_ttl_sec;
columns = storage_metadata.getColumns().toString();
}
@ -36,6 +40,7 @@ String S3QueueTableMetadata::toString() const
json.set("s3queue_tracked_files_limit", s3queue_tracked_files_limit);
json.set("s3queue_tracked_file_ttl_sec", s3queue_tracked_file_ttl_sec);
json.set("format_name", format_name);
json.set("columns", columns);
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
oss.exceptions(std::ios::failbit);
@ -52,6 +57,7 @@ void S3QueueTableMetadata::read(const String & metadata_str)
s3queue_tracked_files_limit = json->getValue<UInt64>("s3queue_tracked_files_limit");
s3queue_tracked_file_ttl_sec = json->getValue<UInt64>("s3queue_tracked_file_ttl_sec");
format_name = json->getValue<String>("format_name");
columns = json->getValue<String>("columns");
}
S3QueueTableMetadata S3QueueTableMetadata::parse(const String & metadata_str)

View File

@ -2,9 +2,9 @@
#if USE_AWS_S3
# include <Storages/S3Queue/S3QueueSettings.h>
# include <Storages/StorageS3.h>
# include <base/types.h>
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Storages/StorageS3.h>
#include <base/types.h>
namespace DB
{
@ -18,13 +18,14 @@ class ReadBuffer;
struct S3QueueTableMetadata
{
String format_name;
String columns;
String after_processing;
String mode;
UInt64 s3queue_tracked_files_limit;
UInt64 s3queue_tracked_file_ttl_sec;
S3QueueTableMetadata() = default;
S3QueueTableMetadata(const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings);
S3QueueTableMetadata(const StorageS3::Configuration & configuration, const S3QueueSettings & engine_settings, const StorageInMemoryMetadata & storage_metadata);
void read(const String & metadata_str);
static S3QueueTableMetadata parse(const String & metadata_str);

View File

@ -1,83 +1,102 @@
#include "config.h"
#if USE_AWS_S3
#include <Common/ProfileEvents.h>
#include <IO/S3Common.h>
#include <IO/CompressionMethod.h>
#include <Formats/FormatFactory.h>
#include <Interpreters/InterpreterInsertQuery.h>
#include <Processors/Executors/CompletedPipelineExecutor.h>
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Processors/ISource.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTInsertQuery.h>
#include <Storages/S3Queue/S3QueueTableMetadata.h>
#include <Storages/S3Queue/StorageS3Queue.h>
#include <Storages/S3Queue/S3QueueFilesMetadata.h>
#include <Storages/S3Queue/S3QueueMetadataFactory.h>
#include <Storages/StorageFactory.h>
#include <Storages/StorageMaterializedView.h>
#include <Storages/StorageSnapshot.h>
#include <Storages/VirtualColumnUtils.h>
#include <Storages/prepareReadingFromFormat.h>
#include <filesystem>
# include <Databases/DatabaseReplicated.h>
# include <IO/WriteBuffer.h>
# include <IO/WriteHelpers.h>
# include <Interpreters/InterpreterInsertQuery.h>
# include <Processors/Executors/CompletedPipelineExecutor.h>
# include <Common/ProfileEvents.h>
# include <Common/ZooKeeper/ZooKeeper.h>
# include <Common/isValidUTF8.h>
# include "IO/ParallelReadBuffer.h"
# include <Functions/FunctionsConversion.h>
# include <IO/S3Common.h>
# include <Interpreters/TreeRewriter.h>
# include <Parsers/ASTFunction.h>
# include <Parsers/ASTInsertQuery.h>
# include <Storages/NamedCollectionsHelpers.h>
# include <Storages/PartitionedSink.h>
# include <Storages/S3Queue/S3QueueSource.h>
# include <Storages/S3Queue/S3QueueTableMetadata.h>
# include <Storages/S3Queue/StorageS3Queue.h>
# include <Storages/StorageFactory.h>
# include <Storages/StorageMaterializedView.h>
# include <Storages/StorageS3.h>
# include <Storages/StorageSnapshot.h>
# include <Storages/VirtualColumnUtils.h>
# include <Storages/prepareReadingFromFormat.h>
# include <Common/NamedCollections/NamedCollections.h>
# include <Formats/FormatFactory.h>
# include <Processors/Formats/IInputFormat.h>
# include <Processors/Formats/IOutputFormat.h>
# include <Processors/Transforms/AddingDefaultsTransform.h>
# include <QueryPipeline/QueryPipelineBuilder.h>
# include <DataTypes/DataTypeString.h>
# include <Common/parseGlobs.h>
# include <filesystem>
# include <Processors/ISource.h>
# include <Processors/Sinks/SinkToStorage.h>
# include <QueryPipeline/Pipe.h>
namespace fs = std::filesystem;
namespace ProfileEvents
{
extern const Event S3DeleteObjects;
extern const Event S3ListObjects;
extern const Event S3DeleteObjects;
extern const Event S3ListObjects;
}
namespace DB
{
static const String PARTITION_ID_WILDCARD = "{_partition_id}";
static const auto MAX_THREAD_WORK_DURATION_MS = 60000;
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int BAD_ARGUMENTS;
extern const int S3_ERROR;
extern const int NOT_IMPLEMENTED;
extern const int QUERY_NOT_ALLOWED;
extern const int REPLICA_ALREADY_EXISTS;
extern const int INCOMPATIBLE_COLUMNS;
}
namespace
{
bool containsGlobs(const S3::URI & url)
{
return url.key.find_first_of("*?{") != std::string::npos;
}
std::string chooseZooKeeperPath(const StorageID & table_id, const Settings & settings, const S3QueueSettings & s3queue_settings)
{
std::string zk_path_prefix = settings.s3queue_default_zookeeper_path.value;
if (zk_path_prefix.empty())
zk_path_prefix = "/";
std::string result_zk_path;
if (s3queue_settings.keeper_path.changed)
{
/// We do not add table uuid here on purpose.
result_zk_path = fs::path(zk_path_prefix) / s3queue_settings.keeper_path.value;
}
else
{
auto database_uuid = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getUUID();
result_zk_path = fs::path(zk_path_prefix) / toString(database_uuid) / toString(table_id.uuid);
}
return zkutil::extractZooKeeperPath(result_zk_path, true);
}
void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, Poco::Logger * log)
{
if (s3queue_settings.mode == S3QueueMode::ORDERED && s3queue_settings.s3queue_processing_threads_num > 1)
{
LOG_WARNING(log, "Parallel processing is not yet supported for Ordered mode");
s3queue_settings.s3queue_processing_threads_num = 1;
}
if (!s3queue_settings.s3queue_processing_threads_num)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero");
}
if (!s3queue_settings.s3queue_enable_logging_to_s3queue_log.changed)
{
s3queue_settings.s3queue_enable_logging_to_s3queue_log = settings.s3queue_enable_logging_to_s3queue_log;
}
if (s3queue_settings.s3queue_cleanup_interval_min_ms > s3queue_settings.s3queue_cleanup_interval_max_ms)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Setting `s3queue_cleanup_interval_min_ms` ({}) must be less or equal to `s3queue_cleanup_interval_max_ms` ({})",
s3queue_settings.s3queue_cleanup_interval_min_ms, s3queue_settings.s3queue_cleanup_interval_max_ms);
}
}
}
StorageS3Queue::StorageS3Queue(
std::unique_ptr<S3QueueSettings> s3queue_settings_,
@ -87,79 +106,80 @@ StorageS3Queue::StorageS3Queue(
const ConstraintsDescription & constraints_,
const String & comment,
ContextPtr context_,
std::optional<FormatSettings> format_settings_,
ASTPtr partition_by_)
std::optional<FormatSettings> format_settings_)
: IStorage(table_id_)
, WithContext(context_)
, s3queue_settings(std::move(s3queue_settings_))
, zk_path(chooseZooKeeperPath(table_id_, context_->getSettingsRef(), *s3queue_settings))
, after_processing(s3queue_settings->after_processing)
, files_metadata(S3QueueMetadataFactory::instance().getOrCreate(zk_path, *s3queue_settings))
, configuration{configuration_}
, reschedule_processing_interval_ms(s3queue_settings->s3queue_polling_min_timeout_ms)
, format_settings(format_settings_)
, partition_by(partition_by_)
, reschedule_processing_interval_ms(s3queue_settings->s3queue_polling_min_timeout_ms)
, log(&Poco::Logger::get("StorageS3Queue (" + table_id_.table_name + ")"))
{
if (configuration.url.key.ends_with('/'))
{
configuration.url.key += '*';
if (!withGlobs())
}
else if (!containsGlobs(configuration.url))
{
throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs");
std::string zk_path_prefix = getContext()->getSettingsRef().s3queue_default_zookeeper_path.value;
if (zk_path_prefix.empty())
zk_path_prefix = "/";
std::string result_zk_path;
if (s3queue_settings->keeper_path.changed)
{
/// We do not add table uuid here on purpose.
result_zk_path = fs::path(zk_path_prefix) / s3queue_settings->keeper_path.value;
}
else
{
auto database_uuid = DatabaseCatalog::instance().getDatabase(table_id_.database_name)->getUUID();
result_zk_path = fs::path(zk_path_prefix) / toString(database_uuid) / toString(table_id_.uuid);
}
zk_path = zkutil::extractZooKeeperPath(result_zk_path, true/* check_starts_with_slash */, log);
LOG_INFO(log, "Using zookeeper path: {}", zk_path);
checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), log);
FormatFactory::instance().checkFormatName(configuration.format);
context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri);
StorageInMemoryMetadata storage_metadata;
configuration.update(context_);
FormatFactory::instance().checkFormatName(configuration.format);
context_->getRemoteHostFilter().checkURL(configuration.url.uri);
StorageInMemoryMetadata storage_metadata;
if (columns_.empty())
{
auto columns = StorageS3::getTableStructureFromDataImpl(configuration, format_settings, context_);
storage_metadata.setColumns(columns);
}
else
{
storage_metadata.setColumns(columns_);
}
storage_metadata.setConstraints(constraints_);
storage_metadata.setComment(comment);
createOrCheckMetadata(storage_metadata);
setInMemoryMetadata(storage_metadata);
auto metadata_snapshot = getInMemoryMetadataPtr();
const bool is_first_replica = createTableIfNotExists(metadata_snapshot);
if (!is_first_replica)
{
checkTableStructure(zk_path, metadata_snapshot);
}
files_metadata = std::make_shared<S3QueueFilesMetadata>(this, *s3queue_settings);
virtual_columns = VirtualColumnUtils::getPathAndFileVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList());
task = getContext()->getSchedulePool().createTask("S3QueueStreamingTask", [this] { threadFunc(); });
auto poll_thread = getContext()->getSchedulePool().createTask("S3QueueStreamingTask", [this] { threadFunc(); });
task = std::make_shared<TaskContext>(std::move(poll_thread));
LOG_INFO(log, "Using zookeeper path: {}", zk_path.string());
}
bool StorageS3Queue::supportsSubcolumns() const
void StorageS3Queue::startup()
{
return true;
if (task)
task->activateAndSchedule();
}
void StorageS3Queue::shutdown()
{
shutdown_called = true;
if (task)
{
task->deactivate();
}
if (files_metadata)
{
files_metadata->deactivateCleanupTask();
files_metadata.reset();
}
}
void StorageS3Queue::drop()
{
S3QueueMetadataFactory::instance().remove(zk_path);
}
bool StorageS3Queue::supportsSubsetOfColumns(const ContextPtr & context_) const
@ -174,83 +194,70 @@ Pipe StorageS3Queue::read(
ContextPtr local_context,
QueryProcessingStage::Enum /*processed_stage*/,
size_t max_block_size,
size_t /* num_streams */)
size_t num_streams)
{
if (!local_context->getSettingsRef().stream_like_engine_allow_direct_select)
throw Exception(
ErrorCodes::QUERY_NOT_ALLOWED, "Direct select is not allowed. To enable use setting `stream_like_engine_allow_direct_select`");
{
throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Direct select is not allowed. "
"To enable use setting `stream_like_engine_allow_direct_select`");
}
if (mv_attached)
throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "Cannot read from StorageS3Queue with attached materialized views");
{
throw Exception(ErrorCodes::QUERY_NOT_ALLOWED,
"Cannot read from {} with attached materialized views", getName());
}
auto query_configuration = updateConfigurationAndGetCopy(local_context);
Pipes pipes;
const size_t adjusted_num_streams = std::min<size_t>(num_streams, s3queue_settings->s3queue_processing_threads_num);
std::shared_ptr<StorageS3Source::IIterator> iterator_wrapper = createFileIterator(local_context, query_info.query);
auto file_iterator = createFileIterator(local_context, query_info.query);
for (size_t i = 0; i < adjusted_num_streams; ++i)
pipes.emplace_back(createSource(file_iterator, column_names, storage_snapshot, max_block_size, local_context));
return Pipe::unitePipes(std::move(pipes));
}
std::shared_ptr<StorageS3QueueSource> StorageS3Queue::createSource(
std::shared_ptr<StorageS3Queue::FileIterator> file_iterator,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
size_t max_block_size,
ContextPtr local_context)
{
auto configuration_snapshot = updateConfigurationAndGetCopy(local_context);
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(local_context), getVirtuals());
const size_t max_download_threads = local_context->getSettingsRef().max_download_threads;
return Pipe(std::make_shared<StorageS3QueueSource>(
read_from_format_info,
configuration.format,
getName(),
local_context,
format_settings,
auto internal_source = std::make_unique<StorageS3Source>(
read_from_format_info, configuration.format, getName(), local_context, format_settings,
max_block_size,
query_configuration.request_settings,
configuration.compression_method,
query_configuration.client,
query_configuration.url.bucket,
query_configuration.url.version_id,
query_configuration.url.uri.getHost() + std::to_string(query_configuration.url.uri.getPort()),
iterator_wrapper,
files_metadata,
after_processing,
max_download_threads));
}
configuration_snapshot.request_settings,
configuration_snapshot.compression_method,
configuration_snapshot.client,
configuration_snapshot.url.bucket,
configuration_snapshot.url.version_id,
configuration_snapshot.url.uri.getHost() + std::to_string(configuration_snapshot.url.uri.getPort()),
file_iterator, local_context->getSettingsRef().max_download_threads, false, /* query_info */ std::nullopt);
SinkToStoragePtr StorageS3Queue::write(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, bool)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Write is not supported by storage {}", getName());
}
void StorageS3Queue::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr, TableExclusiveLockHolder &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Truncate is not supported by storage {}", getName());
}
NamesAndTypesList StorageS3Queue::getVirtuals() const
{
return virtual_columns;
}
bool StorageS3Queue::supportsPartitionBy() const
{
return true;
}
void StorageS3Queue::startup()
{
if (task)
task->holder->activateAndSchedule();
}
void StorageS3Queue::shutdown()
{
shutdown_called = true;
if (task)
auto file_deleter = [this, bucket = configuration_snapshot.url.bucket, client = configuration_snapshot.client](const std::string & path)
{
task->stream_cancelled = true;
task->holder->deactivate();
}
}
size_t StorageS3Queue::getTableDependentCount() const
{
auto table_id = getStorageID();
// Check if at least one direct dependency is attached
return DatabaseCatalog::instance().getDependentViews(table_id).size();
S3::DeleteObjectRequest request;
request.WithKey(path).WithBucket(bucket);
auto outcome = client->DeleteObject(request);
if (!outcome.IsSuccess())
{
const auto & err = outcome.GetError();
LOG_ERROR(log, "{} (Code: {})", err.GetMessage(), static_cast<size_t>(err.GetErrorType()));
}
else
{
LOG_TRACE(log, "Object with path {} was removed from S3", path);
}
};
auto s3_queue_log = s3queue_settings->s3queue_enable_logging_to_s3queue_log ? local_context->getS3QueueLog() : nullptr;
return std::make_shared<StorageS3QueueSource>(
getName(), read_from_format_info.source_header, std::move(internal_source),
files_metadata, after_processing, file_deleter, read_from_format_info.requested_virtual_columns,
local_context, shutdown_called, s3_queue_log, getStorageID());
}
bool StorageS3Queue::hasDependencies(const StorageID & table_id)
@ -280,41 +287,35 @@ bool StorageS3Queue::hasDependencies(const StorageID & table_id)
void StorageS3Queue::threadFunc()
{
bool reschedule = true;
if (shutdown_called)
return;
try
{
auto table_id = getStorageID();
auto dependencies_count = getTableDependentCount();
const size_t dependencies_count = DatabaseCatalog::instance().getDependentViews(getStorageID()).size();
if (dependencies_count)
{
auto start_time = std::chrono::steady_clock::now();
mv_attached.store(true);
// Keep streaming as long as there are attached views and streaming is not cancelled
while (!task->stream_cancelled)
SCOPE_EXIT({ mv_attached.store(false); });
LOG_DEBUG(log, "Started streaming to {} attached views", dependencies_count);
if (streamToViews())
{
if (!hasDependencies(table_id))
{
/// For this case, we can not wait for watch thread to wake up
reschedule = true;
break;
}
LOG_DEBUG(log, "Started streaming to {} attached views", dependencies_count);
streamToViews();
auto ts = std::chrono::steady_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(ts - start_time);
if (duration.count() > MAX_THREAD_WORK_DURATION_MS)
{
LOG_TRACE(log, "Thread work duration limit exceeded. Reschedule.");
reschedule = true;
break;
}
/// Reset the reschedule interval.
reschedule_processing_interval_ms = s3queue_settings->s3queue_polling_min_timeout_ms;
}
else
{
/// Increase the reschedule interval.
reschedule_processing_interval_ms += s3queue_settings->s3queue_polling_backoff_ms;
}
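/// For example, with the default settings (s3queue_polling_min_timeout_ms = 1000, s3queue_polling_backoff_ms = 1000)
/// the interval between empty polls grows as 1000 -> 2000 -> 3000 ms and so on,
/// and drops back to the minimum as soon as streamToViews() returns rows again.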
LOG_DEBUG(log, "Stopped streaming to {} attached views", dependencies_count);
}
else
{
LOG_TEST(log, "No attached dependencies");
}
}
catch (...)
@ -322,20 +323,14 @@ void StorageS3Queue::threadFunc()
tryLogCurrentException(__PRETTY_FUNCTION__);
}
mv_attached.store(false);
if (reschedule && !shutdown_called)
if (!shutdown_called)
{
LOG_TRACE(log, "Reschedule S3 Queue thread func.");
/// Reschedule with backoff.
if (reschedule_processing_interval_ms < s3queue_settings->s3queue_polling_max_timeout_ms)
reschedule_processing_interval_ms += s3queue_settings->s3queue_polling_backoff_ms;
task->holder->scheduleAfter(reschedule_processing_interval_ms);
LOG_TRACE(log, "Reschedule S3 Queue processing thread in {} ms", reschedule_processing_interval_ms);
task->scheduleAfter(reschedule_processing_interval_ms);
}
}
void StorageS3Queue::streamToViews()
bool StorageS3Queue::streamToViews()
{
auto table_id = getStorageID();
auto table = DatabaseCatalog::instance().getTable(table_id, getContext());
@ -348,8 +343,6 @@ void StorageS3Queue::streamToViews()
auto insert = std::make_shared<ASTInsertQuery>();
insert->table_id = table_id;
size_t block_size = 100;
auto s3queue_context = Context::createCopy(getContext());
s3queue_context->makeQueryContext();
auto query_configuration = updateConfigurationAndGetCopy(s3queue_context);
@ -358,40 +351,31 @@ void StorageS3Queue::streamToViews()
// Only insert into dependent views and expect that input blocks contain virtual columns
InterpreterInsertQuery interpreter(insert, s3queue_context, false, true, true);
auto block_io = interpreter.execute();
auto column_names = block_io.pipeline.getHeader().getNames();
auto file_iterator = createFileIterator(s3queue_context, nullptr);
// Create a stream for each consumer and join them in a union stream
Pipes pipes;
pipes.reserve(s3queue_settings->s3queue_processing_threads_num);
for (size_t i = 0; i < s3queue_settings->s3queue_processing_threads_num; ++i)
{
auto source = createSource(
file_iterator, block_io.pipeline.getHeader().getNames(),
storage_snapshot, DBMS_DEFAULT_BUFFER_SIZE, s3queue_context);
std::shared_ptr<StorageS3Source::IIterator> iterator_wrapper = createFileIterator(s3queue_context, nullptr);
auto read_from_format_info = prepareReadingFromFormat(column_names, storage_snapshot, supportsSubsetOfColumns(getContext()), getVirtuals());
const size_t max_download_threads = s3queue_context->getSettingsRef().max_download_threads;
auto pipe = Pipe(std::make_shared<StorageS3QueueSource>(
read_from_format_info,
configuration.format,
getName(),
s3queue_context,
format_settings,
block_size,
query_configuration.request_settings,
configuration.compression_method,
query_configuration.client,
query_configuration.url.bucket,
query_configuration.url.version_id,
query_configuration.url.uri.getHost() + std::to_string(query_configuration.url.uri.getPort()),
iterator_wrapper,
files_metadata,
after_processing,
max_download_threads));
pipes.emplace_back(std::move(source));
}
auto pipe = Pipe::unitePipes(std::move(pipes));
block_io.pipeline.complete(std::move(pipe));
block_io.pipeline.setNumThreads(s3queue_settings->s3queue_processing_threads_num);
block_io.pipeline.setConcurrencyControl(s3queue_context->getSettingsRef().use_concurrency_control);
std::atomic_size_t rows = 0;
{
block_io.pipeline.complete(std::move(pipe));
block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); });
CompletedPipelineExecutor executor(block_io.pipeline);
executor.execute();
}
block_io.pipeline.setProgressCallback([&](const Progress & progress) { rows += progress.read_rows.load(); });
CompletedPipelineExecutor executor(block_io.pipeline);
executor.execute();
return rows > 0;
}
StorageS3Queue::Configuration StorageS3Queue::updateConfigurationAndGetCopy(ContextPtr local_context)
@ -402,58 +386,43 @@ StorageS3Queue::Configuration StorageS3Queue::updateConfigurationAndGetCopy(Cont
zkutil::ZooKeeperPtr StorageS3Queue::getZooKeeper() const
{
std::lock_guard lock{zk_mutex};
if (!zk_client || zk_client->expired())
{
zk_client = getContext()->getZooKeeper();
zk_client->sync(zk_path);
}
return zk_client;
return getContext()->getZooKeeper();
}
bool StorageS3Queue::createTableIfNotExists(const StorageMetadataPtr & metadata_snapshot)
void StorageS3Queue::createOrCheckMetadata(const StorageInMemoryMetadata & storage_metadata)
{
auto zookeeper = getZooKeeper();
zookeeper->createAncestors(zk_path);
for (size_t i = 0; i < zk_create_table_retries; ++i)
for (size_t i = 0; i < 1000; ++i)
{
Coordination::Requests ops;
bool is_first_replica = true;
if (zookeeper->exists(zk_path + "/metadata"))
Coordination::Requests requests;
if (zookeeper->exists(zk_path / "metadata"))
{
if (!zookeeper->exists(zk_path + "/processing"))
ops.emplace_back(zkutil::makeCreateRequest(zk_path + "/processing", "", zkutil::CreateMode::Ephemeral));
LOG_DEBUG(log, "This table {} is already created, will use existing metadata for checking engine settings", zk_path);
is_first_replica = false;
checkTableStructure(zk_path, storage_metadata);
}
else
{
String metadata_str = S3QueueTableMetadata(configuration, *s3queue_settings).toString();
ops.emplace_back(zkutil::makeCreateRequest(zk_path, "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zk_path + "/processed", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zk_path + "/failed", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zk_path + "/processing", "", zkutil::CreateMode::Ephemeral));
ops.emplace_back(zkutil::makeCreateRequest(
zk_path + "/columns", metadata_snapshot->getColumns().toString(), zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zk_path + "/metadata", metadata_str, zkutil::CreateMode::Persistent));
std::string metadata = S3QueueTableMetadata(configuration, *s3queue_settings, storage_metadata).toString();
requests.emplace_back(zkutil::makeCreateRequest(zk_path, "", zkutil::CreateMode::Persistent));
requests.emplace_back(zkutil::makeCreateRequest(zk_path / "processed", "", zkutil::CreateMode::Persistent));
requests.emplace_back(zkutil::makeCreateRequest(zk_path / "failed", "", zkutil::CreateMode::Persistent));
requests.emplace_back(zkutil::makeCreateRequest(zk_path / "processing", "", zkutil::CreateMode::Persistent));
requests.emplace_back(zkutil::makeCreateRequest(zk_path / "metadata", metadata, zkutil::CreateMode::Persistent));
}
Coordination::Responses responses;
auto code = zookeeper->tryMulti(ops, responses);
auto code = zookeeper->tryMulti(requests, responses);
if (code == Coordination::Error::ZNODEEXISTS)
{
LOG_INFO(log, "It looks like the table {} was created by another server at the same moment, will retry", zk_path);
LOG_INFO(log, "It looks like the table {} was created by another server at the same moment, will retry", zk_path.string());
continue;
}
else if (code != Coordination::Error::ZOK)
{
zkutil::KeeperMultiException::check(code, ops, responses);
zkutil::KeeperMultiException::check(code, requests, responses);
}
return is_first_replica;
return;
}
throw Exception(
@ -463,24 +432,20 @@ bool StorageS3Queue::createTableIfNotExists(const StorageMetadataPtr & metadata_
}
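createOrCheckMetadata() above tolerates several replicas registering the same queue path at once: if the metadata node already exists the replica only validates it, otherwise it tries to create all nodes in a single multi-request and retries on ZNODEEXISTS when another replica wins the race. A rough self-contained sketch of that check-or-create-with-retry loop, using an in-memory set as a stand-in for ZooKeeper (FakeKeeper and tryCreateAll are invented names, not zkutil APIs):

#include <cstddef>
#include <mutex>
#include <set>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in for a coordination service: create a group of nodes atomically,
// failing if any of them already exists (like a ZooKeeper multi-op returning ZNODEEXISTS).
class FakeKeeper
{
public:
    bool exists(const std::string & path)
    {
        std::lock_guard lock(mutex);
        return nodes.count(path) > 0;
    }

    bool tryCreateAll(const std::vector<std::string> & paths)
    {
        std::lock_guard lock(mutex);
        for (const auto & path : paths)
            if (nodes.count(path))
                return false;
        nodes.insert(paths.begin(), paths.end());
        return true;
    }

private:
    std::mutex mutex;
    std::set<std::string> nodes;
};

void createOrCheckMetadata(FakeKeeper & keeper, const std::string & zk_path)
{
    for (size_t attempt = 0; attempt < 1000; ++attempt)
    {
        if (keeper.exists(zk_path + "/metadata"))
            return;  // another replica already registered the table; only validate here

        if (keeper.tryCreateAll({zk_path, zk_path + "/processed", zk_path + "/failed",
                                 zk_path + "/processing", zk_path + "/metadata"}))
            return;  // this replica registered the table first

        // Lost the race between exists() and the create: retry the whole loop.
    }
    throw std::runtime_error("Cannot create metadata after 1000 attempts");
}

int main()
{
    FakeKeeper keeper;
    createOrCheckMetadata(keeper, "/clickhouse/s3queue/demo");  // first replica creates the nodes
    createOrCheckMetadata(keeper, "/clickhouse/s3queue/demo");  // second replica finds them and returns
}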
/** Verify that list of columns and table settings match those specified in ZK (/metadata).
* If not, throw an exception.
*/
void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const StorageMetadataPtr & metadata_snapshot)
void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const StorageInMemoryMetadata & storage_metadata)
{
// Verify that list of columns and table settings match those specified in ZK (/metadata).
// If not, throw an exception.
auto zookeeper = getZooKeeper();
S3QueueTableMetadata old_metadata(configuration, *s3queue_settings);
Coordination::Stat metadata_stat;
String metadata_str = zookeeper->get(fs::path(zookeeper_prefix) / "metadata", &metadata_stat);
String metadata_str = zookeeper->get(fs::path(zookeeper_prefix) / "metadata");
auto metadata_from_zk = S3QueueTableMetadata::parse(metadata_str);
S3QueueTableMetadata old_metadata(configuration, *s3queue_settings, storage_metadata);
old_metadata.checkEquals(metadata_from_zk);
Coordination::Stat columns_stat;
auto columns_from_zk = ColumnsDescription::parse(zookeeper->get(fs::path(zookeeper_prefix) / "columns", &columns_stat));
const ColumnsDescription & old_columns = metadata_snapshot->getColumns();
auto columns_from_zk = ColumnsDescription::parse(metadata_from_zk.columns);
const ColumnsDescription & old_columns = storage_metadata.getColumns();
if (columns_from_zk != old_columns)
{
throw Exception(
@ -492,45 +457,12 @@ void StorageS3Queue::checkTableStructure(const String & zookeeper_prefix, const
}
}
std::shared_ptr<StorageS3QueueSource::IIterator>
StorageS3Queue::createFileIterator(ContextPtr local_context, ASTPtr query)
std::shared_ptr<StorageS3Queue::FileIterator> StorageS3Queue::createFileIterator(ContextPtr local_context, ASTPtr query)
{
auto it = std::make_shared<StorageS3QueueSource::QueueGlobIterator>(
*configuration.client,
configuration.url,
query,
virtual_columns,
local_context,
s3queue_settings->s3queue_polling_size.value,
configuration.request_settings);
auto zookeeper = getZooKeeper();
auto lock = files_metadata->acquireLock(zookeeper);
S3QueueFilesMetadata::S3FilesCollection files_to_skip = files_metadata->getProcessedFailedAndProcessingFiles();
Strings files_to_process;
if (s3queue_settings->mode == S3QueueMode::UNORDERED)
{
files_to_process = it->filterProcessingFiles(s3queue_settings->mode, files_to_skip);
}
else
{
String max_processed_file = files_metadata->getMaxProcessedFile();
files_to_process = it->filterProcessingFiles(s3queue_settings->mode, files_to_skip, max_processed_file);
}
LOG_TEST(log, "Found files to process: {}", fmt::join(files_to_process, ", "));
files_metadata->setFilesProcessing(files_to_process);
return it;
}
void StorageS3Queue::drop()
{
auto zookeeper = getZooKeeper();
if (zookeeper->exists(zk_path))
zookeeper->removeRecursive(zk_path);
auto glob_iterator = std::make_unique<StorageS3QueueSource::GlobIterator>(
*configuration.client, configuration.url, query, virtual_columns, local_context,
/* read_keys */nullptr, configuration.request_settings);
return std::make_shared<FileIterator>(files_metadata, std::move(glob_iterator), shutdown_called);
}
void registerStorageS3QueueImpl(const String & name, StorageFactory & factory)
@ -540,11 +472,15 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory)
[](const StorageFactory::Arguments & args)
{
if (!args.attach && !args.getLocalContext()->getSettingsRef().allow_experimental_s3queue)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3Queue is experimental. You can enable it with the `allow_experimental_s3queue` setting.");
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3Queue is experimental. "
"You can enable it with the `allow_experimental_s3queue` setting.");
}
auto & engine_args = args.engine_args;
if (engine_args.empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "External data source must have arguments");
auto configuration = StorageS3::getConfiguration(engine_args, args.getLocalContext());
// Use format settings from global server context + settings from
@ -582,10 +518,6 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory)
format_settings = getFormatSettings(args.getContext());
}
ASTPtr partition_by;
if (args.storage_def->partition_by)
partition_by = args.storage_def->partition_by->clone();
return std::make_shared<StorageS3Queue>(
std::move(s3queue_settings),
std::move(configuration),
@ -594,12 +526,10 @@ void registerStorageS3QueueImpl(const String & name, StorageFactory & factory)
args.constraints,
args.comment,
args.getContext(),
format_settings,
partition_by);
format_settings);
},
{
.supports_settings = true,
.supports_sort_order = true, // for partition by
.supports_schema_inference = true,
.source_access_type = AccessType::S3,
});


@ -1,32 +1,15 @@
#pragma once
#include "config.h"
#if USE_AWS_S3
# include <Core/Types.h>
# include <Compression/CompressionInfo.h>
# include <Common/ZooKeeper/ZooKeeper.h>
# include <Core/BackgroundSchedulePool.h>
# include <Storages/IStorage.h>
# include <Storages/S3Queue/S3QueueFilesMetadata.h>
# include <Storages/S3Queue/S3QueueSettings.h>
# include <Storages/S3Queue/S3QueueSource.h>
# include <Storages/StorageS3Settings.h>
# include <IO/CompressionMethod.h>
# include <IO/S3/getObjectInfo.h>
# include <Interpreters/Context.h>
# include <Interpreters/threadPoolCallbackRunner.h>
# include <Processors/Executors/PullingPipelineExecutor.h>
# include <Processors/ISource.h>
# include <Storages/Cache/SchemaCache.h>
# include <Storages/StorageConfiguration.h>
# include <Storages/StorageS3.h>
# include <Poco/URI.h>
# include <Common/logger_useful.h>
#include <Common/ZooKeeper/ZooKeeper.h>
#include <Common/logger_useful.h>
#include <Core/BackgroundSchedulePool.h>
#include <Storages/IStorage.h>
#include <Storages/S3Queue/S3QueueSettings.h>
#include <Storages/S3Queue/S3QueueSource.h>
#include <Storages/StorageS3.h>
#include <Interpreters/Context.h>
namespace Aws::S3
{
@ -35,7 +18,7 @@ class Client;
namespace DB
{
class S3QueueFilesMetadata;
class StorageS3Queue : public IStorage, WithContext
{
@ -50,8 +33,7 @@ public:
const ConstraintsDescription & constraints_,
const String & comment,
ContextPtr context_,
std::optional<FormatSettings> format_settings_,
ASTPtr partition_by_ = nullptr);
std::optional<FormatSettings> format_settings_);
String getName() const override { return "S3Queue"; }
@ -64,79 +46,55 @@ public:
size_t max_block_size,
size_t num_streams) override;
SinkToStoragePtr write(
const ASTPtr & query,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
bool async_insert) override;
void truncate(
const ASTPtr & /*query*/,
const StorageMetadataPtr & /*metadata_snapshot*/,
ContextPtr /*local_context*/,
TableExclusiveLockHolder &) override;
NamesAndTypesList getVirtuals() const override;
bool supportsPartitionBy() const override;
NamesAndTypesList getVirtuals() const override { return virtual_columns; }
const auto & getFormatName() const { return configuration.format; }
const String & getZooKeeperPath() const { return zk_path; }
const fs::path & getZooKeeperPath() const { return zk_path; }
zkutil::ZooKeeperPtr getZooKeeper() const;
private:
using FileIterator = StorageS3QueueSource::FileIterator;
const std::unique_ptr<S3QueueSettings> s3queue_settings;
const fs::path zk_path;
const S3QueueAction after_processing;
std::shared_ptr<S3QueueFilesMetadata> files_metadata;
Configuration configuration;
const std::optional<FormatSettings> format_settings;
NamesAndTypesList virtual_columns;
BackgroundSchedulePool::TaskHolder task;
std::atomic<bool> stream_cancelled{false};
UInt64 reschedule_processing_interval_ms;
std::optional<FormatSettings> format_settings;
ASTPtr partition_by;
String zk_path;
mutable zkutil::ZooKeeperPtr zk_client;
mutable std::mutex zk_mutex;
std::atomic<bool> mv_attached = false;
std::atomic<bool> shutdown_called{false};
std::atomic<bool> shutdown_called = false;
Poco::Logger * log;
bool supportsSubcolumns() const override;
bool withGlobs() const { return configuration.url.key.find_first_of("*?{") != std::string::npos; }
void threadFunc();
size_t getTableDependentCount() const;
bool hasDependencies(const StorageID & table_id);
void startup() override;
void shutdown() override;
void drop() override;
struct TaskContext
{
BackgroundSchedulePool::TaskHolder holder;
std::atomic<bool> stream_cancelled{false};
explicit TaskContext(BackgroundSchedulePool::TaskHolder && task_) : holder(std::move(task_)) { }
};
std::shared_ptr<TaskContext> task;
bool supportsSubsetOfColumns(const ContextPtr & context_) const;
bool supportsSubcolumns() const override { return true; }
const UInt32 zk_create_table_retries = 1000;
bool createTableIfNotExists(const StorageMetadataPtr & metadata_snapshot);
void checkTableStructure(const String & zookeeper_prefix, const StorageMetadataPtr & metadata_snapshot);
std::shared_ptr<FileIterator> createFileIterator(ContextPtr local_context, ASTPtr query);
std::shared_ptr<StorageS3QueueSource> createSource(
std::shared_ptr<StorageS3Queue::FileIterator> file_iterator,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
size_t max_block_size,
ContextPtr local_context);
using KeysWithInfo = StorageS3QueueSource::KeysWithInfo;
bool hasDependencies(const StorageID & table_id);
bool streamToViews();
void threadFunc();
std::shared_ptr<StorageS3QueueSource::IIterator>
createFileIterator(ContextPtr local_context, ASTPtr query);
void streamToViews();
void createOrCheckMetadata(const StorageInMemoryMetadata & storage_metadata);
void checkTableStructure(const String & zookeeper_prefix, const StorageInMemoryMetadata & storage_metadata);
Configuration updateConfigurationAndGetCopy(ContextPtr local_context);
};


@ -478,6 +478,13 @@ ActionLock StorageMaterializedView::getActionLock(StorageActionBlockType type)
return ActionLock{};
}
bool StorageMaterializedView::isRemote() const
{
if (auto table = tryGetTargetTable())
return table->isRemote();
return false;
}
void registerStorageMaterializedView(StorageFactory & factory)
{
factory.registerStorage("MaterializedView", [](const StorageFactory::Arguments & args)


@ -22,6 +22,7 @@ public:
std::string getName() const override { return "MaterializedView"; }
bool isView() const override { return true; }
bool isRemote() const override;
bool hasInnerTable() const { return has_inner_table; }


@ -161,7 +161,7 @@ public:
/// We don't have to list bucket, because there is no asterisks.
if (key_prefix.size() == globbed_uri.key.size())
{
buffer.emplace_back(globbed_uri.key, std::nullopt);
buffer.emplace_back(std::make_shared<KeyWithInfo>(globbed_uri.key, std::nullopt));
buffer_iter = buffer.begin();
is_finished = true;
return;
@ -182,7 +182,7 @@ public:
fillInternalBufferAssumeLocked();
}
KeyWithInfo next()
KeyWithInfoPtr next()
{
std::lock_guard lock(mutex);
return nextAssumeLocked();
@ -201,7 +201,7 @@ public:
private:
using ListObjectsOutcome = Aws::S3::Model::ListObjectsV2Outcome;
KeyWithInfo nextAssumeLocked()
KeyWithInfoPtr nextAssumeLocked()
{
if (buffer_iter != buffer.end())
{
@ -210,11 +210,11 @@ private:
/// If the url doesn't contain globs, we didn't list the s3 bucket and didn't get object info for the key.
/// So we fetch the object info lazily here, on the 'next()' request.
if (!answer.info)
if (!answer->info)
{
answer.info = S3::getObjectInfo(*client, globbed_uri.bucket, answer.key, globbed_uri.version_id, request_settings);
answer->info = S3::getObjectInfo(*client, globbed_uri.bucket, answer->key, globbed_uri.version_id, request_settings);
if (file_progress_callback)
file_progress_callback(FileProgress(0, answer.info->size));
file_progress_callback(FileProgress(0, answer->info->size));
}
return answer;
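Iterators now return KeyWithInfoPtr (a std::shared_ptr) instead of a plain KeyWithInfo: a null pointer cleanly signals that the listing is exhausted, and object info fetched lazily on the first next() call is stored once and visible to every holder of the pointer. A small self-contained sketch of the idea, with placeholder ObjectInfo/fetchObjectInfo/KeysIterator names rather than the real S3 helpers:

#include <cstddef>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <vector>

struct ObjectInfo { size_t size = 0; };

struct KeyWithInfo
{
    explicit KeyWithInfo(std::string key_, std::optional<ObjectInfo> info_ = std::nullopt)
        : key(std::move(key_)), info(std::move(info_)) {}

    std::string key;
    std::optional<ObjectInfo> info;
};
using KeyWithInfoPtr = std::shared_ptr<KeyWithInfo>;

class KeysIterator
{
public:
    explicit KeysIterator(std::vector<std::string> keys)
    {
        for (auto & key : keys)
            buffer.push_back(std::make_shared<KeyWithInfo>(std::move(key)));
    }

    // nullptr means "no more keys", which a by-value KeyWithInfo could not express cleanly.
    KeyWithInfoPtr next()
    {
        if (index >= buffer.size())
            return nullptr;
        return buffer[index++];
    }

private:
    std::vector<KeyWithInfoPtr> buffer;
    size_t index = 0;
};

// Placeholder for the object-info request: pretend the object size is the key length.
ObjectInfo fetchObjectInfo(const std::string & key) { return {key.size()}; }

int main()
{
    KeysIterator iterator({"a.csv", "bb.csv"});
    while (auto key_with_info = iterator.next())
    {
        if (!key_with_info->info)
            key_with_info->info = fetchObjectInfo(key_with_info->key);  // filled once, shared by all holders
        std::cout << key_with_info->key << " " << key_with_info->info->size << "\n";
    }
}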
@ -287,7 +287,7 @@ private:
.last_modification_time = row.GetLastModified().Millis() / 1000,
};
temp_buffer.emplace_back(std::move(key), std::move(info));
temp_buffer.emplace_back(std::make_shared<KeyWithInfo>(std::move(key), std::move(info)));
}
}
@ -299,7 +299,7 @@ private:
if (!is_initialized)
{
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(globbed_uri.bucket) / temp_buffer.front().key, getContext());
filter_ast = VirtualColumnUtils::createPathAndFileFilterAst(query, virtual_columns, fs::path(globbed_uri.bucket) / temp_buffer.front()->key, getContext());
is_initialized = true;
}
@ -308,7 +308,7 @@ private:
std::vector<String> paths;
paths.reserve(temp_buffer.size());
for (const auto & key_with_info : temp_buffer)
paths.push_back(fs::path(globbed_uri.bucket) / key_with_info.key);
paths.push_back(fs::path(globbed_uri.bucket) / key_with_info->key);
VirtualColumnUtils::filterByPathOrFile(temp_buffer, paths, query, virtual_columns, getContext(), filter_ast);
}
@ -317,8 +317,8 @@ private:
if (file_progress_callback)
{
for (const auto & [_, info] : buffer)
file_progress_callback(FileProgress(0, info->size));
for (const auto & key_with_info : buffer)
file_progress_callback(FileProgress(0, key_with_info->info->size));
}
/// Set iterator only after the whole batch is processed
@ -381,7 +381,7 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator(
{
}
StorageS3Source::KeyWithInfo StorageS3Source::DisclosedGlobIterator::next()
StorageS3Source::KeyWithInfoPtr StorageS3Source::DisclosedGlobIterator::next()
{
return pimpl->next();
}
@ -432,11 +432,11 @@ public:
if (read_keys_)
{
for (const auto & key : keys)
read_keys_->push_back({key, {}});
read_keys_->push_back(std::make_shared<KeyWithInfo>(key));
}
}
KeyWithInfo next()
KeyWithInfoPtr next()
{
size_t current_index = index.fetch_add(1, std::memory_order_relaxed);
if (current_index >= keys.size())
@ -449,7 +449,7 @@ public:
file_progress_callback(FileProgress(0, info->size));
}
return {key, info};
return std::make_shared<KeyWithInfo>(key, info);
}
size_t objectsCount()
@ -486,7 +486,7 @@ StorageS3Source::KeysIterator::KeysIterator(
{
}
StorageS3Source::KeyWithInfo StorageS3Source::KeysIterator::next()
StorageS3Source::KeyWithInfoPtr StorageS3Source::KeysIterator::next()
{
return pimpl->next();
}
@ -512,14 +512,14 @@ StorageS3Source::ReadTaskIterator::ReadTaskIterator(
pool.wait();
buffer.reserve(max_threads_count);
for (auto & key_future : keys)
buffer.emplace_back(key_future.get(), std::nullopt);
buffer.emplace_back(std::make_shared<KeyWithInfo>(key_future.get(), std::nullopt));
}
StorageS3Source::KeyWithInfo StorageS3Source::ReadTaskIterator::next()
StorageS3Source::KeyWithInfoPtr StorageS3Source::ReadTaskIterator::next()
{
size_t current_index = index.fetch_add(1, std::memory_order_relaxed);
if (current_index >= buffer.size())
return {callback(), {}};
return std::make_shared<KeyWithInfo>(callback());
return buffer[current_index];
}
@ -576,22 +576,22 @@ StorageS3Source::StorageS3Source(
StorageS3Source::ReaderHolder StorageS3Source::createReader()
{
KeyWithInfo key_with_info;
KeyWithInfoPtr key_with_info;
do
{
key_with_info = (*file_iterator)();
if (key_with_info.key.empty())
if (!key_with_info || key_with_info->key.empty())
return {};
if (!key_with_info.info)
key_with_info.info = S3::getObjectInfo(*client, bucket, key_with_info.key, version_id, request_settings);
if (!key_with_info->info)
key_with_info->info = S3::getObjectInfo(*client, bucket, key_with_info->key, version_id, request_settings);
}
while (getContext()->getSettingsRef().s3_skip_empty_files && key_with_info.info->size == 0);
while (getContext()->getSettingsRef().s3_skip_empty_files && key_with_info->info->size == 0);
QueryPipelineBuilder builder;
std::shared_ptr<ISource> source;
std::unique_ptr<ReadBuffer> read_buf;
std::optional<size_t> num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? tryGetNumRowsFromCache(key_with_info) : std::nullopt;
std::optional<size_t> num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? tryGetNumRowsFromCache(*key_with_info) : std::nullopt;
if (num_rows_from_cache)
{
/// We should not return single chunk with all number of rows,
@ -604,8 +604,8 @@ StorageS3Source::ReaderHolder StorageS3Source::createReader()
}
else
{
auto compression_method = chooseCompressionMethod(key_with_info.key, compression_hint);
read_buf = createS3ReadBuffer(key_with_info.key, key_with_info.info->size);
auto compression_method = chooseCompressionMethod(key_with_info->key, compression_hint);
read_buf = createS3ReadBuffer(key_with_info->key, key_with_info->info->size);
auto input_format = FormatFactory::instance().getInput(
format,
@ -1505,7 +1505,7 @@ namespace
{
current_key_with_info = (*file_iterator)();
if (current_key_with_info.key.empty())
if (!current_key_with_info || current_key_with_info->key.empty())
{
if (first)
throw Exception(
@ -1526,15 +1526,15 @@ namespace
return nullptr;
}
if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info.info && current_key_with_info.info->size == 0)
if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0)
continue;
int zstd_window_log_max = static_cast<int>(getContext()->getSettingsRef().zstd_window_log_max);
auto impl = std::make_unique<ReadBufferFromS3>(configuration.client, configuration.url.bucket, current_key_with_info.key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings());
auto impl = std::make_unique<ReadBufferFromS3>(configuration.client, configuration.url.bucket, current_key_with_info->key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings());
if (!getContext()->getSettingsRef().s3_skip_empty_files || !impl->eof())
{
first = false;
return wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info.key, configuration.compression_method), zstd_window_log_max);
return wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max);
}
}
}
@ -1549,7 +1549,7 @@ namespace
if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3)
return;
String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info.key;
String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key;
auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext());
StorageS3::getSchemaCache(getContext()).addNumRows(key, num_rows);
}
@ -1560,7 +1560,7 @@ namespace
const StorageS3::Configuration & configuration;
const std::optional<FormatSettings> & format_settings;
std::optional<ColumnsDescription> columns_from_cache;
StorageS3Source::KeyWithInfo current_key_with_info;
StorageS3Source::KeyWithInfoPtr current_key_with_info;
size_t prev_read_keys_size;
bool first = true;
};
@ -1700,9 +1700,9 @@ std::optional<ColumnsDescription> StorageS3::tryGetColumnsFromCache(
auto get_last_mod_time = [&]
{
time_t last_modification_time = 0;
if (it->info)
if ((*it)->info)
{
last_modification_time = it->info->last_modification_time;
last_modification_time = (*it)->info->last_modification_time;
}
else
{
@ -1712,7 +1712,7 @@ std::optional<ColumnsDescription> StorageS3::tryGetColumnsFromCache(
last_modification_time = S3::getObjectInfo(
*configuration.client,
configuration.url.bucket,
it->key,
(*it)->key,
configuration.url.version_id,
configuration.request_settings,
/*with_metadata=*/ false,
@ -1723,7 +1723,7 @@ std::optional<ColumnsDescription> StorageS3::tryGetColumnsFromCache(
return last_modification_time ? std::make_optional(last_modification_time) : std::nullopt;
};
String path = fs::path(configuration.url.bucket) / it->key;
String path = fs::path(configuration.url.bucket) / (*it)->key;
String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path;
auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, ctx);
auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time);
@ -1745,7 +1745,7 @@ void StorageS3::addColumnsToCache(
auto host_and_bucket = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket;
Strings sources;
sources.reserve(keys.size());
std::transform(keys.begin(), keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem.key; });
std::transform(keys.begin(), keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; });
auto cache_keys = getKeysForSchemaCache(sources, format_name, format_settings, ctx);
auto & schema_cache = getSchemaCache(ctx);
schema_cache.addManyColumns(cache_keys, columns);


@ -43,22 +43,24 @@ public:
struct KeyWithInfo
{
KeyWithInfo() = default;
KeyWithInfo(String key_, std::optional<S3::ObjectInfo> info_)
: key(std::move(key_)), info(std::move(info_))
{
}
explicit KeyWithInfo(String key_, std::optional<S3::ObjectInfo> info_ = std::nullopt)
: key(std::move(key_)), info(std::move(info_)) {}
virtual ~KeyWithInfo() = default;
String key;
std::optional<S3::ObjectInfo> info;
};
using KeyWithInfoPtr = std::shared_ptr<KeyWithInfo>;
using KeysWithInfo = std::vector<KeyWithInfo>;
using KeysWithInfo = std::vector<KeyWithInfoPtr>;
class IIterator
{
public:
virtual ~IIterator() = default;
virtual KeyWithInfo next() = 0;
virtual KeyWithInfoPtr next() = 0;
/// Estimates how many streams we need to process all files.
/// If the key count >= max_threads_count, the returned number may not represent the actual number of keys.
@ -66,7 +68,7 @@ public:
/// fixme: May underestimate if the glob has a strong filter, so there are few matches among the first 1000 ListObjects results.
virtual size_t estimatedKeysCount() = 0;
KeyWithInfo operator ()() { return next(); }
KeyWithInfoPtr operator ()() { return next(); }
};
class DisclosedGlobIterator : public IIterator
@ -82,7 +84,7 @@ public:
const S3Settings::RequestSettings & request_settings_ = {},
std::function<void(FileProgress)> progress_callback_ = {});
KeyWithInfo next() override;
KeyWithInfoPtr next() override;
size_t estimatedKeysCount() override;
private:
@ -106,7 +108,7 @@ public:
KeysWithInfo * read_keys = nullptr,
std::function<void(FileProgress)> progress_callback_ = {});
KeyWithInfo next() override;
KeyWithInfoPtr next() override;
size_t estimatedKeysCount() override;
private:
@ -120,7 +122,7 @@ public:
public:
explicit ReadTaskIterator(const ReadTaskCallback & callback_, const size_t max_threads_count);
KeyWithInfo next() override;
KeyWithInfoPtr next() override;
size_t estimatedKeysCount() override;
private:
@ -176,13 +178,13 @@ private:
{
public:
ReaderHolder(
KeyWithInfo key_with_info_,
KeyWithInfoPtr key_with_info_,
String bucket_,
std::unique_ptr<ReadBuffer> read_buf_,
std::shared_ptr<ISource> source_,
std::unique_ptr<QueryPipeline> pipeline_,
std::unique_ptr<PullingPipelineExecutor> reader_)
: key_with_info(std::move(key_with_info_))
: key_with_info(key_with_info_)
, bucket(std::move(bucket_))
, read_buf(std::move(read_buf_))
, source(std::move(source_))
@ -216,14 +218,14 @@ private:
explicit operator bool() const { return reader != nullptr; }
PullingPipelineExecutor * operator->() { return reader.get(); }
const PullingPipelineExecutor * operator->() const { return reader.get(); }
String getPath() const { return fs::path(bucket) / key_with_info.key; }
const String & getFile() const { return key_with_info.key; }
const KeyWithInfo & getKeyWithInfo() const { return key_with_info; }
String getPath() const { return fs::path(bucket) / key_with_info->key; }
const String & getFile() const { return key_with_info->key; }
const KeyWithInfo & getKeyWithInfo() const { return *key_with_info; }
const IInputFormat * getInputFormat() const { return dynamic_cast<const IInputFormat *>(source.get()); }
private:
KeyWithInfo key_with_info;
KeyWithInfoPtr key_with_info;
String bucket;
std::unique_ptr<ReadBuffer> read_buf;
std::shared_ptr<ISource> source;


@ -82,7 +82,13 @@ RemoteQueryExecutor::Extension StorageS3Cluster::getTaskIteratorExtension(ASTPtr
{
auto iterator = std::make_shared<StorageS3Source::DisclosedGlobIterator>(
*s3_configuration.client, s3_configuration.url, query, virtual_columns, context, nullptr, s3_configuration.request_settings, context->getFileProgressCallback());
auto callback = std::make_shared<std::function<String()>>([iterator]() mutable -> String { return iterator->next().key; });
auto callback = std::make_shared<std::function<String()>>([iterator]() mutable -> String
{
if (auto next = iterator->next())
return next->key;
return "";
});
return RemoteQueryExecutor::Extension{ .task_iterator = std::move(callback) };
}


@ -156,12 +156,62 @@ StorageSet::StorageSet(
}
void StorageSet::insertBlock(const Block & block, ContextPtr) { set->insertFromBlock(block.getColumnsWithTypeAndName()); }
void StorageSet::finishInsert() { set->finishInsert(); }
SetPtr StorageSet::getSet() const
{
std::lock_guard lock(mutex);
return set;
}
size_t StorageSet::getSize(ContextPtr) const { return set->getTotalRowCount(); }
std::optional<UInt64> StorageSet::totalRows(const Settings &) const { return set->getTotalRowCount(); }
std::optional<UInt64> StorageSet::totalBytes(const Settings &) const { return set->getTotalByteCount(); }
void StorageSet::insertBlock(const Block & block, ContextPtr)
{
SetPtr current_set;
{
std::lock_guard lock(mutex);
current_set = set;
}
current_set->insertFromBlock(block.getColumnsWithTypeAndName());
}
void StorageSet::finishInsert()
{
SetPtr current_set;
{
std::lock_guard lock(mutex);
current_set = set;
}
current_set->finishInsert();
}
size_t StorageSet::getSize(ContextPtr) const
{
SetPtr current_set;
{
std::lock_guard lock(mutex);
current_set = set;
}
return current_set->getTotalRowCount();
}
std::optional<UInt64> StorageSet::totalRows(const Settings &) const
{
SetPtr current_set;
{
std::lock_guard lock(mutex);
current_set = set;
}
return current_set->getTotalRowCount();
}
std::optional<UInt64> StorageSet::totalBytes(const Settings &) const
{
SetPtr current_set;
{
std::lock_guard lock(mutex);
current_set = set;
}
return current_set->getTotalByteCount();
}
void StorageSet::truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &)
{
@ -176,8 +226,13 @@ void StorageSet::truncate(const ASTPtr &, const StorageMetadataPtr & metadata_sn
Block header = metadata_snapshot->getSampleBlock();
increment = 0;
set = std::make_shared<Set>(SizeLimits(), 0, true);
set->setHeader(header.getColumnsWithTypeAndName());
auto new_set = std::make_shared<Set>(SizeLimits(), 0, true);
new_set->setHeader(header.getColumnsWithTypeAndName());
{
std::lock_guard lock(mutex);
set = new_set;
}
}
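Each accessor above takes the mutex only long enough to copy the shared_ptr and then works on the Set outside the lock, so a concurrent truncate() that publishes a fresh set never invalidates the object a reader is already using. A minimal standalone sketch of that copy-under-lock pattern with generic names (a single writer is assumed for simplicity):

#include <cstddef>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

// The guarded shared_ptr is copied under a short lock; all real work happens
// on the copy, outside the lock.
class ConcurrentHolder
{
public:
    ConcurrentHolder() : data(std::make_shared<std::vector<int>>()) {}

    void insert(int value)
    {
        auto current = snapshot();
        current->push_back(value);
    }

    size_t size() const { return snapshot()->size(); }

    // "truncate": publish a fresh object; readers still holding the old pointer keep a valid one.
    void reset()
    {
        auto fresh = std::make_shared<std::vector<int>>();
        std::lock_guard lock(mutex);
        data = fresh;
    }

private:
    std::shared_ptr<std::vector<int>> snapshot() const
    {
        std::lock_guard lock(mutex);
        return data;
    }

    mutable std::mutex mutex;
    std::shared_ptr<std::vector<int>> data;
};

int main()
{
    ConcurrentHolder holder;
    holder.insert(1);
    holder.insert(2);
    holder.reset();                      // concurrent readers with an old snapshot stay valid
    std::cout << holder.size() << "\n";  // 0: the fresh object is now published
}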


@ -79,7 +79,7 @@ public:
String getName() const override { return "Set"; }
/// Access the insides.
SetPtr & getSet() { return set; }
SetPtr getSet() const;
void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, ContextPtr, TableExclusiveLockHolder &) override;
@ -87,7 +87,9 @@ public:
std::optional<UInt64> totalBytes(const Settings & settings) const override;
private:
SetPtr set;
/// Allows truncating the set concurrently while the existing set is still being read or filled.
mutable std::mutex mutex;
SetPtr set TSA_GUARDED_BY(mutex);
void insertBlock(const Block & block, ContextPtr) override;
void finishInsert() override;


@ -0,0 +1,73 @@
#include "StorageSystemS3Queue.h"
#include <Access/ContextAccess.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeMap.h>
#include <Interpreters/Cache/FileCache.h>
#include <Interpreters/Cache/FileSegment.h>
#include <Interpreters/Cache/FileCacheFactory.h>
#include <Interpreters/Context.h>
#include <Interpreters/ProfileEventsExt.h>
#include <Storages/S3Queue/S3QueueFilesMetadata.h>
#include <Storages/S3Queue/S3QueueMetadataFactory.h>
#include <Storages/S3Queue/StorageS3Queue.h>
#include <Disks/IDisk.h>
namespace DB
{
NamesAndTypesList StorageSystemS3Queue::getNamesAndTypes()
{
return {
{"zookeeper_path", std::make_shared<DataTypeString>()},
{"file_name", std::make_shared<DataTypeString>()},
{"rows_processed", std::make_shared<DataTypeUInt64>()},
{"status", std::make_shared<DataTypeString>()},
{"processing_start_time", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeDateTime>())},
{"processing_end_time", std::make_shared<DataTypeNullable>(std::make_shared<DataTypeDateTime>())},
{"ProfileEvents", std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeUInt64>())},
{"exception", std::make_shared<DataTypeString>()},
};
}
StorageSystemS3Queue::StorageSystemS3Queue(const StorageID & table_id_)
: IStorageSystemOneBlock(table_id_)
{
}
void StorageSystemS3Queue::fillData(MutableColumns & res_columns, ContextPtr, const SelectQueryInfo &) const
{
for (const auto & [zookeeper_path, metadata] : S3QueueMetadataFactory::instance().getAll())
{
for (const auto & [file_name, file_status] : metadata->getFileStateses())
{
size_t i = 0;
res_columns[i++]->insert(zookeeper_path);
res_columns[i++]->insert(file_name);
std::lock_guard lock(file_status->metadata_lock);
res_columns[i++]->insert(file_status->processed_rows.load());
res_columns[i++]->insert(magic_enum::enum_name(file_status->state));
if (file_status->processing_start_time)
res_columns[i++]->insert(file_status->processing_start_time);
else
res_columns[i++]->insertDefault();
if (file_status->processing_end_time)
res_columns[i++]->insert(file_status->processing_end_time);
else
res_columns[i++]->insertDefault();
ProfileEvents::dumpToMapColumn(file_status->profile_counters.getPartiallyAtomicSnapshot(), res_columns[i++].get(), true);
res_columns[i++]->insert(file_status->last_exception);
}
}
}
}


@ -0,0 +1,23 @@
#pragma once
#include "config.h"
#include <Storages/System/IStorageSystemOneBlock.h>
#include <Interpreters/Cache/FileCache_fwd_internal.h>
namespace DB
{
class StorageSystemS3Queue final : public IStorageSystemOneBlock<StorageSystemS3Queue>
{
public:
explicit StorageSystemS3Queue(const StorageID & table_id_);
std::string getName() const override { return "SystemS3Queue"; }
static NamesAndTypesList getNamesAndTypes();
protected:
void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override;
};
}


@ -84,6 +84,7 @@
#include <Storages/System/StorageSystemZooKeeperConnection.h>
#include <Storages/System/StorageSystemJemalloc.h>
#include <Storages/System/StorageSystemScheduler.h>
#include <Storages/System/StorageSystemS3Queue.h>
#if USE_RDKAFKA
#include <Storages/System/StorageSystemKafkaConsumers.h>
@ -196,6 +197,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b
attach<StorageSystemNamedCollections>(context, system_database, "named_collections");
attach<StorageSystemUserProcesses>(context, system_database, "user_processes");
attach<StorageSystemJemallocBins>(context, system_database, "jemalloc_bins");
attach<StorageSystemS3Queue>(context, system_database, "s3queue");
if (has_zookeeper)
{


@ -1,3 +1,4 @@
#include <algorithm>
#include <memory>
#include <Core/NamesAndTypes.h>
#include <Core/TypeId.h>
@ -81,14 +82,33 @@ bool extractFunctions(const ASTPtr & expression, const std::function<bool(const
}
else if (function->name == "or")
{
bool ret = true;
bool ret = false;
ASTs or_args;
for (const auto & child : function->arguments->children)
ret &= extractFunctions(child, is_constant, or_args);
/// We can keep the condition only if it is still an OR condition (i.e. we
/// have dependent conditions for columns on both sides)
if (or_args.size() == 2)
ret |= extractFunctions(child, is_constant, or_args);
if (!or_args.empty())
{
/// If fewer arguments satisfy is_constant() == true, we need to add an
/// always-true argument implicitly to avoid breaking the AND invariant.
///
/// Consider the following:
///
/// ((value = 10) OR (_table = 'v2')) AND ((_table = 'v1') OR (value = 20))
///
/// Without implicit always-true:
///
/// (_table = 'v2') AND (_table = 'v1')
///
/// With:
///
/// (_table = 'v2' OR 1) AND (_table = 'v1' OR 1) -> (_table = 'v2') OR (_table = 'v1')
///
if (or_args.size() != function->arguments->children.size())
or_args.push_back(std::make_shared<ASTLiteral>(Field(1)));
result.push_back(makeASTForLogicalOr(std::move(or_args)));
}
return ret;
}
}
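The padding rule can be checked on plain strings: from each OR keep only the disjuncts that can be evaluated, and if any were dropped, append a literal true so the surrounding AND never becomes stricter than the original filter. A self-contained sketch of just that step (it mirrors the idea on strings; the real code builds ASTs with makeASTForLogicalOr):

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Keep only the disjuncts the predicate accepts; if some were dropped,
// append "1" (always true) so the clause can only get weaker, never stricter.
std::string extractOr(const std::vector<std::string> & disjuncts,
                      const std::function<bool(const std::string &)> & can_keep)
{
    std::vector<std::string> kept;
    for (const auto & disjunct : disjuncts)
        if (can_keep(disjunct))
            kept.push_back(disjunct);

    if (kept.empty())
        return "1";  // nothing usable: the whole clause degenerates to always-true
    if (kept.size() != disjuncts.size())
        kept.push_back("1");

    std::string out = kept.front();
    for (size_t i = 1; i < kept.size(); ++i)
        out += " OR " + kept[i];
    return "(" + out + ")";
}

int main()
{
    // Only conditions on the virtual column _table can be evaluated early.
    auto on_virtual_column = [](const std::string & d) { return d.rfind("_table", 0) == 0; };

    std::cout << extractOr({"value = 10", "_table = 'v2'"}, on_virtual_column) << " AND "
              << extractOr({"_table = 'v1'", "value = 20"}, on_virtual_column) << "\n";
    // Prints: (_table = 'v2' OR 1) AND (_table = 'v1' OR 1)
}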
@ -165,8 +185,10 @@ bool prepareFilterBlockWithQuery(const ASTPtr & query, ContextPtr context, Block
if (!select.where() && !select.prewhere())
return unmodified;
// Provide input columns as constant columns to check if an expression is constant.
std::function<bool(const ASTPtr &)> is_constant = [&block, &context](const ASTPtr & node)
// Provide input columns as constant columns to check if an expression is
// constant and depends on the columns from the provided block (the latter is
// required to allow skipping some conditions when handling OR).
std::function<bool(const ASTPtr &)> is_constant = [&block, &context](const ASTPtr & expr)
{
auto actions = std::make_shared<ActionsDAG>(block.getColumnsWithTypeAndName());
PreparedSetsPtr prepared_sets = std::make_shared<PreparedSets>();
@ -178,13 +200,26 @@ bool prepareFilterBlockWithQuery(const ASTPtr & query, ContextPtr context, Block
context, SizeLimits{}, 1, source_columns, std::move(actions), prepared_sets, true, true, true,
{ aggregation_keys, grouping_set_keys, GroupByKind::NONE });
ActionsVisitor(visitor_data).visit(node);
ActionsVisitor(visitor_data).visit(expr);
actions = visitor_data.getActions();
auto expr_column_name = expr->getColumnName();
const auto * expr_const_node = actions->tryFindInOutputs(expr_column_name);
if (!expr_const_node)
return false;
auto filter_actions = ActionsDAG::buildFilterActionsDAG({expr_const_node}, {}, context);
const auto & nodes = filter_actions->getNodes();
bool has_dependent_columns = std::any_of(nodes.begin(), nodes.end(), [&](const auto & node)
{
return block.has(node.result_name);
});
if (!has_dependent_columns)
return false;
auto expression_actions = std::make_shared<ExpressionActions>(actions);
auto block_with_constants = block;
expression_actions->execute(block_with_constants);
auto column_name = node->getColumnName();
return block_with_constants.has(column_name) && isColumnConst(*block_with_constants.getByName(column_name).column);
return block_with_constants.has(expr_column_name) && isColumnConst(*block_with_constants.getByName(expr_column_name).column);
};
/// Create an expression that evaluates the expressions in WHERE and PREWHERE, depending only on the existing columns.

tests/README.md (new file)

@ -0,0 +1 @@
Find CI documentation and instructions on running CI checks locally [here](https://clickhouse.com/docs/en/development/continuous-integration).


@ -39,8 +39,6 @@ test_settings_profile/test.py::test_show_profiles
test_shard_level_const_function/test.py::test_remote
test_sql_user_defined_functions_on_cluster/test.py::test_sql_user_defined_functions_on_cluster
test_storage_rabbitmq/test.py::test_rabbitmq_materialized_view
test_system_merges/test.py::test_mutation_simple[]
test_system_merges/test.py::test_mutation_simple[replicated]
test_user_defined_object_persistence/test.py::test_persistence
test_wrong_db_or_table_name/test.py::test_wrong_table_name
test_zookeeper_config/test.py::test_chroot_with_same_root


@ -51,9 +51,9 @@ def get_gh_api(
sleep: int = 3,
**kwargs: Any,
) -> requests.Response:
"""It's a wrapper around get_with_retries that requests GH api w/o auth by
default, and falls back to the get_best_robot_token in case of receiving
"403 rate limit exceeded" error
"""
Request the GH API without auth by default, and fall back to get_best_robot_token on a
"403 rate limit exceeded" or "404 not found" error.
Auth is set automatically when ROBOT_TOKEN has already been set by get_best_robot_token
"""
@ -71,27 +71,39 @@ def get_gh_api(
if grt.ROBOT_TOKEN is not None:
set_auth_header()
need_retry = False
for _ in range(retries):
token_is_set = "Authorization" in kwargs.get("headers", {})
exc = Exception("A placeholder to satisfy typing and avoid nesting")
try_cnt = 0
while try_cnt < retries:
try_cnt += 1
try:
response = get_with_retries(url, 1, sleep, **kwargs)
response = requests.get(url, **kwargs)
response.raise_for_status()
return response
except requests.HTTPError as exc:
if (
exc.response.status_code == 403
except requests.HTTPError as e:
exc = e
ratelimit_exceeded = (
e.response.status_code == 403
and b"rate limit exceeded"
in exc.response._content # pylint:disable=protected-access
):
in e.response._content # pylint:disable=protected-access
)
try_auth = e.response.status_code == 404
if (ratelimit_exceeded or try_auth) and not token_is_set:
logging.warning(
"Received rate limit exception, setting the auth header and retry"
)
set_auth_header()
need_retry = True
break
token_is_set = True
try_cnt = 0
continue
except Exception as e:
exc = e
if need_retry:
return get_with_retries(url, retries, sleep, **kwargs)
if try_cnt < retries:
logging.info("Exception '%s' while getting, retry %i", exc, try_cnt)
time.sleep(sleep)
raise exc
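The Python loop above retries plain requests and, on a "403 rate limit exceeded" or a 404 while unauthenticated, switches to an authenticated request and restarts the retry budget. A rough rendering of that flow, sketched in C++ with a stand-in request callable (HttpError and getWithAuthFallback are invented names, not the real helper):

#include <chrono>
#include <exception>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <string>
#include <thread>

struct HttpError : std::runtime_error
{
    int status;
    HttpError(int status_, const std::string & message) : std::runtime_error(message), status(status_) {}
};

// Retry loop with an authentication fallback: on a 403 rate limit or a 404 while
// unauthenticated, switch to authenticated requests and restart the retry budget.
std::string getWithAuthFallback(const std::function<std::string(bool /* with_auth */)> & request,
                                int retries = 5, int sleep_seconds = 1)
{
    if (retries <= 0)
        throw std::invalid_argument("retries must be positive");

    bool with_auth = false;
    std::exception_ptr last_error;
    for (int attempt = 0; attempt < retries; ++attempt)
    {
        try
        {
            return request(with_auth);
        }
        catch (const HttpError & e)
        {
            last_error = std::current_exception();
            if (!with_auth && (e.status == 403 || e.status == 404))
            {
                with_auth = true;
                attempt = -1;  // reset the retry budget after switching to auth
                continue;
            }
        }
        catch (...)
        {
            last_error = std::current_exception();
        }
        if (attempt + 1 < retries)
            std::this_thread::sleep_for(std::chrono::seconds(sleep_seconds));
    }
    std::rethrow_exception(last_error);
}

int main()
{
    int calls = 0;
    // Hypothetical endpoint: fails with 403 until the request is authenticated.
    auto result = getWithAuthFallback([&](bool with_auth) -> std::string
    {
        ++calls;
        if (!with_auth)
            throw HttpError(403, "rate limit exceeded");
        return "ok";
    });
    std::cout << result << " after " << calls << " calls\n";
}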
def get_build_name_for_check(check_name: str) -> str:

Some files were not shown because too many files have changed in this diff.