Merge branch 'master' into alter_in_memory_db

This commit is contained in:
mergify[bot] 2021-10-29 19:41:53 +00:00 committed by GitHub
commit cf227c477a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
114 changed files with 3252 additions and 870 deletions

View File

@ -47,9 +47,17 @@ jobs:
- name: Check out repository code
uses: actions/checkout@v2
- name: Style Check
env:
TEMP_PATH: ${{ runner.temp }}/style_check
run: |
cd $GITHUB_WORKSPACE/tests/ci
python3 style_check.py
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
BuilderDebDebug:
needs: DockerHubPush
runs-on: [self-hosted, builder]
@ -58,14 +66,16 @@ jobs:
uses: actions/download-artifact@v2
with:
name: changed_images
path: ${{ runner.temp }}/build_check
path: ${{ runner.temp }}/images_path
- name: Check out repository code
uses: actions/checkout@v2
with:
submodules: 'recursive'
fetch-depth: 0 # otherwise we will have no info about contributors
- name: Build
env:
TEMP_PATH: ${{runner.temp}}/build_check
IMAGES_PATH: ${{runner.temp}}/images_path
REPO_COPY: ${{runner.temp}}/build_check/ClickHouse
CACHES_PATH: ${{runner.temp}}/../ccaches
CHECK_NAME: 'ClickHouse build check (actions)'
@ -80,6 +90,12 @@ jobs:
with:
name: ${{ env.BUILD_NAME }}
path: ${{ runner.temp }}/build_check/${{ env.BUILD_NAME }}.json
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
BuilderReport:
needs: [BuilderDebDebug]
runs-on: [self-hosted, style-checker]
@ -100,6 +116,72 @@ jobs:
mkdir -p $TEMP_PATH
cd $GITHUB_WORKSPACE/tests/ci
python3 build_report_check.py "$CHECK_NAME"
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatelessTestDebug:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
env:
TEMP_PATH: ${{runner.temp}}/stateless_debug
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stateless tests (debug, actions)'
REPO_COPY: ${{runner.temp}}/stateless_debug/ClickHouse
REQUIRED_BUILD_NUMBER: 7
KILL_TIMEOUT: 10800
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $REQUIRED_BUILD_NUMBER $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FunctionalStatefulTestDebug:
needs: [BuilderDebDebug]
runs-on: [self-hosted, func-tester]
steps:
- name: Download json reports
uses: actions/download-artifact@v2
with:
path: ${{runner.temp}}/reports_dir
- name: Check out repository code
uses: actions/checkout@v2
- name: Functional test
env:
TEMP_PATH: ${{runner.temp}}/stateful_debug
REPORTS_PATH: ${{runner.temp}}/reports_dir
CHECK_NAME: 'Stateful tests (debug, actions)'
REPO_COPY: ${{runner.temp}}/stateful_debug/ClickHouse
REQUIRED_BUILD_NUMBER: 7
KILL_TIMEOUT: 3600
run: |
sudo rm -fr $TEMP_PATH
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci
python3 functional_test_check.py "$CHECK_NAME" $REQUIRED_BUILD_NUMBER $KILL_TIMEOUT
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FastTest:
needs: DockerHubPush
runs-on: [self-hosted, builder]
@ -116,8 +198,14 @@ jobs:
mkdir -p $TEMP_PATH
cp -r $GITHUB_WORKSPACE $TEMP_PATH
cd $REPO_COPY/tests/ci && python3 fast_test_check.py
- name: Cleanup
if: always()
run: |
docker kill $(docker ps -q) ||:
docker rm -f $(docker ps -a -q) ||:
sudo rm -fr $TEMP_PATH
FinishCheck:
needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest]
needs: [StyleCheck, DockerHubPush, CheckLabels, BuilderReport, FastTest, FunctionalStatelessTestDebug, FunctionalStatefulTestDebug]
runs-on: [self-hosted, style-checker]
steps:
- name: Check out repository code

View File

@ -264,7 +264,7 @@ function run_tests
set +e
time clickhouse-test --hung-check -j 8 --order=random \
--fast-tests-only --no-long --testname --shard --zookeeper \
--fast-tests-only --no-long --testname --shard --zookeeper --check-zookeeper-session \
-- "$FASTTEST_FOCUS" 2>&1 \
| ts '%Y-%m-%d %H:%M:%S' \
| tee "$FASTTEST_OUTPUT/test_result.txt"

View File

@ -1,5 +1,5 @@
#!/bin/bash
# shellcheck disable=SC2086,SC2001
# shellcheck disable=SC2086,SC2001,SC2046
set -eux
set -o pipefail
@ -13,24 +13,48 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
repo_dir=ch
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-13_debug_none_bundled_unsplitted_disable_False_binary"}
BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"}
function clone
{
# The download() function is dependent on CI binaries anyway, so we can take
# the repo from the CI as well. For local runs, start directly from the "fuzz"
# stage.
rm -rf ch ||:
mkdir ch ||:
wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz"
tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz
# For local runs, start directly from the "fuzz" stage.
rm -rf "$repo_dir" ||:
mkdir "$repo_dir" ||:
git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$repo_dir" 2>&1 | ts '%Y-%m-%d %H:%M:%S'
(
cd "$repo_dir"
if [ "$PR_TO_TEST" != "0" ]; then
if git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/merge"; then
git checkout FETCH_HEAD
echo "Checked out pull/$PR_TO_TEST/merge ($(git rev-parse FETCH_HEAD))"
else
git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/head"
git checkout "$SHA_TO_TEST"
echo "Checked out nominal SHA $SHA_TO_TEST for PR $PR_TO_TEST"
fi
git diff --name-only master HEAD | tee ci-changed-files.txt
else
if [ -v COMMIT_SHA ]; then
git fetch --depth 2 origin "$SHA_TO_TEST"
git checkout "$SHA_TO_TEST"
echo "Checked out nominal SHA $SHA_TO_TEST for master"
else
git fetch --depth 2 origin
echo "Using default repository head $(git rev-parse HEAD)"
fi
git diff --name-only HEAD~1 HEAD | tee ci-changed-files.txt
fi
cd -
)
ls -lath ||:
}
function download
{
wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse" &
wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/ci-changed-files.txt" &
wait
wget -nv -nd -c "$BINARY_URL_TO_DOWNLOAD"
chmod +x clickhouse
ln -s ./clickhouse ./clickhouse-server
@ -113,7 +137,7 @@ function fuzz
# Obtain the list of newly added tests. They will be fuzzed in a more extreme way than other tests.
# Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment.
NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' ci-changed-files.txt | sort -R)"
NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' $repo_dir/ci-changed-files.txt | sort -R)"
# ci-changed-files.txt also contains files that have been deleted/renamed; filter them out.
NEW_TESTS="$(filter_exists_and_template $NEW_TESTS)"
if [[ -n "$NEW_TESTS" ]]

View File

@ -33,7 +33,7 @@ RUN apt-get update \
tzdata \
vim \
wget \
&& pip3 --no-cache-dir install 'git+https://github.com/mymarilyn/clickhouse-driver.git' scipy \
&& pip3 --no-cache-dir install 'clickhouse-driver==0.2.1' scipy \
&& apt-get purge --yes python3-dev g++ \
&& apt-get autoremove --yes \
&& apt-get clean \

View File

@ -196,7 +196,6 @@ function run_tests
test_files=$(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}")
elif [ "$PR_TO_TEST" -ne 0 ] \
&& [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \
&& [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ] \
&& [ "$(wc -l < other-changed-files.txt)" -eq 0 ]
then
# If only the perf tests were changed in the PR, we will run only these
@ -208,15 +207,15 @@ function run_tests
test_files=$(ls "$test_prefix"/*.xml)
fi
# For PRs w/o changes in test definitions and scripts, test only a subset of
# queries, and run them fewer times. If the corresponding environment variables
# are already set, keep those values.
if [ "$PR_TO_TEST" -ne 0 ] \
&& [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] \
&& [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ]
# For PRs w/o changes in test definitions, test only a subset of queries,
# and run them fewer times. If the corresponding environment variables are
# already set, keep those values.
#
# NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout.
if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ]
then
CHPC_RUNS=${CHPC_RUNS:-7}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-20}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10}
else
CHPC_RUNS=${CHPC_RUNS:-13}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
@ -319,14 +318,14 @@ function get_profiles
wait
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_log where type = 'QueryFinish' format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.metric_log format TSVWithNamesAndTypes" > left-metric-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.asynchronous_metric_log format TSVWithNamesAndTypes" > left-async-metric-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_log where type = 'QueryFinish' format TSVWithNamesAndTypes" > right-query-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > right-query-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > right-query-thread-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv ||: &

View File

@ -24,6 +24,13 @@
<!-- Don't fail some prewarm queries too early -->
<timeout_before_checking_execution_speed>60</timeout_before_checking_execution_speed>
<!-- Query profiler enabled only for prewarm queries explicitly (see perf.py)
This is needed for flamegraphs. -->
<query_profiler_real_time_period_ns>0</query_profiler_real_time_period_ns>
<query_profiler_cpu_time_period_ns>0</query_profiler_cpu_time_period_ns>
<!-- Disable memory profiler too, since due to max_untracked_memory some queries may add a trace entry and some may not -->
<memory_profiler_step>0</memory_profiler_step>
</default>
</profiles>
<users>

View File

@ -102,7 +102,6 @@ then
base=$(git -C right/ch merge-base pr origin/master)
git -C right/ch diff --name-only "$base" pr -- . | tee all-changed-files.txt
git -C right/ch diff --name-only "$base" pr -- tests/performance | tee changed-test-definitions.txt
git -C right/ch diff --name-only "$base" pr -- docker/test/performance-comparison | tee changed-test-scripts.txt
git -C right/ch diff --name-only "$base" pr -- :!tests/performance :!docker/test/performance-comparison | tee other-changed-files.txt
fi

View File

@ -283,8 +283,11 @@ for query_index in queries_to_run:
# test coverage. We disable profiler for normal runs because
# it makes the results unstable.
res = c.execute(q, query_id = prewarm_id,
settings = {'max_execution_time': args.max_query_seconds,
'query_profiler_real_time_period_ns': 10000000})
settings = {
'max_execution_time': args.max_query_seconds,
'query_profiler_real_time_period_ns': 10000000,
'memory_profiler_step': '4Mi',
})
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (prewarm_id, *e.args)

View File

@ -9,6 +9,7 @@ RUN apt-get update -y \
COPY s3downloader /s3downloader
ENV S3_URL="https://clickhouse-datasets.s3.yandex.net"
ENV DATASETS="hits visits"
COPY run.sh /

View File

@ -56,7 +56,7 @@ function start()
start
# shellcheck disable=SC2086 # No quotes because I want to split it into words.
/s3downloader --dataset-names $DATASETS
/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS
chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "SHOW DATABASES"
@ -109,7 +109,7 @@ function run_tests()
fi
set +e
clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \
clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --no-stateless --hung-check --print-time "${ADDITIONAL_OPTIONS[@]}" \
"$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt
set -e
}

View File

@ -97,7 +97,7 @@ function run_tests()
fi
set +e
clickhouse-test --testname --shard --zookeeper --hung-check --print-time \
clickhouse-test --testname --shard --zookeeper --check-zookeeper-session --hung-check --print-time \
--test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \
| ts '%Y-%m-%d %H:%M:%S' \
| tee -a test_output/test_result.txt

View File

@ -170,5 +170,7 @@ toc_title: Adopters
| <a href="https://cft.ru/" class="favicon">ЦФТ</a> | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) |
| <a href="https://promo.croc.ru/digitalworker" class="favicon">Цифровой Рабочий</a> | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| <a href="https://shop.okraina.ru/" class="favicon">ООО «МПЗ Богородский»</a> | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
| <a href="https://domclick.ru/" class="favicon">ДомКлик</a> | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
| <a href="https://www.deepl.com/" class="favicon">Deepl</a> | Machine Learning | — | — | — | [Video, October 2021](https://www.youtube.com/watch?v=WIYJiPwxXdM&t=1182s) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/) <!--hide-->

View File

@ -23,7 +23,7 @@ chmod a+x ./hardware.sh
./hardware.sh
```
3. Copy the output and send it to clickhouse-feedback@yandex-team.com
3. Copy the output and send it to feedback@clickhouse.com
All the results are published here: https://clickhouse.com/benchmark/hardware/

View File

@ -69,6 +69,8 @@ If no conditions met for a data part, ClickHouse uses the `lz4` compression.
</compression>
```
<!--
## encryption {#server-settings-encryption}
Configures a command to obtain a key to be used by [encryption codecs](../../sql-reference/statements/create/table.md#create-query-encryption-codecs). Key (or keys) should be written in environment variables or set in the configuration file.
@ -131,7 +133,7 @@ Also, users can add nonce that must be 12 bytes long (by default encryption and
```xml
<encryption_codecs>
<aes_128_gcm_siv>
<nonce>0123456789101</nonce>
<nonce>012345678910</nonce>
</aes_128_gcm_siv>
</encryption_codecs>
```
@ -148,6 +150,8 @@ Or it can be set in hex:
Everything mentioned above can be applied for `aes_256_gcm_siv` (but the key must be 32 bytes long).
-->
## custom_settings_prefixes {#custom_settings_prefixes}
List of prefixes for [custom settings](../../operations/settings/index.md#custom_settings). The prefixes must be separated with commas.
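For illustration, a hedged sketch of how a custom setting might be used once a prefix is configured; the prefix `custom_` and the setting name `custom_query_tag` are assumptions made for this example, not part of this page:
```sql
-- Assumes the server config lists `custom_` in <custom_settings_prefixes>.
SET custom_query_tag = 'experiment_1';        -- hypothetical custom setting name
SELECT getSetting('custom_query_tag') AS tag; -- read the value back
```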

View File

@ -1751,9 +1751,11 @@ Do not merge aggregation states from different servers for distributed query pro
Possible values:
- 0 — Disabled (final query processing is done on the initiator node).
- 1 - Do not merge aggregation states from different servers for distributed query processing (the query is processed completely on the shard, the initiator only proxies the data); can be used when it is certain that different shards hold different keys.
- 2 - Same as `1` but applies `ORDER BY` and `LIMIT` on the initiator (this is not possible when the query is processed completely on the remote node, as with `distributed_group_by_no_merge=1`); useful for queries with `ORDER BY` and/or `LIMIT`.
- `0` — Disabled (final query processing is done on the initiator node).
- `1` - Do not merge aggregation states from different servers for distributed query processing (the query is processed completely on the shard, the initiator only proxies the data); can be used when it is certain that different shards hold different keys.
- `2` - Same as `1` but applies `ORDER BY` and `LIMIT` on the initiator (this is not possible when the query is processed completely on the remote node, as with `distributed_group_by_no_merge=1`); useful for queries with `ORDER BY` and/or `LIMIT`.
Default value: `0`
**Example**
@ -1784,29 +1786,27 @@ FORMAT PrettyCompactMonoBlock
└───────┘
```
Default value: 0
## distributed_push_down_limit {#distributed-push-down-limit}
## distributed_push_down_limit {#distributed-push-down-limit}
LIMIT will be applied on each shard separately.
Enables or disables applying [LIMIT](#limit) on each shard separately.
This allows you to avoid:
- Sending extra rows over network;
- Processing rows behind the limit on the initiator.
- sending extra rows over network,
- processing rows behind the limit on the initiator.
It is possible if at least one of the following conditions is met:
- `distributed_group_by_no_merge` > 0
- query **does not have `GROUP BY`/`DISTINCT`/`LIMIT BY`**, but it has `ORDER BY`/`LIMIT`.
- query **has `GROUP BY`/`DISTINCT`/`LIMIT BY`** with `ORDER BY`/`LIMIT` and:
- `optimize_skip_unused_shards_limit` is enabled
- `optimize_distributed_group_by_sharding_key` is enabled
Starting from version 21.9 you can no longer get inaccurate results, since `distributed_push_down_limit` changes query execution only if at least one of the following conditions is met:
- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0.
- Query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`.
- Query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and:
- [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled.
- [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled.
Possible values:
- 0 - Disabled
- 1 - Enabled
- 0 — Disabled.
- 1 — Enabled.
Default value: `1`.
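As a non-authoritative sketch, a query that satisfies the conditions above (it has `ORDER BY`/`LIMIT` and no `GROUP BY`/`DISTINCT`/`LIMIT BY`), so each shard can apply the limit locally; the `Distributed` table `dist_hits` and the column `UserID` are assumed for the example:
```sql
-- Each shard applies the LIMIT locally, so fewer rows travel to the initiator.
SELECT UserID
FROM dist_hits
ORDER BY UserID
LIMIT 10
SETTINGS distributed_push_down_limit = 1;
```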
See also:
@ -1920,6 +1920,7 @@ Default value: 0
See also:
- [distributed_group_by_no_merge](#distributed-group-by-no-merge)
- [distributed_push_down_limit](#distributed-push-down-limit)
- [optimize_skip_unused_shards](#optimize-skip-unused-shards)
!!! note "Note"
@ -3831,6 +3832,21 @@ Default value: `0`.
- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting
## describe_include_subcolumns {#describe_include_subcolumns}
Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) or an [Array](../../sql-reference/data-types/array.md#array-size) data type.
Possible values:
- 0 — Subcolumns are not included in `DESCRIBE` queries.
- 1 — Subcolumns are included in `DESCRIBE` queries.
Default value: `0`.
**Example**
See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement.
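A minimal sketch of the effect, assuming a hypothetical table `t` with a `Tuple` column:
```sql
CREATE TABLE t (id UInt64, point Tuple(x Float64, y Float64)) ENGINE = Memory;
DESCRIBE TABLE t;                                          -- subcolumns are hidden
DESCRIBE TABLE t SETTINGS describe_include_subcolumns = 1; -- also lists point.x and point.y
```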
## async_insert {#async-insert}
Enables or disables asynchronous inserts. This makes sense only for insertion over the HTTP protocol. Note that deduplication does not work for such inserts.
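A hedged sketch of enabling it for a session; the table `t` is hypothetical, and over the HTTP interface the setting is typically passed as a query parameter such as `async_insert=1`:
```sql
SET async_insert = 1;          -- the server buffers small inserts and flushes them in batches
INSERT INTO t VALUES (1, 'a'); -- hypothetical table t(id UInt64, s String)
```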

View File

@ -2,13 +2,13 @@
toc_title: S2 Geometry
---
# Functions for Working with S2 Index {#s2Index}
# Functions for Working with S2 Index {#s2index}
[S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe).
In the S2 library points are represented as unit length vectors called S2 point indices (points on the surface of a three dimensional unit sphere) as opposed to traditional (latitude, longitude) pairs.
In the S2 library, points are represented as the S2 Index: a specific number which internally encodes a point on the surface of a unit sphere, unlike traditional (latitude, longitude) pairs. To get the S2 point index for a given point specified in the format (latitude, longitude), use the [geoToS2](#geotos2) function. Also, you can use the [s2ToGeo](#s2togeo) function for getting geographical coordinates corresponding to the specified S2 point index.
## geoToS2 {#geoToS2}
## geoToS2 {#geotos2}
Returns [S2](#s2index) point index corresponding to the provided coordinates `(longitude, latitude)`.
@ -34,7 +34,7 @@ Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
SELECT geoToS2(37.79506683, 55.71290588) as s2Index;
SELECT geoToS2(37.79506683, 55.71290588) AS s2Index;
```
Result:
@ -45,7 +45,7 @@ Result:
└─────────────────────┘
```
## s2ToGeo {#s2ToGeo}
## s2ToGeo {#s2togeo}
Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2index) point index.
@ -57,20 +57,20 @@ s2ToGeo(s2index)
**Arguments**
- `s2Index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- A tuple consisting of two values: `tuple(lon,lat)`.
Type: `lon` - [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
Type: `lon` [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
**Example**
Query:
``` sql
SELECT s2ToGeo(4704772434919038107) as s2Coodrinates;
SELECT s2ToGeo(4704772434919038107) AS s2Coodrinates;
```
Result:
@ -81,9 +81,9 @@ Result:
└──────────────────────────────────────┘
```
## s2GetNeighbors {#s2GetNeighbors}
## s2GetNeighbors {#s2getneighbors}
Returns S2 neighbor indices corresponding to the provided [S2](#s2index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors.
Returns S2 neighbor indexes corresponding to the provided [S2](#s2index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors.
**Syntax**
@ -97,16 +97,16 @@ s2GetNeighbors(s2index)
**Returned values**
- An array consisting of the 4 neighbor indices: `array[s2index1, s2index3, s2index2, s2index4]`.
- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`.
Type: Each S2 index is [UInt64](../../../sql-reference/data-types/int-uint.md).
Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
select s2GetNeighbors(5074766849661468672) AS s2Neighbors;
SELECT s2GetNeighbors(5074766849661468672) AS s2Neighbors;
```
Result:
@ -117,9 +117,9 @@ Result:
└───────────────────────────────────────────────────────────────────────────────────┘
```
## s2CellsIntersect {#s2CellsIntersect}
## s2CellsIntersect {#s2cellsintersect}
Determines if the two provided [S2](#s2index) cell indices intersect or not.
Determines if the two provided [S2](#s2index) cells intersect or not.
**Syntax**
@ -133,8 +133,8 @@ s2CellsIntersect(s2index1, s2index2)
**Returned values**
- 1 — If the S2 cell indices intersect.
- 0 — If the S2 cell indices don't intersect.
- 1 — If the cells intersect.
- 0 — If the cells don't intersect.
Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
@ -143,7 +143,7 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
select s2CellsIntersect(9926595209846587392, 9926594385212866560) as intersect;
SELECT s2CellsIntersect(9926595209846587392, 9926594385212866560) AS intersect;
```
Result:
@ -154,11 +154,9 @@ Result:
└───────────┘
```
## s2CapContains {#s2CapContains}
## s2CapContains {#s2capcontains}
A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
Determines if a cap contains a s2 point index.
Determines if a cap contains a S2 point. A cap represents a part of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
**Syntax**
@ -168,9 +166,9 @@ s2CapContains(center, degrees, point)
**Arguments**
- `center` - S2 point index corresponding to the cap. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `degrees` - Radius of the cap in degrees. [Float64](../../../sql-reference/data-types/float.md).
- `point` - S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `center` S2 point index corresponding to the cap. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `degrees` Radius of the cap in degrees. [Float64](../../../sql-reference/data-types/float.md).
- `point` S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
@ -184,7 +182,7 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
select s2CapContains(1157339245694594829, 1.0, 1157347770437378819) as capContains;
SELECT s2CapContains(1157339245694594829, 1.0, 1157347770437378819) AS capContains;
```
Result:
@ -195,11 +193,9 @@ Result:
└─────────────┘
```
## s2CapUnion {#s2CapUnion}
## s2CapUnion {#s2capunion}
A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
Determines the smallest cap that contains the given two input caps.
Determines the smallest cap that contains the given two input caps. A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
**Syntax**
@ -209,13 +205,13 @@ s2CapUnion(center1, radius1, center2, radius2)
**Arguments**
- `center1`, `center2` - S2 point indices corresponding to the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius1`, `radius2` - Radii of the two input caps in degrees. [Float64](../../../sql-reference/data-types/float.md).
- `center1`, `center2` — S2 point indexes corresponding to the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius1`, `radius2` — Radii of the two input caps in degrees. [Float64](../../../sql-reference/data-types/float.md).
**Returned values**
- `center` - S2 point index corresponding the center of the smallest cap containing the two input caps. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius` - Radius of the smallest cap containing the two input caps. Type: [Float64](../../../sql-reference/data-types/float.md).
- `center` S2 point index corresponding the center of the smallest cap containing the two input caps. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius` Radius of the smallest cap containing the two input caps. Type: [Float64](../../../sql-reference/data-types/float.md).
**Example**
@ -233,11 +229,9 @@ Result:
└────────────────────────────────────────┘
```
## s2RectAdd{#s2RectAdd}
## s2RectAdd {#s2rectadd}
In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
Increases the size of the bounding rectangle to include the given S2 point index.
Increases the size of the bounding rectangle to include the given S2 point. In the S2 system, a rectangle is represented by a type of S2Region called a `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@ -247,21 +241,21 @@ s2RectAdd(s2pointLow, s2pointHigh, s2Point)
**Arguments**
- `s2PointLow` - Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` - High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` - Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointLow` Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- `s2PointLow` - Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` - High S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md).
- `s2PointLow` Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` Hight S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md).
**Example**
Query:
``` sql
SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) as rectAdd;
SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) AS rectAdd;
```
Result:
@ -272,11 +266,9 @@ Result:
└───────────────────────────────────────────┘
```
## s2RectContains{#s2RectContains}
## s2RectContains {#s2rectcontains}
In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
Determines if a given rectangle contains a S2 point index.
Determines if a given rectangle contains a S2 point. In the S2 system, a rectangle is represented by a type of S2Region called a `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@ -286,9 +278,9 @@ s2RectContains(s2PointLow, s2PointHi, s2Point)
**Arguments**
- `s2PointLow` - Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` - High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` - Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointLow` Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
@ -300,7 +292,7 @@ s2RectContains(s2PointLow, s2PointHi, s2Point)
Query:
``` sql
SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains
SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains;
```
Result:
@ -311,11 +303,9 @@ Result:
└──────────────┘
```
## s2RectUnion{#s2RectUnion}
## s2RectUnion {#s2rectunion}
In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
Returns the smallest rectangle containing the union of this rectangle and the given rectangle.
Returns the smallest rectangle containing the union of this rectangle and the given rectangle. In the S2 system, a rectangle is represented by a type of S2Region called a `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@ -325,20 +315,20 @@ s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
**Arguments**
- `s2Rect1PointLow`, `s2Rect1PointHi` - Low and High S2 point indices corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` - Low and High S2 point indices corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- `s2UnionRect2PointLow` - Low S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` - High S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointLow` Low S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` High S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion
SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion;
```
Result:
@ -349,9 +339,9 @@ Result:
└───────────────────────────────────────────┘
```
## s2RectIntersection{#s2RectIntersection}
## s2RectIntersection {#s2rectintersection}
Returns the smallest Rectangle containing the intersection of this rectangle and the given rectangle.
Returns the smallest rectangle containing the intersection of this rectangle and the given rectangle. In the S2 system, a rectangle is represented by a type of S2Region called a `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@ -361,20 +351,20 @@ s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2Poin
**Arguments**
- `s2Rect1PointLow`, `s2Rect1PointHi` - Low and High S2 point indices corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` - Low and High S2 point indices corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- `s2UnionRect2PointLow` - Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` - High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointLow` Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection
SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection;
```
Result:

View File

@ -165,9 +165,6 @@ Result:
## mapPopulateSeries {#function-mappopulateseries}
Fills missing keys in the maps (key and value array pair), where keys are integers. Also, it supports specifying the max key, which is used to extend the keys array.
Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represents keys, and the second array contains values for each key.
For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Syntax**
@ -178,12 +175,17 @@ mapPopulateSeries(map[, max])
Generates a map (a tuple with two arrays or a value of `Map` type, depending on the arguments), where keys are a series of numbers, from the minimum to the maximum key (or the `max` argument, if specified), taken from the map with a step size of one, and the corresponding values. If no value is specified for a key, the default value is used in the resulting map. For repeated keys, only the first value (in order of appearance) gets associated with the key.
For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Arguments**
Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represents keys, and the second array contains values for each key.
Mapped arrays:
- `keys` — Array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
- `values` — Array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../../sql-reference/data-types/int-uint.md#int-ranges).
or
@ -198,7 +200,7 @@ or
Query with mapped arrays:
```sql
select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type;
SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
```
Result:
@ -390,5 +392,43 @@ Result:
└─────────────────────────────┘
```
## mapExtractKeyLike {#mapExtractKeyLike}
**Syntax**
```sql
mapExtractKeyLike(map, pattern)
```
**Parameters**
- `map` — Map. [Map](../../sql-reference/data-types/map.md).
- `pattern` - String pattern to match.
**Returned value**
- A map containing the elements whose keys match the specified pattern. If no elements match the pattern, an empty map is returned.
**Example**
Query:
```sql
CREATE TABLE test (a Map(String,String)) ENGINE = Memory;
INSERT INTO test VALUES ({'abc':'abc','def':'def'}), ({'hij':'hij','klm':'klm'});
SELECT mapExtractKeyLike(a, 'a%') FROM test;
```
Result:
```text
┌─mapExtractKeyLike(a, 'a%')─┐
│ {'abc':'abc'} │
│ {} │
└────────────────────────────┘
```
[Original article](https://clickhouse.com/docs/en/sql-reference/functions/tuple-map-functions/) <!--hide-->

View File

@ -3,18 +3,67 @@ toc_priority: 42
toc_title: DESCRIBE
---
# DESCRIBE TABLE Statement {#misc-describe-table}
# DESCRIBE TABLE {#misc-describe-table}
Returns information about table columns.
**Syntax**
``` sql
DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]
```
Returns the following `String` type columns:
The `DESCRIBE` statement returns a row for each table column with the following [String](../../sql-reference/data-types/string.md) values:
- `name` — Column name.
- `type`— Column type.
- `default_type` — Clause that is used in [default expression](../../sql-reference/statements/create/table.md#create-default-values) (`DEFAULT`, `MATERIALIZED` or `ALIAS`). The column contains an empty string if the default expression isn't specified.
- `default_expression` — Value specified in the `DEFAULT` clause.
- `comment_expression` — Comment text.
- `name` — A column name.
- `type` — A column type.
- `default_type` — A clause that is used in the column [default expression](../../sql-reference/statements/create/table.md#create-default-values): `DEFAULT`, `MATERIALIZED` or `ALIAS`. If there is no default expression, then an empty string is returned.
- `default_expression` — An expression specified after the `DEFAULT` clause.
- `comment` — A [column comment](../../sql-reference/statements/alter/column.md#alter_comment-column).
- `codec_expression` — A [codec](../../sql-reference/statements/create/table.md#codecs) that is applied to the column.
- `ttl_expression` — A [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) expression.
- `is_subcolumn` — A flag that equals `1` for internal subcolumns. It is included into the result only if subcolumn description is enabled by the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
Nested data structures are output in “expanded” format. Each column is shown separately, with the name after a dot.
All columns in [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) data structures are described separately. The name of each column is prefixed with a parent column name and a dot.
To show internal subcolumns of other data types, use the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
**Example**
Query:
``` sql
CREATE TABLE describe_example (
id UInt64, text String DEFAULT 'unknown' CODEC(ZSTD),
user Tuple (name String, age UInt8)
) ENGINE = MergeTree() ORDER BY id;
DESCRIBE TABLE describe_example;
DESCRIBE TABLE describe_example SETTINGS describe_include_subcolumns=1;
```
Result:
``` text
┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ id │ UInt64 │ │ │ │ │ │
│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │
│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │
└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
The second query additionally shows subcolumns:
``` text
┌─name──────┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┬─is_subcolumn─┐
│ id │ UInt64 │ │ │ │ │ │ 0 │
│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │ 0 │
│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │ 0 │
│ user.name │ String │ │ │ │ │ │ 1 │
│ user.age │ UInt8 │ │ │ │ │ │ 1 │
└───────────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┴──────────────┘
```
**See Also**
- [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.

View File

@ -1705,6 +1705,32 @@ ClickHouse генерирует исключение
Значение по умолчанию: 0.
## distributed_push_down_limit {#distributed-push-down-limit}
Включает или отключает [LIMIT](#limit), применяемый к каждому шарду по отдельности.
Это позволяет избежать:
- отправки дополнительных строк по сети;
- обработки строк за пределами ограничения для инициатора.
Начиная с версии 21.9 вы больше не сможете получить неточные результаты, так как `distributed_push_down_limit` изменяет выполнение запроса только в том случае, если выполнено хотя бы одно из условий:
- `distributed_group_by_no_merge` > 0.
- запрос **не содержит** `GROUP BY`/`DISTINCT`/`LIMIT BY`, но содержит `ORDER BY`/`LIMIT`.
- запрос **содержит** `GROUP BY`/`DISTINCT`/`LIMIT BY` с `ORDER BY`/`LIMIT` и:
- включена настройка [optimize_skip_unused_shards](#optimize-skip-unused-shards).
- включена настройка `optimize_distributed_group_by_sharding_key`.
Возможные значения:
- 0 — выключена.
- 1 — включена.
Значение по умолчанию: `1`.
См. также:
- [optimize_skip_unused_shards](#optimize-skip-unused-shards)
## optimize_skip_unused_shards {#optimize-skip-unused-shards}
Включает или отключает пропуск неиспользуемых шардов для запросов [SELECT](../../sql-reference/statements/select/index.md) , в которых условие ключа шардирования задано в секции `WHERE/PREWHERE`. Предполагается, что данные распределены с помощью ключа шардирования, в противном случае запрос выдаст неверный результат.
@ -3641,6 +3667,21 @@ SELECT * FROM positional_arguments ORDER BY 2,3;
- настройка [optimize_move_to_prewhere](#optimize_move_to_prewhere)
## describe_include_subcolumns {#describe_include_subcolumns}
Включает или отключает описание подстолбцов при выполнении запроса [DESCRIBE](../../sql-reference/statements/describe-table.md). Настройка действует, например, на элементы [Tuple](../../sql-reference/data-types/tuple.md) или подстолбцы типов [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) или [Array](../../sql-reference/data-types/array.md#array-size).
Возможные значения:
- 0 — подстолбцы не включаются в результат запросов `DESCRIBE`.
- 1 — подстолбцы включаются в результат запросов `DESCRIBE`.
Значение по умолчанию: `0`.
**Пример**
Смотрите пример запроса [DESCRIBE](../../sql-reference/statements/describe-table.md).
## async_insert {#async-insert}
Включает или отключает асинхронные вставки. Работает только для вставок по протоколу HTTP. Обратите внимание, что при таких вставках дедупликация не производится.

View File

@ -0,0 +1,376 @@
---
toc_title: "Функции для работы с индексами S2"
---
# Функции для работы с индексами S2 {#s2index}
[S2](https://s2geometry.io/) — это система геокодирования, в которой все географические данные представлены на трехмерной сфере (аналогично глобусу).
В библиотеке S2 точки представлены в виде индекса S2 — определенного числа, которое внутренне кодирует точку на поверхности трехмерной единичной сферы, в отличие от традиционных пар (широта, долгота). Чтобы получить индекс S2 для точки, заданной в формате (широта, долгота), используйте функцию [geoToS2](#geotos2). Также вы можете использовать функцию [s2ToGeo](#s2togeo) для получения географических координат, соответствующих заданному S2 индексу точки.
## geoToS2 {#geotos2}
Возвращает [S2](#s2index) индекс точки, соответствующий заданным координатам в формате `(долгота, широта)`.
**Синтаксис**
``` sql
geoToS2(lon, lat)
```
**Аргументы**
- `lon` — долгота. [Float64](../../../sql-reference/data-types/float.md).
- `lat` — широта. [Float64](../../../sql-reference/data-types/float.md).
**Возвращаемое значение**
- S2 индекс точки.
Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT geoToS2(37.79506683, 55.71290588) AS s2Index;
```
Результат:
``` text
┌─────────────s2Index─┐
│ 4704772434919038107 │
└─────────────────────┘
```
## s2ToGeo {#s2togeo}
Возвращает географические координаты `(долгота, широта)`, соответствующие заданному [S2](#s2index) индексу точки.
**Синтаксис**
``` sql
s2ToGeo(s2index)
```
**Аргументы**
- `s2index` — [S2](#s2index) индекс. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- Кортеж из двух значений: `tuple(lon,lat)`.
Тип: `lon` — [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
**Пример**
Запрос:
``` sql
SELECT s2ToGeo(4704772434919038107) AS s2Coodrinates;
```
Результат:
``` text
┌─s2Coodrinates────────────────────────┐
│ (37.79506681471008,55.7129059052841) │
└──────────────────────────────────────┘
```
## s2GetNeighbors {#s2getneighbors}
Возвращает [S2](#s2index) индексы ячеек, которые являются соседними для заданного S2 индекса. Ячейка в системе S2 представляет собой прямоугольник, ограниченный четырьмя сторонами. Соответственно, у каждой ячейки есть 4 соседние ячейки.
**Синтаксис**
``` sql
s2GetNeighbors(s2index)
```
**Аргументы**
- `s2index` — [S2](#s2index) индекс. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- Массив, содержащий 4 значения — S2 индекса соседних ячеек: `array[s2index1, s2index3, s2index2, s2index4]`.
Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT s2GetNeighbors(5074766849661468672) AS s2Neighbors;
```
Результат:
``` text
┌─s2Neighbors───────────────────────────────────────────────────────────────────────┐
│ [5074766987100422144,5074766712222515200,5074767536856236032,5074767261978329088] │
└───────────────────────────────────────────────────────────────────────────────────┘
```
## s2CellsIntersect {#s2cellsintersect}
Проверяет, пересекаются ли две заданные ячейки или нет.
**Синтаксис**
``` sql
s2CellsIntersect(s2index1, s2index2)
```
**Аргументы**
- `s2index1`, `s2index2` — S2 индексы первой и второй ячейки. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- 1 — ячейки пересекаются.
- 0 — ячейки не пересекаются.
Тип: [UInt8](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT s2CellsIntersect(9926595209846587392, 9926594385212866560) AS intersect;
```
Результат:
``` text
┌─intersect─┐
│ 1 │
└───────────┘
```
## s2CapContains {#s2capcontains}
Определяет, содержит ли заданный купол указанную точку. Купол представляет собой часть сферы, которая была отрезана плоскостью. Купол задается точкой на сфере и радиусом в градусах.
**Синтаксис**
``` sql
s2CapContains(center, degrees, point)
```
**Аргументы**
- `center` — S2 индекс точки, определяющей центр купола. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `degrees` — радиус купола в градусах. [Float64](../../../sql-reference/data-types/float.md).
- `point` — S2 индекс проверяемой точки. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- 1 — купол содержит точку.
- 0 — купол не содержит точку.
Тип: [UInt8](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT s2CapContains(1157339245694594829, 1.0, 1157347770437378819) AS capContains;
```
Результат:
``` text
┌─capContains─┐
│ 1 │
└─────────────┘
```
## s2CapUnion {#s2capunion}
Определяет наименьший купол, содержащий два заданных купола. Купол представляет собой часть сферы, которая была отрезана плоскостью. Купол задается точкой на сфере и радиусом в градусах.
**Синтаксис**
``` sql
s2CapUnion(center1, radius1, center2, radius2)
```
**Аргументы**
- `center1`, `center2` — S2 индексы точек, определяющие два центра куполов. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius1`, `radius2` — значения радиусов в градусах, определяющие два радиуса куполов. [Float64](../../../sql-reference/data-types/float.md).
**Возвращаемые значения**
- `center` — S2 индекс точки, соответствующий центру наименьшего купола, содержащего заданные купола. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `radius` — радиус в градусах наименьшего купола, содержащего заданные купола. Тип: [Float64](../../../sql-reference/data-types/float.md).
**Пример**
Запрос:
``` sql
SELECT s2CapUnion(3814912406305146967, 1.0, 1157347770437378819, 1.0) AS capUnion;
```
Результат:
``` text
┌─capUnion───────────────────────────────┐
│ (4534655147792050737,60.2088283994957) │
└────────────────────────────────────────┘
```
## s2RectAdd {#s2rectadd}
Увеличивает размер ограничивающего прямоугольника, чтобы включить в себя точку, заданную S2 индексом. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
**Синтаксис**
``` sql
s2RectAdd(s2pointLow, s2pointHigh, s2Point)
```
**Аргументы**
- `s2PointLow` — S2 индекс нижней точки, которая задает ограничивающий прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` — S2 индекс верхней точки, которая задает ограничивающий прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` — S2 индекс целевой точки, которая будет содержаться увеличенным ограничивающим прямоугольником. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- `s2PointLow` — идентификатор нижней S2 ячейки, соответствующий увеличенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` — идентификатор верхней S2 ячейки, соответствующий увеличенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/float.md).
**Пример**
Запрос:
``` sql
SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) AS rectAdd;
```
Результат:
``` text
┌─rectAdd───────────────────────────────────┐
│ (5179062030687166815,5177056748191934217) │
└───────────────────────────────────────────┘
```
## s2RectContains {#s2rectcontains}
Проверяет, содержит ли заданный прямоугольник указанную S2 точку. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
**Синтаксис**
``` sql
s2RectContains(s2PointLow, s2PointHi, s2Point)
```
**Аргументы**
- `s2PointLow` — S2 индекс самой низкой точки, которая задает прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2PointHigh` — S2 индекс самой высокой точки, которая задает прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Point` — S2 индекс проверяемой точки. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- 1 — прямоугольник содержит заданную точку.
- 0 — прямоугольник не содержит заданную точку.
**Пример**
Запрос:
``` sql
SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains;
```
Результат:
``` text
┌─rectContains─┐
│ 0 │
└──────────────┘
```
## s2RectUnion {#s2rectunion}
Возвращает наименьший прямоугольник, содержащий объединение двух заданных прямоугольников. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
**Синтаксис**
``` sql
s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
```
**Аргументы**
- `s2Rect1PointLow`, `s2Rect1PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают первый прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают второй прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- `s2UnionRect2PointLow` — идентификатор нижней ячейки, соответствующей объединенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` — идентификатор верхней ячейки, соответствующей объединенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion;
```
Результат:
``` text
┌─rectUnion─────────────────────────────────┐
│ (5179062030687166815,5177056748191934217) │
└───────────────────────────────────────────┘
```
## s2RectIntersection {#s2rectintersection}
Возвращает наименьший прямоугольник, содержащий пересечение двух заданных прямоугольников. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
**Синтаксис**
``` sql
s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
```
**Аргументы**
- `s2Rect1PointLow`, `s2Rect1PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают первый прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2Rect2PointLow`, `s2Rect2PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают второй прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Возвращаемые значения**
- `s2UnionRect2PointLow` — идентификатор нижней ячейки, соответствующей результирующему прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
- `s2UnionRect2PointHi` — идентификатор верхней ячейки, соответствующей результирующему прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Пример**
Запрос:
``` sql
SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection;
```
Result:
``` text
┌─rectIntersection──────────────────────────┐
│ (5178914411069187297,5177056748191934217) │
└───────────────────────────────────────────┘
```

View File

@ -108,7 +108,7 @@ SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTy
SELECT mapAdd(map(1,1), map(1,1));
```
Result:
```text
┌─mapAdd(map(1, 1), map(1, 1))─┐
@ -128,13 +128,13 @@ mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...])
**Arguments**
Arguments are [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the elements of the first array represent the keys and the second array contains the values for each key.
Arguments are [Map](../../sql-reference/data-types/map.md) containers or [tuples](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the elements of the first array represent the keys and the second array contains the values for each key.
All key arrays must have the same type, and all value arrays must contain elements that can be cast to a single type ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) or [Float64](../../sql-reference/data-types/float.md#float32-float64)).
The common cast type is used as the type of the resulting array.
**Returned value**
- Returns one [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second one contains the values.
- Depending on the arguments, returns one [Map](../../sql-reference/data-types/map.md) or [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), where the first array contains the sorted keys and the second one contains the values.
**Example**
@ -152,6 +152,20 @@ SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt3
└────────────────┴───────────────────────────────────┘
```
Query with a `Map` container:
```sql
SELECT mapSubtract(map(1,1), map(1,1));
```
Result:
```text
┌─mapSubtract(map(1, 1), map(1, 1))─┐
│ {1:0} │
└───────────────────────────────────┘
```
## mapPopulateSeries {#function-mappopulateseries}
Fills in missing keys in a map container (a pair of arrays of keys and values), where the keys are integers. It also supports specifying a maximum key, which is used to extend the keys array.
@ -160,6 +174,7 @@ SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt3
``` sql
mapPopulateSeries(keys, values[, max])
mapPopulateSeries(map[, max])
```
Generates a map where the keys are a series of numbers from the minimum key to the maximum key (or the `max` argument, if specified) taken from the `keys` array with a step size of one, with the corresponding values taken from the `values` array. If no value is specified for a key, the default value is used in the resulting map.
@ -168,19 +183,28 @@ mapPopulateSeries(keys, values[, max])
**Arguments**
- `keys` — array of keys [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
Arguments are a [Map](../../sql-reference/data-types/map.md) container or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represents the keys and the second array contains the values for each key.
Mapped arrays:
- `keys` — array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
- `values` — array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
- `max` — maximum key value. Optional parameter. [Int8, Int16, Int32, Int64, Int128, Int256](../../sql-reference/data-types/int-uint.md#int-ranges).
or
- `map` — a `Map` container with integer keys. [Map](../../sql-reference/data-types/map.md).
**Returned value**
- Returns a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order and the values for the corresponding keys.
- Depending on the arguments, returns a [Map](../../sql-reference/data-types/map.md) container or a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order and the values for the corresponding keys.
**Example**
Query:
Query with mapped arrays:
```sql
select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type;
SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
```
Result:
@ -191,6 +215,20 @@ select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type
└──────────────────────────────┴───────────────────────────────────┘
```
Query with a `Map` container:
```sql
SELECT mapPopulateSeries(map(1, 10, 5, 20), 6);
```
Result:
```text
┌─mapPopulateSeries(map(1, 10, 5, 20), 6)─┐
│ {1:10,2:0,3:0,4:0,5:20,6:0} │
└─────────────────────────────────────────┘
```
## mapContains {#mapcontains}
Checks whether the `map` container contains the key `key`.
@ -319,4 +357,3 @@ SELECT mapValues(a) FROM test;
│ ['twelve','6.0'] │
└──────────────────┘
```

View File

@ -3,21 +3,66 @@ toc_priority: 42
toc_title: DESCRIBE
---
# DESCRIBE TABLE Statement {#misc-describe-table}
# DESCRIBE TABLE {#misc-describe-table}
Returns a description of the table columns.
**Syntax**
``` sql
DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]
```
Returns a description of the table columns.
For each table column, the `DESCRIBE` query returns a row with the following values of type [String](../../sql-reference/data-types/string.md):
The query result contains columns (all columns are of type String):
- `name` — the name of the table column;
- `name` — the column name;
- `type` — the column type;
- `default_type` — how the [default value expression](../../sql-reference/statements/create/table.md#create-default-values) is specified: `DEFAULT`, `MATERIALIZED` or `ALIAS`. The column contains an empty string if no default value is specified.
- `default_type` — the kind of [default value expression](../../sql-reference/statements/create/table.md#create-default-values): `DEFAULT`, `MATERIALIZED` or `ALIAS`. If no default value is specified, an empty string is returned;
- `default_expression` — the value specified in the `DEFAULT` clause;
- `comment_expression` — the column comment.
- `comment` — the [comment](../../sql-reference/statements/alter/column.md#alter_comment-column);
- `codec_expression` — the [codec](../../sql-reference/statements/create/table.md#codecs) applied to the column;
- `ttl_expression` — the [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) expression;
- `is_subcolumn` — a flag equal to `1` for internal subcolumns. It appears in the result only if describing subcolumns is enabled by the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
Nested data structures are output in an "expanded" form, i.e. each column separately, with the name joined by a dot.
Each column of a [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structure is described separately. Its name is prefixed with the parent column name and a dot.
To show internal subcolumns of other data types, enable the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
**Example**
Query:
``` sql
CREATE TABLE describe_example (
id UInt64, text String DEFAULT 'unknown' CODEC(ZSTD),
user Tuple (name String, age UInt8)
) ENGINE = MergeTree() ORDER BY id;
DESCRIBE TABLE describe_example;
DESCRIBE TABLE describe_example SETTINGS describe_include_subcolumns=1;
```
Result:
``` text
┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
│ id │ UInt64 │ │ │ │ │ │
│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │
│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │
└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
```
The second query additionally shows information about subcolumns:
``` text
┌─name──────┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┬─is_subcolumn─┐
│ id │ UInt64 │ │ │ │ │ │ 0 │
│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │ 0 │
│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │ 0 │
│ user.name │ String │ │ │ │ │ │ 1 │
│ user.age │ UInt8 │ │ │ │ │ │ 1 │
└───────────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┴──────────────┘
```
**See Also**
- the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.

View File

@ -258,6 +258,7 @@ Pipe LibraryBridgeHelper::loadBase(const Poco::URI & uri, ReadWriteBufferFromHTT
0,
Poco::Net::HTTPBasicCredentials{},
DBMS_DEFAULT_BUFFER_SIZE,
getContext()->getReadSettings(),
ReadWriteBufferFromHTTP::HTTPHeaderEntries{});
auto source = FormatFactory::instance().getInput(LibraryBridgeHelper::DEFAULT_FORMAT, *read_buf_ptr, sample_block, getContext(), DEFAULT_BLOCK_SIZE);

View File

@ -77,6 +77,7 @@ add_headers_and_sources(clickhouse_common_io IO)
add_headers_and_sources(clickhouse_common_io IO/S3)
list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)
add_headers_and_sources(dbms Disks/IO)
if (USE_SQLITE)
add_headers_and_sources(dbms Databases/SQLite)
endif()

View File

@ -250,6 +250,17 @@
M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\
M(RemoteFSReadMicroseconds, "Time of reading from remote filesystem.") \
M(RemoteFSReadBytes, "Read bytes from remote filesystem.") \
\
M(RemoteFSSeeks, "Total number of seeks for async buffer") \
M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem") \
M(RemoteFSCancelledPrefetches, "Number of cancelled prefetches (because of seek)") \
M(RemoteFSUnusedPrefetches, "Number of prefetches pending at buffer destruction") \
M(RemoteFSPrefetchedReads, "Number of reads from prefetched buffer") \
M(RemoteFSUnprefetchedReads, "Number of reads from unprefetched buffer") \
M(RemoteFSBuffers, "Number of buffers created for asynchronous reading from remote filesystem") \
\
M(SleepFunctionCalls, "Number of times a sleep function (sleep, sleepEachRow) has been called.") \
M(SleepFunctionMicroseconds, "Time spent sleeping due to a sleep function call.") \
\

View File

@ -28,6 +28,12 @@ void CachedCompressedReadBuffer::initInput()
}
void CachedCompressedReadBuffer::prefetch()
{
file_in->prefetch();
}
bool CachedCompressedReadBuffer::nextImpl()
{
/// Let's check for the presence of a decompressed block in the cache, grab the ownership of this block, if it exists.

View File

@ -33,8 +33,11 @@ private:
UncompressedCache::MappedPtr owned_cell;
void initInput();
bool nextImpl() override;
void prefetch() override;
/// Passed into file_in.
ReadBufferFromFileBase::ProfileCallback profile_callback;
clockid_t clock_type {};
@ -55,6 +58,18 @@ public:
profile_callback = profile_callback_;
clock_type = clock_type_;
}
void setReadUntilPosition(size_t position) override
{
if (file_in)
file_in->setReadUntilPosition(position);
}
void setReadUntilEnd() override
{
if (file_in)
file_in->setReadUntilEnd();
}
};
}

View File

@ -44,12 +44,6 @@ bool CompressedReadBufferFromFile::nextImpl()
}
void CompressedReadBufferFromFile::prefetch()
{
file_in.prefetch();
}
CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr<ReadBufferFromFileBase> buf, bool allow_different_codecs_)
: BufferWithOwnMemory<ReadBuffer>(0), p_file_in(std::move(buf)), file_in(*p_file_in)
{
@ -58,6 +52,12 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr<ReadB
}
void CompressedReadBufferFromFile::prefetch()
{
file_in.prefetch();
}
void CompressedReadBufferFromFile::seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block)
{
/// Nothing to do if we already at required position

View File

@ -42,10 +42,11 @@ private:
/* size_t nextimpl_working_buffer_offset; */
bool nextImpl() override;
void prefetch() override;
public:
CompressedReadBufferFromFile(std::unique_ptr<ReadBufferFromFileBase> buf, bool allow_different_codecs_ = false);
explicit CompressedReadBufferFromFile(std::unique_ptr<ReadBufferFromFileBase> buf, bool allow_different_codecs_ = false);
/// Seek is lazy in some sense. We move position in compressed file_in to offset_in_compressed_file, but don't
/// read data into working_buffer and don't shift our position to offset_in_decompressed_block. Instead
@ -58,6 +59,10 @@ public:
{
file_in.setProfileCallback(profile_callback_, clock_type_);
}
void setReadUntilPosition(size_t position) override { file_in.setReadUntilPosition(position); }
void setReadUntilEnd() override { file_in.setReadUntilEnd(); }
};
}

View File

@ -1,3 +1,4 @@
#include <string_view>
#include <Common/config.h>
#include <Common/Exception.h>
#include <base/types.h>
@ -84,6 +85,8 @@ namespace
constexpr size_t tag_size = 16; /// AES-GCM-SIV always uses a tag of 16 bytes length
constexpr size_t key_id_max_size = 8; /// Max size of varint.
constexpr size_t nonce_max_size = 13; /// Nonce size and one byte to show if the nonce is in the text
constexpr size_t actual_nonce_size = 12; /// Nonce actual size
const String empty_nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", actual_nonce_size};
/// Get encryption/decryption algorithms.
auto getMethod(EncryptionMethod Method)
@ -137,7 +140,7 @@ size_t encrypt(const std::string_view & plaintext, char * ciphertext_and_tag, En
EVP_AEAD_CTX_zero(&encrypt_ctx);
const int ok_init = EVP_AEAD_CTX_init(&encrypt_ctx, getMethod(method)(),
reinterpret_cast<const uint8_t*>(key.data()), key.size(),
16 /* tag size */, nullptr);
tag_size, nullptr);
if (!ok_init)
throw Exception(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
@ -145,7 +148,7 @@ size_t encrypt(const std::string_view & plaintext, char * ciphertext_and_tag, En
size_t out_len;
const int ok_open = EVP_AEAD_CTX_seal(&encrypt_ctx,
reinterpret_cast<uint8_t *>(ciphertext_and_tag),
&out_len, plaintext.size() + 16,
&out_len, plaintext.size() + tag_size,
reinterpret_cast<const uint8_t *>(nonce.data()), nonce.size(),
reinterpret_cast<const uint8_t *>(plaintext.data()), plaintext.size(),
nullptr, 0);
@ -167,7 +170,7 @@ size_t decrypt(const std::string_view & ciphertext, char * plaintext, Encryption
const int ok_init = EVP_AEAD_CTX_init(&decrypt_ctx, getMethod(method)(),
reinterpret_cast<const uint8_t*>(key.data()), key.size(),
16 /* tag size */, nullptr);
tag_size, nullptr);
if (!ok_init)
throw Exception(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
@ -221,7 +224,7 @@ inline char* writeNonce(const String& nonce, char* dest)
{
/// If nonce consists of nul bytes, it shouldn't be in dest. Zero byte is the only byte that should be written.
/// Otherwise, 1 is written and data from nonce is copied
if (nonce != String("\0\0\0\0\0\0\0\0\0\0\0\0", 12))
if (nonce != empty_nonce)
{
*dest = 1;
++dest;
@ -246,15 +249,15 @@ inline const char* readNonce(String& nonce, const char* source)
/// If first is zero byte: move source and set zero-bytes nonce
if (!*source)
{
nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", 12};
nonce = empty_nonce;
return ++source;
}
/// Move to next byte. Nonce will begin from there
++source;
/// Otherwise, use data from source in nonce
nonce = {source, 12};
source += 12;
nonce = {source, actual_nonce_size};
source += actual_nonce_size;
return source;
}
@ -332,14 +335,14 @@ void CompressionCodecEncrypted::Configuration::loadImpl(
if (!new_params->keys_storage[method].contains(new_params->current_key_id[method]))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not found a key with the current ID {}", new_params->current_key_id[method]);
/// Read nonce (in hex or in string). Its length should be 12 bytes.
/// Read nonce (in hex or in string). Its length should be 12 bytes (actual_nonce_size).
if (config.has(config_prefix + ".nonce_hex"))
new_params->nonce[method] = unhexKey(config.getString(config_prefix + ".nonce_hex"));
else
new_params->nonce[method] = config.getString(config_prefix + ".nonce", "");
if (new_params->nonce[method].size() != 12 && !new_params->nonce[method].empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got nonce with unexpected size {}, the size should be 12", new_params->nonce[method].size());
if (new_params->nonce[method].size() != actual_nonce_size && !new_params->nonce[method].empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got nonce with unexpected size {}, the size should be {}", new_params->nonce[method].size(), actual_nonce_size);
}
bool CompressionCodecEncrypted::Configuration::tryLoad(const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
@ -399,7 +402,7 @@ void CompressionCodecEncrypted::Configuration::getCurrentKeyAndNonce(EncryptionM
/// This will lead to data loss.
nonce = current_params->nonce[method];
if (nonce.empty())
nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", 12};
nonce = empty_nonce;
}
String CompressionCodecEncrypted::Configuration::getKey(EncryptionMethod method, const UInt64 & key_id) const
@ -448,8 +451,10 @@ UInt32 CompressionCodecEncrypted::getMaxCompressedDataSize(UInt32 uncompressed_s
UInt32 CompressionCodecEncrypted::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
// Generate an IV out of the data block and the key-generation
// key. It is completely deterministic, but does not leak any
// Nonce, key and plaintext will be used to generate authentication tag
// and message encryption key. AES-GCM-SIV authenticates the encoded additional data and plaintext.
// For this purpose message_authentication_key is used.
// Algorithm is completely deterministic, but does not leak any
// information about the data block except for equivalence of
// identical blocks (under the same key).
@ -470,8 +475,7 @@ UInt32 CompressionCodecEncrypted::doCompressData(const char * source, UInt32 sou
char* ciphertext = writeNonce(nonce, ciphertext_with_nonce);
UInt64 nonce_size = ciphertext - ciphertext_with_nonce;
// The IV will be used as an authentication tag. The ciphertext and the
// tag will be written directly in the dest buffer.
// The ciphertext and the authentication tag will be written directly in the dest buffer.
size_t out_len = encrypt(plaintext, ciphertext, encryption_method, current_key, nonce);
/// Length of encrypted text should be equal to text length plus tag_size (which was added by algorithm).
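For context, the `writeNonce`/`readNonce` pair above frames the nonce with a one-byte flag: a single zero byte stands for the all-zero nonce, otherwise a `1` byte is followed by the 12 nonce bytes. A minimal standalone sketch of that framing; the names `writeNonceSketch`/`readNonceSketch` and the `main` driver are illustrative, not part of this diff:
``` cpp
#include <cassert>
#include <cstring>
#include <string>

constexpr size_t nonce_size = 12;
const std::string zero_nonce(nonce_size, '\0');

/// Frame the nonce: a single 0 byte for the all-zero nonce, otherwise 1 followed by 12 nonce bytes.
char * writeNonceSketch(const std::string & nonce, char * dest)
{
    if (nonce != zero_nonce)
    {
        *dest++ = 1;
        std::memcpy(dest, nonce.data(), nonce_size);
        return dest + nonce_size;
    }
    *dest++ = 0;
    return dest;
}

/// Inverse of writeNonceSketch: restore the nonce and return the advanced source pointer.
const char * readNonceSketch(std::string & nonce, const char * source)
{
    if (!*source)
    {
        nonce = zero_nonce;
        return source + 1;
    }
    ++source;
    nonce.assign(source, nonce_size);
    return source + nonce_size;
}

int main()
{
    char framed[1 + nonce_size];
    std::string in = "abcdefghijkl", out;   /// exactly 12 bytes
    writeNonceSketch(in, framed);
    readNonceSketch(out, framed);
    assert(in == out);
}
```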

View File

@ -75,7 +75,6 @@ class IColumn;
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \
M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
M(UInt64, http_max_single_read_retries, 4, "The maximum number of retries during single http read.", 0) \
M(UInt64, hsts_max_age, 0, "Expired time for hsts. 0 means disable HSTS.", 0) \
M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \
M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \
@ -508,6 +507,7 @@ class IColumn;
M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. Possible values: 'enable' - use short-circuit function evaluation for functions that are suitable for it, 'disable' - disable short-circuit function evaluation, 'force_enable' - use short-circuit function evaluation for all functions.", 0) \
\
M(String, local_filesystem_read_method, "pread", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \
M(String, remote_filesystem_read_method, "read", "Method of reading data from remote filesystem, one of: read, read_threadpool.", 0) \
M(Bool, local_filesystem_read_prefetch, false, "Should use prefetching when reading data from local filesystem.", 0) \
M(Bool, remote_filesystem_read_prefetch, true, "Should use prefetching when reading data from remote filesystem.", 0) \
M(Int64, read_priority, 0, "Priority to read data from local filesystem. Only supported for 'pread_threadpool' method.", 0) \
@ -520,7 +520,7 @@ class IColumn;
M(Milliseconds, async_insert_busy_timeout_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared", 0) \
M(Milliseconds, async_insert_stale_timeout_ms, 0, "Maximum time to wait before dumping collected data per query since the last data appeared. Zero means no timeout at all", 0) \
\
M(Int64, remote_fs_read_backoff_threshold, 10000, "Max wait time when trying to read data for remote disk", 0) \
M(Int64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \
M(Int64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \
\
M(Bool, force_remove_data_recursively_on_drop, false, "Recursively remove data on DROP query. Avoids 'Directory not empty' error, but may silently remove detached data", 0) \

View File

@ -7,7 +7,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int UNSUPPORTED_METHOD;
extern const int NO_SUCH_COLUMN_IN_TABLE;
}
bool DictionarySourceCoordinator::getKeyColumnsNextRangeToRead(ColumnsWithTypeAndName & key_columns, ColumnsWithTypeAndName & data_columns)
@ -69,12 +69,6 @@ void DictionarySourceCoordinator::initialize(const Names & column_names)
}
}
}
else
{
throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "No such column name {} in dictionary {}",
column_name,
dictionary->getDictionaryID().getNameForLogs());
}
}
else
{
@ -86,6 +80,11 @@ void DictionarySourceCoordinator::initialize(const Names & column_names)
column_with_type.type = attribute.type;
}
if (!column_with_type.type)
throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "No such column name {} in dictionary {}",
column_name,
dictionary->getDictionaryID().getNameForLogs());
column_with_type.column = column_with_type.type->createColumn();
columns_with_type.emplace_back(std::move(column_with_type));
}

View File

@ -101,6 +101,7 @@ Pipe HTTPDictionarySource::loadAll()
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@ -119,6 +120,7 @@ Pipe HTTPDictionarySource::loadUpdatedAll()
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@ -146,6 +148,7 @@ Pipe HTTPDictionarySource::loadIds(const std::vector<UInt64> & ids)
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@ -173,6 +176,7 @@ Pipe HTTPDictionarySource::loadKeys(const Columns & key_columns, const std::vect
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));

View File

@ -20,6 +20,10 @@ public:
RestartAwareReadBuffer(const DiskRestartProxy & disk, std::unique_ptr<ReadBufferFromFileBase> impl_)
: ReadBufferFromFileDecorator(std::move(impl_)), lock(disk.mutex) { }
void prefetch() override { impl->prefetch(); }
void setReadUntilPosition(size_t position) override { impl->setReadUntilPosition(position); }
private:
ReadLock lock;
};

View File

@ -3,15 +3,18 @@
#include <base/logger_useful.h>
#include <Common/escapeForFileName.h>
#include <Disks/IDiskRemote.h>
#include <Disks/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/ReadIndirectBufferFromWebServer.h>
#include <IO/ReadWriteBufferFromHTTP.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Disks/IDiskRemote.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Poco/Exception.h>
@ -105,39 +108,6 @@ private:
};
class ReadBufferFromWebServer final : public ReadIndirectBufferFromRemoteFS<ReadIndirectBufferFromWebServer>
{
public:
ReadBufferFromWebServer(
const String & uri_,
RemoteMetadata metadata_,
ContextPtr context_,
size_t buf_size_,
size_t backoff_threshold_,
size_t max_tries_)
: ReadIndirectBufferFromRemoteFS<ReadIndirectBufferFromWebServer>(metadata_)
, uri(uri_)
, context(context_)
, buf_size(buf_size_)
, backoff_threshold(backoff_threshold_)
, max_tries(max_tries_)
{
}
std::unique_ptr<ReadIndirectBufferFromWebServer> createReadBuffer(const String & path) override
{
return std::make_unique<ReadIndirectBufferFromWebServer>(fs::path(uri) / path, context, buf_size, backoff_threshold, max_tries);
}
private:
String uri;
ContextPtr context;
size_t buf_size;
size_t backoff_threshold;
size_t max_tries;
};
DiskWebServer::DiskWebServer(
const String & disk_name_,
const String & url_,
@ -196,9 +166,20 @@ std::unique_ptr<ReadBufferFromFileBase> DiskWebServer::readFile(const String & p
RemoteMetadata meta(path, remote_path);
meta.remote_fs_objects.emplace_back(std::make_pair(remote_path, iter->second.size));
auto reader = std::make_unique<ReadBufferFromWebServer>(url, meta, getContext(),
read_settings.remote_fs_buffer_size, read_settings.remote_fs_backoff_threshold, read_settings.remote_fs_backoff_max_tries);
return std::make_unique<SeekAvoidingReadBuffer>(std::move(reader), min_bytes_for_seek);
bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool;
auto web_impl = std::make_unique<ReadBufferFromWebServerGather>(path, url, meta, getContext(), threadpool_read, read_settings);
if (threadpool_read)
{
auto reader = IDiskRemote::getThreadPoolReader();
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(reader, read_settings, std::move(web_impl), min_bytes_for_seek);
}
else
{
auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(web_impl));
return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), min_bytes_for_seek);
}
}

View File

@ -1,10 +1,14 @@
#include <Disks/HDFS/DiskHDFS.h>
#include <Storages/HDFS/ReadBufferFromHDFS.h>
#include <Storages/HDFS/WriteBufferFromHDFS.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <Disks/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/WriteIndirectBufferFromRemoteFS.h>
#include <Storages/HDFS/WriteBufferFromHDFS.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <base/logger_useful.h>
#include <base/FnTraits.h>
@ -48,37 +52,6 @@ private:
};
/// Reads data from HDFS using stored paths in metadata.
class ReadIndirectBufferFromHDFS final : public ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>
{
public:
ReadIndirectBufferFromHDFS(
const Poco::Util::AbstractConfiguration & config_,
const String & hdfs_uri_,
DiskHDFS::Metadata metadata_,
size_t buf_size_)
: ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>(metadata_)
, config(config_)
, buf_size(buf_size_)
{
const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
hdfs_directory = hdfs_uri_.substr(begin_of_path);
hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
}
std::unique_ptr<ReadBufferFromHDFS> createReadBuffer(const String & path) override
{
return std::make_unique<ReadBufferFromHDFS>(hdfs_uri, hdfs_directory + path, config, buf_size);
}
private:
const Poco::Util::AbstractConfiguration & config;
String hdfs_uri;
String hdfs_directory;
size_t buf_size;
};
DiskHDFS::DiskHDFS(
const String & disk_name_,
const String & hdfs_root_path_,
@ -102,8 +75,18 @@ std::unique_ptr<ReadBufferFromFileBase> DiskHDFS::readFile(const String & path,
"Read from file by path: {}. Existing HDFS objects: {}",
backQuote(metadata_path + path), metadata.remote_fs_objects.size());
auto reader = std::make_unique<ReadIndirectBufferFromHDFS>(config, remote_fs_root_path, metadata, read_settings.remote_fs_buffer_size);
return std::make_unique<SeekAvoidingReadBuffer>(std::move(reader), settings->min_bytes_for_seek);
auto hdfs_impl = std::make_unique<ReadBufferFromHDFSGather>(path, config, remote_fs_root_path, metadata, read_settings.remote_fs_buffer_size);
if (read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool)
{
auto reader = getThreadPoolReader();
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(reader, read_settings, std::move(hdfs_impl));
}
else
{
auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(hdfs_impl));
return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings->min_bytes_for_seek);
}
}

View File

@ -12,6 +12,7 @@
#include <Common/checkStackSize.h>
#include <boost/algorithm/string.hpp>
#include <Common/filesystemHelpers.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
namespace DB
@ -496,4 +497,13 @@ String IDiskRemote::getUniqueId(const String & path) const
return id;
}
AsynchronousReaderPtr IDiskRemote::getThreadPoolReader()
{
constexpr size_t pool_size = 50;
constexpr size_t queue_size = 1000000;
static AsynchronousReaderPtr reader = std::make_shared<ThreadPoolRemoteFSReader>(pool_size, queue_size);
return reader;
}
}
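For reference, `getThreadPoolReader()` above returns a function-local static, so the pool (50 threads, a queue of 1,000,000 requests) is created lazily once per process and shared by all remote disks; since C++11 this initialization is thread-safe. A stripped-down sketch of the same pattern with a hypothetical `Reader` type:
``` cpp
#include <memory>

struct Reader { };   /// stands in for ThreadPoolRemoteFSReader(pool_size, queue_size)

/// Lazily created, process-wide shared instance ("magic static" since C++11).
std::shared_ptr<Reader> getSharedReader()
{
    static std::shared_ptr<Reader> reader = std::make_shared<Reader>();
    return reader;
}
```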

View File

@ -3,8 +3,8 @@
#include <Common/config.h>
#include <atomic>
#include "Disks/DiskFactory.h"
#include "Disks/Executor.h"
#include <Disks/DiskFactory.h>
#include <Disks/Executor.h>
#include <utility>
#include <Common/MultiVersion.h>
#include <Common/ThreadPool.h>
@ -33,6 +33,10 @@ protected:
using RemoteFSPathKeeperPtr = std::shared_ptr<RemoteFSPathKeeper>;
class IAsynchronousReader;
using AsynchronousReaderPtr = std::shared_ptr<IAsynchronousReader>;
/// Base Disk class for remote FS's, which are not posix-compatible (DiskS3 and DiskHDFS)
class IDiskRemote : public IDisk
{
@ -125,6 +129,8 @@ public:
virtual RemoteFSPathKeeperPtr createFSPathKeeper() const = 0;
static AsynchronousReaderPtr getThreadPoolReader();
protected:
Poco::Logger * log;
const String name;

View File

@ -0,0 +1,262 @@
#include "AsynchronousReadIndirectBufferFromRemoteFS.h"
#include <Common/Stopwatch.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <IO/ReadSettings.h>
#include <base/logger_useful.h>
namespace CurrentMetrics
{
extern const Metric AsynchronousReadWait;
}
namespace ProfileEvents
{
extern const Event AsynchronousReadWaitMicroseconds;
extern const Event RemoteFSSeeks;
extern const Event RemoteFSPrefetches;
extern const Event RemoteFSCancelledPrefetches;
extern const Event RemoteFSUnusedPrefetches;
extern const Event RemoteFSPrefetchedReads;
extern const Event RemoteFSUnprefetchedReads;
extern const Event RemoteFSBuffers;
}
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int CANNOT_SEEK_THROUGH_FILE;
}
AsynchronousReadIndirectBufferFromRemoteFS::AsynchronousReadIndirectBufferFromRemoteFS(
AsynchronousReaderPtr reader_,
const ReadSettings & settings_,
std::shared_ptr<ReadBufferFromRemoteFSGather> impl_,
size_t min_bytes_for_seek_)
: ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0)
, reader(reader_)
, priority(settings_.priority)
, impl(impl_)
, prefetch_buffer(settings_.remote_fs_buffer_size)
, min_bytes_for_seek(min_bytes_for_seek_)
, must_read_until_position(settings_.must_read_until_position)
{
ProfileEvents::increment(ProfileEvents::RemoteFSBuffers);
}
String AsynchronousReadIndirectBufferFromRemoteFS::getFileName() const
{
return impl->getFileName();
}
bool AsynchronousReadIndirectBufferFromRemoteFS::hasPendingDataToRead()
{
/// Position is set only for MergeTree tables.
if (read_until_position)
{
/// Everything is already read.
if (file_offset_of_buffer_end == read_until_position)
return false;
if (file_offset_of_buffer_end > read_until_position)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Read beyond last offset ({} > {})",
file_offset_of_buffer_end, read_until_position);
}
else if (must_read_until_position)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Reading for MergeTree family tables must be done with last position boundary");
return true;
}
std::future<IAsynchronousReader::Result> AsynchronousReadIndirectBufferFromRemoteFS::readInto(char * data, size_t size)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<ThreadPoolRemoteFSReader::RemoteFSFileDescriptor>(impl);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = priority;
if (bytes_to_ignore)
{
request.ignore = bytes_to_ignore;
bytes_to_ignore = 0;
}
return reader->submit(request);
}
void AsynchronousReadIndirectBufferFromRemoteFS::prefetch()
{
if (prefetch_future.valid())
return;
/// Check the boundary, which was set in setReadUntilPosition().
if (!hasPendingDataToRead())
return;
/// Prefetch even in case hasPendingData() == true.
prefetch_future = readInto(prefetch_buffer.data(), prefetch_buffer.size());
ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches);
}
void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t position)
{
if (prefetch_future.valid())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Prefetch is valid in readUntilPosition");
read_until_position = position;
impl->setReadUntilPosition(read_until_position);
}
void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilEnd()
{
if (prefetch_future.valid())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Prefetch is valid in readUntilEnd");
read_until_position = impl->getFileSize();
impl->setReadUntilPosition(read_until_position);
}
bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
{
if (!hasPendingDataToRead())
return false;
size_t size = 0;
if (prefetch_future.valid())
{
ProfileEvents::increment(ProfileEvents::RemoteFSPrefetchedReads);
CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait};
Stopwatch watch;
{
size = prefetch_future.get();
if (size)
{
memory.swap(prefetch_buffer);
set(memory.data(), memory.size());
working_buffer.resize(size);
file_offset_of_buffer_end += size;
}
}
watch.stop();
ProfileEvents::increment(ProfileEvents::AsynchronousReadWaitMicroseconds, watch.elapsedMicroseconds());
}
else
{
ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads);
size = readInto(memory.data(), memory.size()).get();
if (size)
{
set(memory.data(), memory.size());
working_buffer.resize(size);
file_offset_of_buffer_end += size;
}
}
prefetch_future = {};
return size;
}
off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeks);
if (whence == SEEK_CUR)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && static_cast<size_t>(getPosition() + offset_) < file_offset_of_buffer_end)
{
pos += offset_;
return getPosition();
}
else
{
file_offset_of_buffer_end += offset_;
}
}
else if (whence == SEEK_SET)
{
/// If position is within current working buffer - shift pos.
if (!working_buffer.empty()
&& static_cast<size_t>(offset_) >= file_offset_of_buffer_end - working_buffer.size()
&& size_t(offset_) < file_offset_of_buffer_end)
{
pos = working_buffer.end() - (file_offset_of_buffer_end - offset_);
assert(pos >= working_buffer.begin());
assert(pos <= working_buffer.end());
return getPosition();
}
else
{
file_offset_of_buffer_end = offset_;
}
}
else
throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
if (prefetch_future.valid())
{
ProfileEvents::increment(ProfileEvents::RemoteFSCancelledPrefetches);
prefetch_future.wait();
prefetch_future = {};
}
pos = working_buffer.end();
/// Note: we read in range [file_offset_of_buffer_end, read_until_position).
if (file_offset_of_buffer_end < read_until_position
&& static_cast<off_t>(file_offset_of_buffer_end) >= getPosition()
&& static_cast<off_t>(file_offset_of_buffer_end) < getPosition() + static_cast<off_t>(min_bytes_for_seek))
{
/**
* Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
*/
bytes_to_ignore = file_offset_of_buffer_end - getPosition();
}
else
{
impl->reset();
}
return file_offset_of_buffer_end;
}
void AsynchronousReadIndirectBufferFromRemoteFS::finalize()
{
if (prefetch_future.valid())
{
ProfileEvents::increment(ProfileEvents::RemoteFSUnusedPrefetches);
prefetch_future.wait();
prefetch_future = {};
}
}
AsynchronousReadIndirectBufferFromRemoteFS::~AsynchronousReadIndirectBufferFromRemoteFS()
{
finalize();
}
}
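The core idea of the buffer above is double buffering: `prefetch()` schedules the next read into `prefetch_buffer` on the reader pool, and `nextImpl()` either waits for that future and swaps the prefetched buffer in, or falls back to a synchronous read. A self-contained sketch of that pattern, with `std::async` standing in for the reader pool and all names being illustrative rather than taken from this diff:
``` cpp
#include <algorithm>
#include <cstddef>
#include <future>
#include <vector>

/// Stand-in for a remote read: pretend the "file" is 3 MiB of 'x' bytes.
size_t remoteRead(std::vector<char> & buf, size_t offset)
{
    constexpr size_t file_size = 3 * 1024 * 1024;
    size_t n = offset < file_size ? std::min(buf.size(), file_size - offset) : 0;
    std::fill_n(buf.begin(), n, 'x');
    return n;
}

class PrefetchingReader
{
public:
    /// Schedule the next chunk in the background; a no-op if a prefetch is already in flight.
    void prefetch()
    {
        if (prefetch_future.valid())
            return;
        prefetch_future = std::async(std::launch::async,
            [this, off = offset] { return remoteRead(prefetch_buffer, off); });
    }

    /// Return the next chunk, preferring a completed prefetch over a synchronous read.
    size_t next(std::vector<char> & out)
    {
        size_t size;
        if (prefetch_future.valid())
        {
            size = prefetch_future.get();    /// wait for the background read
            out.swap(prefetch_buffer);       /// hand the prefetched data to the caller
            prefetch_future = {};
        }
        else
            size = remoteRead(out, offset);  /// unprefetched (synchronous) path
        offset += size;
        return size;
    }

private:
    std::future<size_t> prefetch_future;
    std::vector<char> prefetch_buffer = std::vector<char>(1 << 20);
    size_t offset = 0;
};

int main()
{
    PrefetchingReader reader;
    std::vector<char> chunk(1 << 20);
    size_t total = 0;
    reader.prefetch();
    while (size_t n = reader.next(chunk))
    {
        total += n;
        reader.prefetch();   /// overlap the next read with processing of the current chunk
    }
    return total == 3 * 1024 * 1024 ? 0 : 1;
}
```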

View File

@ -0,0 +1,84 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include <Common/config.h>
#endif
#include <IO/ReadBufferFromFile.h>
#include <IO/AsynchronousReader.h>
#include <utility>
namespace DB
{
class ReadBufferFromRemoteFSGather;
struct ReadSettings;
/**
* Reads data from S3/HDFS/Web using stored paths in metadata.
* This class is an asynchronous version of ReadIndirectBufferFromRemoteFS.
*
* Buffers chain for diskS3:
* AsynchronousReadIndirectBufferFromRemoteFS -> ReadBufferFromRemoteFSGather ->
* -> ReadBufferFromS3 -> ReadBufferFromIStream.
*
* Buffers chain for diskWeb:
* AsynchronousReadIndirectBufferFromRemoteFS -> ReadBufferFromRemoteFSGather ->
* -> ReadBufferFromWebServer -> ReadWriteBufferFromHTTP -> ReadBufferFromIStream.
*
* We pass either `memory` or `prefetch_buffer` through all this chain and return it back.
*/
class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
{
public:
explicit AsynchronousReadIndirectBufferFromRemoteFS(
AsynchronousReaderPtr reader_, const ReadSettings & settings_,
std::shared_ptr<ReadBufferFromRemoteFSGather> impl_,
size_t min_bytes_for_seek = 1024 * 1024);
~AsynchronousReadIndirectBufferFromRemoteFS() override;
off_t seek(off_t offset_, int whence) override;
off_t getPosition() override { return file_offset_of_buffer_end - available(); }
String getFileName() const override;
void prefetch() override;
void setReadUntilPosition(size_t position) override;
void setReadUntilEnd() override;
private:
bool nextImpl() override;
void finalize();
bool hasPendingDataToRead();
std::future<IAsynchronousReader::Result> readInto(char * data, size_t size);
AsynchronousReaderPtr reader;
Int32 priority;
std::shared_ptr<ReadBufferFromRemoteFSGather> impl;
std::future<IAsynchronousReader::Result> prefetch_future;
size_t file_offset_of_buffer_end = 0;
Memory<> prefetch_buffer;
size_t min_bytes_for_seek;
size_t bytes_to_ignore = 0;
size_t read_until_position = 0;
bool must_read_until_position;
};
}

View File

@ -0,0 +1,182 @@
#include "ReadBufferFromRemoteFSGather.h"
#include <Disks/IDiskRemote.h>
#include <IO/SeekableReadBuffer.h>
#include <Disks/IO/ReadBufferFromWebServer.h>
#if USE_AWS_S3
#include <IO/ReadBufferFromS3.h>
#endif
#if USE_HDFS
#include <Storages/HDFS/ReadBufferFromHDFS.h>
#endif
#include <base/logger_useful.h>
#include <filesystem>
#include <iostream>
namespace fs = std::filesystem;
namespace DB
{
#if USE_AWS_S3
SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const String & path, size_t read_until_position_) const
{
return std::make_unique<ReadBufferFromS3>(client_ptr, bucket,
fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, settings, threadpool_read, read_until_position_);
}
#endif
SeekableReadBufferPtr ReadBufferFromWebServerGather::createImplementationBuffer(const String & path, size_t read_until_position_) const
{
return std::make_unique<ReadBufferFromWebServer>(fs::path(uri) / path, context, settings, threadpool_read, read_until_position_);
}
#if USE_HDFS
SeekableReadBufferPtr ReadBufferFromHDFSGather::createImplementationBuffer(const String & path, size_t read_until_position_) const
{
return std::make_unique<ReadBufferFromHDFS>(hdfs_uri, fs::path(hdfs_directory) / path, config, buf_size, read_until_position_);
}
#endif
ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(const RemoteMetadata & metadata_, const String & path_)
: ReadBuffer(nullptr, 0)
, metadata(metadata_)
, canonical_path(path_)
{
}
size_t ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore)
{
/**
* Set `data` to current working and internal buffers.
* Internal buffer with size `size`. Working buffer with size 0.
*/
set(data, size);
absolute_position = offset;
bytes_to_ignore = ignore;
auto result = nextImpl();
bytes_to_ignore = 0;
if (result)
return working_buffer.size();
return 0;
}
void ReadBufferFromRemoteFSGather::initialize()
{
/// One clickhouse file can be split into multiple files in remote fs.
auto current_buf_offset = absolute_position;
for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
{
const auto & [file_path, size] = metadata.remote_fs_objects[i];
if (size > current_buf_offset)
{
/// Do not create a new buffer if we already have what we need.
if (!current_buf || current_buf_idx != i)
{
current_buf = createImplementationBuffer(file_path, read_until_position);
current_buf_idx = i;
}
current_buf->seek(current_buf_offset, SEEK_SET);
return;
}
current_buf_offset -= size;
}
current_buf_idx = metadata.remote_fs_objects.size();
current_buf = nullptr;
}
bool ReadBufferFromRemoteFSGather::nextImpl()
{
/// Find first available buffer that fits to given offset.
if (!current_buf)
initialize();
/// If current buffer has remaining data - use it.
if (current_buf)
{
if (readImpl())
return true;
}
else
return false;
/// If there is no available buffers - nothing to read.
if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
return false;
++current_buf_idx;
const auto & current_path = metadata.remote_fs_objects[current_buf_idx].first;
current_buf = createImplementationBuffer(current_path, read_until_position);
return readImpl();
}
bool ReadBufferFromRemoteFSGather::readImpl()
{
swap(*current_buf);
/**
* Lazy seek is performed here.
* In asynchronous buffer when seeking to offset in range [pos, pos + min_bytes_for_seek]
* we save how many bytes need to be ignored (new_offset - position() bytes).
*/
if (bytes_to_ignore)
current_buf->ignore(bytes_to_ignore);
auto result = current_buf->next();
swap(*current_buf);
if (result)
absolute_position += working_buffer.size();
return result;
}
void ReadBufferFromRemoteFSGather::setReadUntilPosition(size_t position)
{
read_until_position = position;
reset();
}
void ReadBufferFromRemoteFSGather::reset()
{
current_buf.reset();
}
String ReadBufferFromRemoteFSGather::getFileName() const
{
return canonical_path;
}
size_t ReadBufferFromRemoteFSGather::getFileSize() const
{
size_t size = 0;
for (const auto & object : metadata.remote_fs_objects)
size += object.second;
return size;
}
}
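The `initialize()` walk above is the heart of the gather buffer: one logical ClickHouse file is backed by several remote objects, and an absolute offset is resolved by subtracting object sizes until the owning object is found. A tiny standalone illustration of that mapping; the object list and the `resolveOffset` helper are made up for the example:
``` cpp
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

/// Resolve an absolute offset against a list of object sizes.
/// Returns {object index, offset inside that object}; {sizes.size(), 0} means "past the end".
std::pair<size_t, size_t> resolveOffset(const std::vector<size_t> & sizes, size_t offset)
{
    for (size_t i = 0; i < sizes.size(); ++i)
    {
        if (sizes[i] > offset)
            return {i, offset};
        offset -= sizes[i];
    }
    return {sizes.size(), 0};
}

int main()
{
    std::vector<size_t> sizes{100, 50, 200};   /// three remote objects backing one file
    auto [idx, local] = resolveOffset(sizes, 120);
    std::cout << idx << " " << local << "\n";  /// prints "1 20": byte 120 lives in the second object
}
```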

View File

@ -0,0 +1,161 @@
#pragma once
#if !defined(ARCADIA_BUILD)
#include <Common/config.h>
#endif
#include <Disks/IDiskRemote.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadSettings.h>
namespace Aws
{
namespace S3
{
class S3Client;
}
}
namespace DB
{
/**
* Remote disk might need to split one clickhouse file into multiple files in remote fs.
* This class works like a proxy that makes the transition from one logical file to multiple remote files transparent.
*/
class ReadBufferFromRemoteFSGather : public ReadBuffer
{
friend class ReadIndirectBufferFromRemoteFS;
public:
explicit ReadBufferFromRemoteFSGather(const RemoteMetadata & metadata_, const String & path_);
String getFileName() const;
void reset();
void setReadUntilPosition(size_t position) override;
size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
size_t getFileSize() const;
protected:
virtual SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const = 0;
RemoteMetadata metadata;
private:
bool nextImpl() override;
void initialize();
bool readImpl();
SeekableReadBufferPtr current_buf;
size_t current_buf_idx = 0;
size_t absolute_position = 0;
size_t bytes_to_ignore = 0;
size_t read_until_position = 0;
String canonical_path;
};
#if USE_AWS_S3
/// Reads data from S3 using stored paths in metadata.
class ReadBufferFromS3Gather final : public ReadBufferFromRemoteFSGather
{
public:
ReadBufferFromS3Gather(
const String & path_,
std::shared_ptr<Aws::S3::S3Client> client_ptr_,
const String & bucket_,
IDiskRemote::Metadata metadata_,
size_t max_single_read_retries_,
const ReadSettings & settings_,
bool threadpool_read_ = false)
: ReadBufferFromRemoteFSGather(metadata_, path_)
, client_ptr(std::move(client_ptr_))
, bucket(bucket_)
, max_single_read_retries(max_single_read_retries_)
, settings(settings_)
, threadpool_read(threadpool_read_)
{
}
SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
private:
std::shared_ptr<Aws::S3::S3Client> client_ptr;
String bucket;
UInt64 max_single_read_retries;
ReadSettings settings;
bool threadpool_read;
};
#endif
class ReadBufferFromWebServerGather final : public ReadBufferFromRemoteFSGather
{
public:
ReadBufferFromWebServerGather(
const String & path_,
const String & uri_,
RemoteMetadata metadata_,
ContextPtr context_,
size_t threadpool_read_,
const ReadSettings & settings_)
: ReadBufferFromRemoteFSGather(metadata_, path_)
, uri(uri_)
, context(context_)
, threadpool_read(threadpool_read_)
, settings(settings_)
{
}
SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
private:
String uri;
ContextPtr context;
bool threadpool_read;
ReadSettings settings;
};
#if USE_HDFS
/// Reads data from HDFS using stored paths in metadata.
class ReadBufferFromHDFSGather final : public ReadBufferFromRemoteFSGather
{
public:
ReadBufferFromHDFSGather(
const String & path_,
const Poco::Util::AbstractConfiguration & config_,
const String & hdfs_uri_,
IDiskRemote::Metadata metadata_,
size_t buf_size_)
: ReadBufferFromRemoteFSGather(metadata_, path_)
, config(config_)
, buf_size(buf_size_)
{
const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
hdfs_directory = hdfs_uri_.substr(begin_of_path);
hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
}
SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
private:
const Poco::Util::AbstractConfiguration & config;
String hdfs_uri;
String hdfs_directory;
size_t buf_size;
};
#endif
}

View File

@ -0,0 +1,198 @@
#include "ReadBufferFromWebServer.h"
#include <base/logger_useful.h>
#include <base/sleep.h>
#include <Core/Types.h>
#include <IO/ReadWriteBufferFromHTTP.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <thread>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
static constexpr size_t HTTP_MAX_TRIES = 10;
static constexpr size_t WAIT_INIT = 100;
ReadBufferFromWebServer::ReadBufferFromWebServer(
const String & url_,
ContextPtr context_,
const ReadSettings & settings_,
bool use_external_buffer_,
size_t last_offset_)
: SeekableReadBuffer(nullptr, 0)
, log(&Poco::Logger::get("ReadBufferFromWebServer"))
, context(context_)
, url(url_)
, buf_size(settings_.remote_fs_buffer_size)
, read_settings(settings_)
, use_external_buffer(use_external_buffer_)
, last_offset(last_offset_)
{
}
std::unique_ptr<ReadBuffer> ReadBufferFromWebServer::initialize()
{
Poco::URI uri(url);
ReadWriteBufferFromHTTP::HTTPHeaderEntries headers;
if (last_offset)
{
if (last_offset < offset)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, last_offset - 1);
headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-{}", offset, last_offset - 1)));
LOG_DEBUG(log, "Reading with range: {}-{}", offset, last_offset);
}
else
{
headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-", offset)));
LOG_DEBUG(log, "Reading from offset: {}", offset);
}
const auto & settings = context->getSettingsRef();
const auto & config = context->getConfigRef();
Poco::Timespan http_keep_alive_timeout{config.getUInt("keep_alive_timeout", 20), 0};
return std::make_unique<ReadWriteBufferFromHTTP>(
uri,
Poco::Net::HTTPRequest::HTTP_GET,
ReadWriteBufferFromHTTP::OutStreamCallback(),
ConnectionTimeouts(std::max(Poco::Timespan(settings.http_connection_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
settings.http_send_timeout,
std::max(Poco::Timespan(settings.http_receive_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
settings.tcp_keep_alive_timeout,
http_keep_alive_timeout),
0,
Poco::Net::HTTPBasicCredentials{},
buf_size,
read_settings,
headers,
context->getRemoteHostFilter(),
use_external_buffer);
}
void ReadBufferFromWebServer::initializeWithRetry()
{
/// Initialize impl with retry.
size_t milliseconds_to_wait = WAIT_INIT;
for (size_t i = 0; i < HTTP_MAX_TRIES; ++i)
{
try
{
impl = initialize();
if (use_external_buffer)
{
/**
* See comment 30 lines lower.
*/
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
break;
}
catch (Poco::Exception & e)
{
if (i == HTTP_MAX_TRIES - 1)
throw;
LOG_ERROR(&Poco::Logger::get("ReadBufferFromWeb"), "Error: {}, code: {}", e.what(), e.code());
sleepForMilliseconds(milliseconds_to_wait);
milliseconds_to_wait *= 2;
}
}
}
bool ReadBufferFromWebServer::nextImpl()
{
if (last_offset)
{
if (last_offset == offset)
return false;
if (last_offset < offset)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, last_offset - 1);
}
if (impl)
{
if (use_external_buffer)
{
/**
* use_external_buffer -- means we read into the buffer which
* was passed to us from somewhere else. We do not check whether
* previously returned buffer was read or not, because this branch
* means we are prefetching data, and each nextImpl() call can fill
* a different buffer.
*/
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
else
{
/**
* impl was initialized before, pass position() to it to make
* sure there is no pending data which was not read, because
* this branch means we read sequentially.
*/
impl->position() = position();
assert(!impl->hasPendingData());
}
}
else
{
initializeWithRetry();
}
auto result = impl->next();
if (result)
{
BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
offset += working_buffer.size();
}
return result;
}
off_t ReadBufferFromWebServer::seek(off_t offset_, int whence)
{
if (impl)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Seek is allowed only before first read attempt from the buffer");
if (whence != SEEK_SET)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed");
if (offset_ < 0)
throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", std::to_string(offset_));
offset = offset_;
return offset;
}
off_t ReadBufferFromWebServer::getPosition()
{
return offset - available();
}
}
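`initializeWithRetry()` above applies plain exponential backoff: up to 10 attempts, starting with a 100 ms pause that doubles after each failure, and the exception is rethrown only on the last try. A generic sketch of that retry loop; the `retryWithBackoff` name and the callable are illustrative:
``` cpp
#include <chrono>
#include <cstddef>
#include <thread>

/// Call `attempt` up to max_tries times, sleeping with exponential backoff between failures.
/// The exception from the final attempt is allowed to propagate.
template <typename F>
void retryWithBackoff(F attempt, size_t max_tries = 10,
                      std::chrono::milliseconds initial_wait = std::chrono::milliseconds(100))
{
    auto wait = initial_wait;
    for (size_t i = 0; i < max_tries; ++i)
    {
        try
        {
            attempt();
            return;
        }
        catch (...)
        {
            if (i + 1 == max_tries)
                throw;                        /// give up and surface the error
            std::this_thread::sleep_for(wait);
            wait *= 2;                        /// exponential backoff
        }
    }
}
```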

View File

@ -2,6 +2,7 @@
#include <IO/SeekableReadBuffer.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/ReadSettings.h>
#include <Interpreters/Context.h>
@ -11,15 +12,16 @@ namespace DB
/* Read buffer, which reads via http, but is used as ReadBufferFromFileBase.
* Used to read files, hosted on a web server with static files.
*
* Usage: ReadIndirectBufferFromRemoteFS -> SeekAvoidingReadBuffer -> ReadIndirectBufferFromWebServer -> ReadWriteBufferFromHTTP.
* Usage: ReadIndirectBufferFromRemoteFS -> SeekAvoidingReadBuffer -> ReadBufferFromWebServer -> ReadWriteBufferFromHTTP.
*/
class ReadIndirectBufferFromWebServer : public BufferWithOwnMemory<SeekableReadBuffer>
class ReadBufferFromWebServer : public SeekableReadBuffer
{
public:
explicit ReadIndirectBufferFromWebServer(const String & url_,
ContextPtr context_,
size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE,
size_t backoff_threshold_ = 10000, size_t max_tries_ = 4);
explicit ReadBufferFromWebServer(
const String & url_, ContextPtr context_,
const ReadSettings & settings_ = {},
bool use_external_buffer_ = false,
size_t last_offset = 0);
bool nextImpl() override;
@ -30,6 +32,8 @@ public:
private:
std::unique_ptr<ReadBuffer> initialize();
void initializeWithRetry();
Poco::Logger * log;
ContextPtr context;
@ -40,8 +44,11 @@ private:
off_t offset = 0;
size_t backoff_threshold_ms;
size_t max_tries;
ReadSettings read_settings;
bool use_external_buffer;
off_t last_offset = 0;
};
}

View File

@ -0,0 +1,85 @@
#include "ReadIndirectBufferFromRemoteFS.h"
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
}
ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS(
std::shared_ptr<ReadBufferFromRemoteFSGather> impl_) : impl(std::move(impl_))
{
}
off_t ReadIndirectBufferFromRemoteFS::getPosition()
{
return impl->absolute_position - available();
}
String ReadIndirectBufferFromRemoteFS::getFileName() const
{
return impl->getFileName();
}
off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
{
if (whence == SEEK_CUR)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && size_t(getPosition() + offset_) < impl->absolute_position)
{
pos += offset_;
return getPosition();
}
else
{
impl->absolute_position += offset_;
}
}
else if (whence == SEEK_SET)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty()
&& size_t(offset_) >= impl->absolute_position - working_buffer.size()
&& size_t(offset_) < impl->absolute_position)
{
pos = working_buffer.end() - (impl->absolute_position - offset_);
return getPosition();
}
else
{
impl->absolute_position = offset_;
}
}
else
throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
impl->reset();
pos = working_buffer.end();
return impl->absolute_position;
}
bool ReadIndirectBufferFromRemoteFS::nextImpl()
{
/// Transfer current position and working_buffer to actual ReadBuffer
swap(*impl);
/// Position and working_buffer will be updated in next() call
auto result = impl->next();
/// and assigned to current buffer.
swap(*impl);
return result;
}
}

View File

@ -0,0 +1,36 @@
#pragma once
#include <Common/config.h>
#include <IO/ReadBufferFromFile.h>
#include <Disks/IDiskRemote.h>
#include <utility>
namespace DB
{
class ReadBufferFromRemoteFSGather;
/**
* Reads data from S3/HDFS/Web using stored paths in metadata.
* There is asynchronous version of this class -- AsynchronousReadIndirectBufferFromRemoteFS.
*/
class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
{
public:
explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr<ReadBufferFromRemoteFSGather> impl_);
off_t seek(off_t offset_, int whence) override;
off_t getPosition() override;
String getFileName() const override;
private:
bool nextImpl() override;
std::shared_ptr<ReadBufferFromRemoteFSGather> impl;
};
}

View File

@ -0,0 +1,68 @@
#include "ThreadPoolRemoteFSReader.h"
#include <Common/Exception.h>
#include <Common/ProfileEvents.h>
#include <Common/CurrentMetrics.h>
#include <Common/Stopwatch.h>
#include <Common/assert_cast.h>
#include <Common/setThreadName.h>
#include <IO/SeekableReadBuffer.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <future>
#include <iostream>
namespace ProfileEvents
{
extern const Event RemoteFSReadMicroseconds;
extern const Event RemoteFSReadBytes;
}
namespace CurrentMetrics
{
extern const Metric Read;
}
namespace DB
{
size_t ThreadPoolRemoteFSReader::RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore)
{
return reader->readInto(data, size, offset, ignore);
}
ThreadPoolRemoteFSReader::ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_)
: pool(pool_size, pool_size, queue_size_)
{
}
std::future<IAsynchronousReader::Result> ThreadPoolRemoteFSReader::submit(Request request)
{
auto task = std::make_shared<std::packaged_task<Result()>>([request]
{
setThreadName("ThreadPoolRemoteFSRead");
CurrentMetrics::Increment metric_increment{CurrentMetrics::Read};
auto * remote_fs_fd = assert_cast<RemoteFSFileDescriptor *>(request.descriptor.get());
Stopwatch watch(CLOCK_MONOTONIC);
auto bytes_read = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore);
watch.stop();
ProfileEvents::increment(ProfileEvents::RemoteFSReadMicroseconds, watch.elapsedMicroseconds());
ProfileEvents::increment(ProfileEvents::RemoteFSReadBytes, bytes_read);
return bytes_read;
});
auto future = task->get_future();
/// ThreadPool uses "bigger value means higher priority", while Request::priority uses "smaller value means higher priority", hence the negation.
pool.scheduleOrThrow([task]{ (*task)(); }, -request.priority);
return future;
}
}
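A hedged sketch of how a caller could drive the reader above: wrap a ReadBufferFromRemoteFSGather in a RemoteFSFileDescriptor, fill an IAsynchronousReader::Request and wait on the returned future. The pool dimensions and the helper name are illustrative, and the future's value is assumed to carry the number of bytes read, matching the lambda in submit().
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <memory>
/// Hypothetical call site (not taken from this diff): schedule a single asynchronous read.
size_t readOneChunk(std::shared_ptr<DB::ReadBufferFromRemoteFSGather> gather,
                    char * dest, size_t dest_size, size_t file_offset)
{
    static DB::ThreadPoolRemoteFSReader reader(/* pool_size */ 16, /* queue_size */ 1000000);
    DB::IAsynchronousReader::Request request;
    request.descriptor = std::make_shared<DB::ThreadPoolRemoteFSReader::RemoteFSFileDescriptor>(gather);
    request.buf = dest;
    request.size = dest_size;
    request.offset = file_offset;
    request.priority = 0;   /// negated inside submit(), see the comment there
    auto future = reader.submit(std::move(request));
    return future.get();    /// number of bytes actually read
}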

View File

@ -0,0 +1,39 @@
#pragma once
#include <IO/AsynchronousReader.h>
#include <IO/SeekableReadBuffer.h>
#include <Common/ThreadPool.h>
#include <Disks/IDiskRemote.h>
namespace DB
{
class ReadBufferFromRemoteFSGather;
class ThreadPoolRemoteFSReader : public IAsynchronousReader
{
private:
ThreadPool pool;
public:
ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_);
std::future<Result> submit(Request request) override;
struct RemoteFSFileDescriptor;
};
struct ThreadPoolRemoteFSReader::RemoteFSFileDescriptor : public IFileDescriptor
{
public:
RemoteFSFileDescriptor(std::shared_ptr<ReadBufferFromRemoteFSGather> reader_) : reader(reader_) {}
size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
private:
std::shared_ptr<ReadBufferFromRemoteFSGather> reader;
};
}

View File

@ -1,143 +0,0 @@
#include "ReadIndirectBufferFromRemoteFS.h"
#include <IO/ReadBufferFromS3.h>
#include <Storages/HDFS/ReadBufferFromHDFS.h>
#include <Disks/ReadIndirectBufferFromWebServer.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
}
template<typename T>
ReadIndirectBufferFromRemoteFS<T>::ReadIndirectBufferFromRemoteFS(
RemoteMetadata metadata_)
: metadata(std::move(metadata_))
{
}
template<typename T>
off_t ReadIndirectBufferFromRemoteFS<T>::seek(off_t offset_, int whence)
{
if (whence == SEEK_CUR)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && size_t(getPosition() + offset_) < absolute_position)
{
pos += offset_;
return getPosition();
}
else
{
absolute_position += offset_;
}
}
else if (whence == SEEK_SET)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && size_t(offset_) >= absolute_position - working_buffer.size()
&& size_t(offset_) < absolute_position)
{
pos = working_buffer.end() - (absolute_position - offset_);
return getPosition();
}
else
{
absolute_position = offset_;
}
}
else
throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
current_buf = initialize();
pos = working_buffer.end();
return absolute_position;
}
template<typename T>
std::unique_ptr<T> ReadIndirectBufferFromRemoteFS<T>::initialize()
{
size_t offset = absolute_position;
for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
{
current_buf_idx = i;
const auto & [file_path, size] = metadata.remote_fs_objects[i];
if (size > offset)
{
auto buf = createReadBuffer(file_path);
buf->seek(offset, SEEK_SET);
return buf;
}
offset -= size;
}
return nullptr;
}
template<typename T>
bool ReadIndirectBufferFromRemoteFS<T>::nextImpl()
{
/// Find first available buffer that fits to given offset.
if (!current_buf)
current_buf = initialize();
/// If current buffer has remaining data - use it.
if (current_buf)
{
bool result = nextAndShiftPosition();
if (result)
return true;
}
/// If there is no available buffers - nothing to read.
if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
return false;
++current_buf_idx;
const auto & path = metadata.remote_fs_objects[current_buf_idx].first;
current_buf = createReadBuffer(path);
return nextAndShiftPosition();
}
template <typename T>
bool ReadIndirectBufferFromRemoteFS<T>::nextAndShiftPosition()
{
/// Transfer current position and working_buffer to actual ReadBuffer
swap(*current_buf);
/// Position and working_buffer will be updated in next() call
auto result = current_buf->next();
/// and assigned to current buffer.
swap(*current_buf);
/// absolute position is shifted by a data size that was read in next() call above.
if (result)
absolute_position += working_buffer.size();
return result;
}
#if USE_AWS_S3
template
class ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>;
#endif
#if USE_HDFS
template
class ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>;
#endif
template
class ReadIndirectBufferFromRemoteFS<ReadIndirectBufferFromWebServer>;
}

View File

@ -1,45 +0,0 @@
#pragma once
#include <Common/config.h>
#include <IO/ReadBufferFromFile.h>
#include <Disks/IDiskRemote.h>
#include <utility>
namespace DB
{
/// Reads data from S3/HDFS using stored paths in metadata.
template <typename T>
class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
{
public:
explicit ReadIndirectBufferFromRemoteFS(RemoteMetadata metadata_);
off_t seek(off_t offset_, int whence) override;
off_t getPosition() override { return absolute_position - available(); }
String getFileName() const override { return metadata.metadata_file_path; }
virtual std::unique_ptr<T> createReadBuffer(const String & path) = 0;
protected:
RemoteMetadata metadata;
private:
std::unique_ptr<T> initialize();
bool nextAndShiftPosition();
bool nextImpl() override;
size_t absolute_position = 0;
size_t current_buf_idx = 0;
std::unique_ptr<T> current_buf;
};
}

View File

@ -1,146 +0,0 @@
#include "ReadIndirectBufferFromWebServer.h"
#include <base/logger_useful.h>
#include <base/sleep.h>
#include <Core/Types.h>
#include <IO/ReadWriteBufferFromHTTP.h>
#include <IO/ConnectionTimeoutsContext.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <thread>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
extern const int NETWORK_ERROR;
}
static const auto WAIT_MS = 10;
ReadIndirectBufferFromWebServer::ReadIndirectBufferFromWebServer(
const String & url_, ContextPtr context_, size_t buf_size_, size_t backoff_threshold_, size_t max_tries_)
: BufferWithOwnMemory<SeekableReadBuffer>(buf_size_)
, log(&Poco::Logger::get("ReadIndirectBufferFromWebServer"))
, context(context_)
, url(url_)
, buf_size(buf_size_)
, backoff_threshold_ms(backoff_threshold_)
, max_tries(max_tries_)
{
}
std::unique_ptr<ReadBuffer> ReadIndirectBufferFromWebServer::initialize()
{
Poco::URI uri(url);
ReadWriteBufferFromHTTP::HTTPHeaderEntries headers;
headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-", offset)));
const auto & settings = context->getSettingsRef();
LOG_DEBUG(log, "Reading from offset: {}", offset);
const auto & config = context->getConfigRef();
Poco::Timespan http_keep_alive_timeout{config.getUInt("keep_alive_timeout", 20), 0};
return std::make_unique<ReadWriteBufferFromHTTP>(
uri,
Poco::Net::HTTPRequest::HTTP_GET,
ReadWriteBufferFromHTTP::OutStreamCallback(),
ConnectionTimeouts(std::max(Poco::Timespan(settings.http_connection_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
settings.http_send_timeout,
std::max(Poco::Timespan(settings.http_receive_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
settings.tcp_keep_alive_timeout,
http_keep_alive_timeout),
0,
Poco::Net::HTTPBasicCredentials{},
buf_size,
headers);
}
bool ReadIndirectBufferFromWebServer::nextImpl()
{
bool next_result = false, successful_read = false;
UInt16 milliseconds_to_wait = WAIT_MS;
if (impl)
{
/// Restore correct position at the needed offset.
impl->position() = position();
assert(!impl->hasPendingData());
}
WriteBufferFromOwnString error_msg;
for (size_t i = 0; (i < max_tries) && !successful_read && !next_result; ++i)
{
while (milliseconds_to_wait < backoff_threshold_ms)
{
try
{
if (!impl)
{
impl = initialize();
next_result = impl->hasPendingData();
if (next_result)
break;
}
next_result = impl->next();
successful_read = true;
break;
}
catch (const Poco::Exception & e)
{
LOG_WARNING(log, "Read attempt failed for url: {}. Error: {}", url, e.what());
error_msg << fmt::format("Error: {}\n", e.what());
sleepForMilliseconds(milliseconds_to_wait);
milliseconds_to_wait *= 2;
impl.reset();
}
}
milliseconds_to_wait = WAIT_MS;
}
if (!successful_read)
throw Exception(ErrorCodes::NETWORK_ERROR,
"All read attempts failed for url: {}. Reason:\n{}", url, error_msg.str());
if (next_result)
{
BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
offset += working_buffer.size();
}
return next_result;
}
off_t ReadIndirectBufferFromWebServer::seek(off_t offset_, int whence)
{
if (impl)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Seek is allowed only before first read attempt from the buffer");
if (whence != SEEK_SET)
throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed");
if (offset_ < 0)
throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", std::to_string(offset_));
offset = offset_;
return offset;
}
off_t ReadIndirectBufferFromWebServer::getPosition()
{
return offset - available();
}
}

View File

@ -17,11 +17,7 @@
#include <Common/quoteString.h>
#include <Common/thread_local_rng.h>
#include <Disks/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/WriteIndirectBufferFromRemoteFS.h>
#include <Interpreters/Context.h>
#include <IO/ReadBufferFromS3.h>
#include <IO/ReadBufferFromString.h>
#include <IO/ReadHelpers.h>
@ -29,6 +25,12 @@
#include <IO/WriteBufferFromS3.h>
#include <IO/WriteHelpers.h>
#include <Disks/IO/ReadBufferFromRemoteFSGather.h>
#include <Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ReadIndirectBufferFromRemoteFS.h>
#include <Disks/IO/WriteIndirectBufferFromRemoteFS.h>
#include <Disks/IO/ThreadPoolRemoteFSReader.h>
#include <aws/s3/model/CopyObjectRequest.h>
#include <aws/s3/model/DeleteObjectsRequest.h>
#include <aws/s3/model/GetObjectRequest.h>
@ -127,47 +129,19 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
}
}
/// Reads data from S3 using stored paths in metadata.
class ReadIndirectBufferFromS3 final : public ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>
{
public:
ReadIndirectBufferFromS3(
std::shared_ptr<Aws::S3::S3Client> client_ptr_,
const String & bucket_,
DiskS3::Metadata metadata_,
size_t max_single_read_retries_,
size_t buf_size_)
: ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>(metadata_)
, client_ptr(std::move(client_ptr_))
, bucket(bucket_)
, max_single_read_retries(max_single_read_retries_)
, buf_size(buf_size_)
{
}
std::unique_ptr<ReadBufferFromS3> createReadBuffer(const String & path) override
{
return std::make_unique<ReadBufferFromS3>(client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, buf_size);
}
private:
std::shared_ptr<Aws::S3::S3Client> client_ptr;
const String & bucket;
UInt64 max_single_read_retries;
size_t buf_size;
};
DiskS3::DiskS3(
String name_,
String bucket_,
String s3_root_path_,
String metadata_path_,
ContextPtr context_,
SettingsPtr settings_,
GetDiskSettings settings_getter_)
: IDiskRemote(name_, s3_root_path_, metadata_path_, "DiskS3", settings_->thread_pool_size)
, bucket(std::move(bucket_))
, current_settings(std::move(settings_))
, settings_getter(settings_getter_)
, context(context_)
{
}
@ -230,9 +204,23 @@ std::unique_ptr<ReadBufferFromFileBase> DiskS3::readFile(const String & path, co
LOG_TRACE(log, "Read from file by path: {}. Existing S3 objects: {}",
backQuote(metadata_path + path), metadata.remote_fs_objects.size());
auto reader = std::make_unique<ReadIndirectBufferFromS3>(
settings->client, bucket, metadata, settings->s3_max_single_read_retries, read_settings.remote_fs_buffer_size);
return std::make_unique<SeekAvoidingReadBuffer>(std::move(reader), settings->min_bytes_for_seek);
bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool;
auto s3_impl = std::make_unique<ReadBufferFromS3Gather>(
path,
settings->client, bucket, metadata,
settings->s3_max_single_read_retries, read_settings, threadpool_read);
if (threadpool_read)
{
auto reader = getThreadPoolReader();
return std::make_unique<AsynchronousReadIndirectBufferFromRemoteFS>(reader, read_settings, std::move(s3_impl));
}
else
{
auto buf = std::make_unique<ReadIndirectBufferFromRemoteFS>(std::move(s3_impl));
return std::make_unique<SeekAvoidingReadBuffer>(std::move(buf), settings->min_bytes_for_seek);
}
}
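For context, a hedged sketch of how the branch above is selected from the caller's side: the asynchronous path is taken only when the read settings request the thread-pool method. The readFile signature follows this diff; any extra defaulted arguments and the construction of the disk itself are omitted, and the path is made up.
DB::ReadSettings settings = context->getReadSettings();
settings.remote_fs_method = DB::RemoteFSReadMethod::read_threadpool;   /// -> AsynchronousReadIndirectBufferFromRemoteFS
/// settings.remote_fs_method = DB::RemoteFSReadMethod::read;          /// -> ReadIndirectBufferFromRemoteFS + SeekAvoidingReadBuffer
auto in = disk_s3->readFile("store/abc/data.bin", settings);           /// hypothetical path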
std::unique_ptr<WriteBufferFromFileBase> DiskS3::writeFile(const String & path, size_t buf_size, WriteMode mode)
@ -378,7 +366,7 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc
source_bucket,
source_path + SCHEMA_VERSION_OBJECT,
settings->s3_max_single_read_retries,
DBMS_DEFAULT_BUFFER_SIZE);
context->getReadSettings());
readIntText(version, buffer);
@ -1033,9 +1021,9 @@ void DiskS3::onFreeze(const String & path)
revision_file_buf.finalize();
}
void DiskS3::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &)
void DiskS3::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context_, const String &, const DisksMap &)
{
auto new_settings = settings_getter(config, "storage_configuration.disks." + name, context);
auto new_settings = settings_getter(config, "storage_configuration.disks." + name, context_);
current_settings.set(std::move(new_settings));

View File

@ -69,6 +69,7 @@ public:
String bucket_,
String s3_root_path_,
String metadata_path_,
ContextPtr context_,
SettingsPtr settings_,
GetDiskSettings settings_getter_);
@ -175,6 +176,8 @@ private:
static constexpr int RESTORABLE_SCHEMA_VERSION = 1;
/// Directories with data.
const std::vector<String> data_roots {"data", "store"};
ContextPtr context;
};
}

View File

@ -184,6 +184,7 @@ void registerDiskS3(DiskFactory & factory)
uri.bucket,
uri.key,
metadata_path,
context,
getSettings(config, config_prefix, context),
getSettings);

View File

@ -382,6 +382,142 @@ public:
bool useDefaultImplementationForConstants() const override { return true; }
};
class FunctionExtractKeyLike : public IFunction
{
public:
static constexpr auto name = "mapExtractKeyLike";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionExtractKeyLike>(); }
String getName() const override
{
return name;
}
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*info*/) const override { return true; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (arguments.size() != 2)
throw Exception("Number of arguments for function " + getName() + " doesn't match: passed "
+ toString(arguments.size()) + ", should be 2",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
const DataTypeMap * map_type = checkAndGetDataType<DataTypeMap>(arguments[0].type.get());
if (!map_type)
throw Exception{"First argument for function " + getName() + " must be a map",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
auto key_type = map_type->getKeyType();
WhichDataType which(key_type);
if (!which.isStringOrFixedString())
throw Exception{"Function " + getName() + "only support the map with String or FixedString key",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
if (!isStringOrFixedString(arguments[1].type))
throw Exception{"Second argument passed to function " + getName() + " must be String or FixedString",
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT};
return std::make_shared<DataTypeMap>(map_type->getKeyType(), map_type->getValueType());
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override
{
bool is_const = isColumnConst(*arguments[0].column);
const ColumnMap * col_map = typeid_cast<const ColumnMap *>(arguments[0].column.get());
// This check may be redundant, because the argument type is already validated in getReturnTypeImpl.
if (!col_map)
return nullptr;
const DataTypeMap * map_type = checkAndGetDataType<DataTypeMap>(arguments[0].type.get());
auto key_type = map_type->getKeyType();
auto value_type = map_type->getValueType();
const auto & nested_column = col_map->getNestedColumn();
const auto & keys_column = col_map->getNestedData().getColumn(0);
const auto & values_column = col_map->getNestedData().getColumn(1);
const ColumnString * keys_string_column = checkAndGetColumn<ColumnString>(keys_column);
const ColumnFixedString * keys_fixed_string_column = checkAndGetColumn<ColumnFixedString>(keys_column);
FunctionLike func_like;
// Create result data
MutableColumnPtr keys_data = key_type->createColumn();
MutableColumnPtr values_data = value_type->createColumn();
MutableColumnPtr offsets = DataTypeNumber<IColumn::Offset>().createColumn();
IColumn::Offset current_offset = 0;
for (size_t row = 0; row < input_rows_count; row++)
{
size_t element_start_row = row != 0 ? nested_column.getOffsets()[row-1] : 0;
size_t element_size = nested_column.getOffsets()[row] - element_start_row;
ColumnsWithTypeAndName new_arguments;
ColumnPtr sub_map_column;
DataTypePtr data_type;
if (keys_string_column)
{
sub_map_column = keys_string_column->cut(element_start_row, element_size);
data_type = std::make_shared<DataTypeString>();
}
else
{
sub_map_column = keys_fixed_string_column->cut(element_start_row, element_size);
data_type = std::make_shared<DataTypeFixedString>(checkAndGetColumn<ColumnFixedString>(sub_map_column.get())->getN());
}
size_t col_key_size = sub_map_column->size();
auto column = is_const ? ColumnConst::create(std::move(sub_map_column), std::move(col_key_size)) : std::move(sub_map_column);
new_arguments = {
{
column,
data_type,
""
},
arguments[1]
};
auto res = func_like.executeImpl(new_arguments, result_type, input_rows_count);
const auto & container = checkAndGetColumn<ColumnUInt8>(res.get())->getData();
for (size_t row_num = 0; row_num < element_size; row_num++)
{
if (container[row_num] == 1)
{
auto key_ref = keys_string_column ?
keys_string_column->getDataAt(element_start_row + row_num) :
keys_fixed_string_column->getDataAt(element_start_row + row_num);
auto value_ref = values_column.getDataAt(element_start_row + row_num);
keys_data->insertData(key_ref.data, key_ref.size);
values_data->insertData(value_ref.data, value_ref.size);
current_offset += 1;
}
}
offsets->insert(current_offset);
}
auto result_nested_column = ColumnArray::create(
ColumnTuple::create(Columns{std::move(keys_data), std::move(values_data)}),
std::move(offsets));
return ColumnMap::create(result_nested_column);
}
};
}
void registerFunctionsMap(FunctionFactory & factory)
@ -391,6 +527,7 @@ void registerFunctionsMap(FunctionFactory & factory)
factory.registerFunction<FunctionMapKeys>();
factory.registerFunction<FunctionMapValues>();
factory.registerFunction<FunctionMapContainsKeyLike>();
factory.registerFunction<FunctionExtractKeyLike>();
}
}
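As a usage illustration (not part of this diff): a query such as SELECT mapExtractKeyLike(map('temperature', '30', 'humidity', '40'), 'temp%') would be expected to return {'temperature':'30'}, i.e. the sub-map whose keys match the LIKE pattern passed as the second argument, whereas mapContainsKeyLike only reports whether at least one such key exists.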

View File

@ -46,6 +46,7 @@ public:
size_t size = 0;
char * buf = nullptr;
int64_t priority = 0;
size_t ignore = 0;
};
/// Less data than requested may be returned.

View File

@ -202,6 +202,12 @@ public:
*/
virtual void prefetch() {}
/**
* For reading from remote filesystem, when it matters how much we read.
*/
virtual void setReadUntilPosition(size_t /* position */) {}
virtual void setReadUntilEnd() {}
protected:
/// The number of bytes to ignore from the initial position of `working_buffer`
/// buffer. Apparently this is an additional out-parameter for nextImpl(),

View File

@ -8,7 +8,9 @@
#include <aws/s3/S3Client.h>
#include <aws/s3/model/GetObjectRequest.h>
#include <base/logger_useful.h>
#include <base/sleep.h>
#include <utility>
@ -27,43 +29,81 @@ namespace ErrorCodes
extern const int S3_ERROR;
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
ReadBufferFromS3::ReadBufferFromS3(
std::shared_ptr<Aws::S3::S3Client> client_ptr_, const String & bucket_, const String & key_, UInt64 max_single_read_retries_, size_t buffer_size_)
std::shared_ptr<Aws::S3::S3Client> client_ptr_, const String & bucket_, const String & key_,
UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, size_t read_until_position_)
: SeekableReadBuffer(nullptr, 0)
, client_ptr(std::move(client_ptr_))
, bucket(bucket_)
, key(key_)
, max_single_read_retries(max_single_read_retries_)
, buffer_size(buffer_size_)
, read_settings(settings_)
, use_external_buffer(use_external_buffer_)
, read_until_position(read_until_position_)
{
}
bool ReadBufferFromS3::nextImpl()
{
if (read_until_position)
{
if (read_until_position == offset)
return false;
if (read_until_position < offset)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
}
bool next_result = false;
if (impl)
{
/// `impl` has been initialized earlier and now we're at the end of the current portion of data.
impl->position() = position();
assert(!impl->hasPendingData());
if (use_external_buffer)
{
/**
* use_external_buffer means we read into a buffer which
* was passed to us from somewhere else. We do not check whether
* the previously returned buffer was read or not (no hasPendingData() check is needed),
* because this branch means we are prefetching data:
* each nextImpl() call may fill a different buffer.
*/
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
else
{
/// `impl` is not initialized and we're about to read the first portion of data.
impl = initialize();
next_result = impl->hasPendingData();
/**
* impl was initialized before, pass position() to it to make
* sure there is no pending data which was not read.
*/
impl->position() = position();
assert(!impl->hasPendingData());
}
}
auto sleep_time_with_backoff_milliseconds = std::chrono::milliseconds(100);
size_t sleep_time_with_backoff_milliseconds = 100;
for (size_t attempt = 0; (attempt < max_single_read_retries) && !next_result; ++attempt)
{
Stopwatch watch;
try
{
if (!impl)
{
impl = initialize();
if (use_external_buffer)
{
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
}
/// Try to read a next portion of data.
next_result = impl->next();
watch.stop();
@ -83,13 +123,11 @@ bool ReadBufferFromS3::nextImpl()
throw;
/// Pause before next attempt.
std::this_thread::sleep_for(sleep_time_with_backoff_milliseconds);
sleepForMilliseconds(sleep_time_with_backoff_milliseconds);
sleep_time_with_backoff_milliseconds *= 2;
/// Try to reinitialize `impl`.
impl.reset();
impl = initialize();
next_result = impl->hasPendingData();
}
}
@ -127,19 +165,34 @@ off_t ReadBufferFromS3::getPosition()
std::unique_ptr<ReadBuffer> ReadBufferFromS3::initialize()
{
LOG_TRACE(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset);
Aws::S3::Model::GetObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
/**
* If remote_filesystem_read_method = 'read_threadpool', then for MergeTree family tables
* exact byte ranges to read are always passed here.
*/
if (read_until_position)
{
if (offset >= read_until_position)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
req.SetRange(fmt::format("bytes={}-{}", offset, read_until_position - 1));
LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Range: {}-{}", bucket, key, offset, read_until_position - 1);
}
else
{
req.SetRange(fmt::format("bytes={}-", offset));
LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset);
}
Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req);
if (outcome.IsSuccess())
{
read_result = outcome.GetResultWithOwnership();
return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), buffer_size);
return std::make_unique<ReadBufferFromIStream>(read_result.GetBody(), read_settings.remote_fs_buffer_size);
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
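As a small worked example of the range handling above (illustrative values): with offset = 100 and read_until_position = 200 the request carries Range: bytes=100-199, i.e. the right bound is inclusive and therefore read_until_position - 1; with read_until_position = 0 the header falls back to the open-ended form bytes=100-.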

View File

@ -8,8 +8,10 @@
#include <IO/HTTPCommon.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadSettings.h>
#include <IO/SeekableReadBuffer.h>
#include <aws/s3/model/GetObjectResult.h>
# include "SeekableReadBuffer.h"
namespace Aws::S3
{
@ -28,7 +30,6 @@ private:
String bucket;
String key;
UInt64 max_single_read_retries;
size_t buffer_size;
off_t offset = 0;
Aws::S3::Model::GetObjectResult read_result;
std::unique_ptr<ReadBuffer> impl;
@ -41,7 +42,9 @@ public:
const String & bucket_,
const String & key_,
UInt64 max_single_read_retries_,
size_t buffer_size_);
const ReadSettings & settings_,
bool use_external_buffer = false,
size_t read_until_position_ = 0);
bool nextImpl() override;
@ -50,6 +53,10 @@ public:
private:
std::unique_ptr<ReadBuffer> initialize();
ReadSettings read_settings;
bool use_external_buffer;
off_t read_until_position = 0;
};
}

View File

@ -6,7 +6,7 @@
namespace DB
{
enum class ReadMethod
enum class LocalFSReadMethod
{
/**
* Simple synchronous reads with 'read'.
@ -43,12 +43,20 @@ enum class ReadMethod
pread_fake_async
};
enum class RemoteFSReadMethod
{
read,
read_threadpool,
};
class MMappedFileCache;
struct ReadSettings
{
/// Method to use reading from local filesystem.
ReadMethod local_fs_method = ReadMethod::pread;
LocalFSReadMethod local_fs_method = LocalFSReadMethod::pread;
/// Method to use reading from remote filesystem.
RemoteFSReadMethod remote_fs_method = RemoteFSReadMethod::read;
size_t local_fs_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
size_t remote_fs_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
@ -66,8 +74,14 @@ struct ReadSettings
/// For 'pread_threadpool' method. Lower is more priority.
size_t priority = 0;
size_t remote_fs_backoff_threshold = 10000;
size_t remote_fs_backoff_max_tries = 4;
size_t remote_fs_read_max_backoff_ms = 10000;
size_t remote_fs_read_backoff_max_tries = 4;
/// Set to true for MergeTree tables to make sure
/// that last position (offset in compressed file) is always passed.
/// (Otherwise asynchronous reading from remote fs is not efficient).
/// If reading is done without the final position set, a LOGICAL_ERROR is thrown.
bool must_read_until_position = false;
ReadSettings adjustBufferSize(size_t file_size) const
{

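A minimal sketch of filling in the remote-read knobs introduced above; the values simply mirror the defaults shown and are not recommendations, and in server code the object would normally come from Context::getReadSettings() rather than being built by hand.
DB::ReadSettings settings;
settings.local_fs_method = DB::LocalFSReadMethod::pread;
settings.remote_fs_method = DB::RemoteFSReadMethod::read_threadpool;
settings.remote_fs_read_max_backoff_ms = 10000;
settings.remote_fs_read_backoff_max_tries = 4;
settings.must_read_until_position = true;   /// MergeTree readers set this so the right boundary is always passed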
View File

@ -2,10 +2,12 @@
#include <functional>
#include <base/types.h>
#include <base/sleep.h>
#include <IO/ConnectionTimeouts.h>
#include <IO/HTTPCommon.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadBufferFromIStream.h>
#include <IO/ReadSettings.h>
#include <Poco/Any.h>
#include <Poco/Net/HTTPBasicCredentials.h>
#include <Poco/Net/HTTPClientSession.h>
@ -99,6 +101,9 @@ namespace detail
RemoteHostFilter remote_host_filter;
std::function<void(size_t)> next_callback;
size_t buffer_size;
ReadSettings settings;
std::istream * call(Poco::URI uri_, Poco::Net::HTTPResponse & response)
{
// With an empty path, Poco will send "POST HTTP/1.1"; this is a Poco bug.
@ -146,6 +151,9 @@ namespace detail
}
}
private:
bool use_external_buffer;
public:
using NextCallback = std::function<void(size_t)>;
using OutStreamCallback = std::function<void(std::ostream &)>;
@ -157,8 +165,10 @@ namespace detail
OutStreamCallback out_stream_callback_ = {},
const Poco::Net::HTTPBasicCredentials & credentials_ = {},
size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
const ReadSettings & settings_ = {},
HTTPHeaderEntries http_header_entries_ = {},
const RemoteHostFilter & remote_host_filter_ = {})
const RemoteHostFilter & remote_host_filter_ = {},
bool use_external_buffer_ = false)
: ReadBuffer(nullptr, 0)
, uri {uri_}
, method {!method_.empty() ? method_ : out_stream_callback_ ? Poco::Net::HTTPRequest::HTTP_POST : Poco::Net::HTTPRequest::HTTP_GET}
@ -167,9 +177,17 @@ namespace detail
, credentials {credentials_}
, http_header_entries {http_header_entries_}
, remote_host_filter {remote_host_filter_}
, buffer_size {buffer_size_}
, settings {settings_}
, use_external_buffer {use_external_buffer_}
{
Poco::Net::HTTPResponse response;
initialize();
}
void initialize()
{
Poco::Net::HTTPResponse response;
istr = call(uri, response);
while (isRedirect(response.getStatus()))
@ -184,7 +202,17 @@ namespace detail
try
{
impl = std::make_unique<ReadBufferFromIStream>(*istr, buffer_size_);
impl = std::make_unique<ReadBufferFromIStream>(*istr, buffer_size);
if (use_external_buffer)
{
/**
* See comment 30 lines below.
*/
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
}
catch (const Poco::Exception & e)
{
@ -200,10 +228,33 @@ namespace detail
{
if (next_callback)
next_callback(count());
if (use_external_buffer)
{
/**
* use_external_buffer means we read into a buffer which
* was passed to us from somewhere else. We do not check whether
* the previously returned buffer was read or not (no hasPendingData() check is needed),
* because this branch means we are prefetching data:
* each nextImpl() call may fill a different buffer.
*/
impl->set(internal_buffer.begin(), internal_buffer.size());
assert(working_buffer.begin() != nullptr);
assert(!internal_buffer.empty());
}
else
{
/**
* impl was initialized before, pass position() to it to make
* sure there is no pending data which was not read.
*/
if (!working_buffer.empty())
impl->position() = position();
}
if (!impl->next())
return false;
internal_buffer = impl->buffer();
working_buffer = internal_buffer;
return true;
@ -268,10 +319,13 @@ public:
const UInt64 max_redirects = 0,
const Poco::Net::HTTPBasicCredentials & credentials_ = {},
size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
const ReadSettings & settings_ = {},
const HTTPHeaderEntries & http_header_entries_ = {},
const RemoteHostFilter & remote_host_filter_ = {})
const RemoteHostFilter & remote_host_filter_ = {},
bool use_external_buffer_ = false)
: Parent(std::make_shared<UpdatableSession>(uri_, timeouts, max_redirects),
uri_, method_, out_stream_callback_, credentials_, buffer_size_, http_header_entries_, remote_host_filter_)
uri_, method_, out_stream_callback_, credentials_, buffer_size_,
settings_, http_header_entries_, remote_host_filter_, use_external_buffer_)
{
}
};

View File

@ -17,6 +17,8 @@ public:
off_t seek(off_t off, int whence) override;
void prefetch() override { impl->prefetch(); }
private:
UInt64 min_bytes_for_seek; /// Minimum positive seek offset which shall be executed using seek operation.
};

View File

@ -33,4 +33,5 @@ public:
virtual off_t getPosition() = 0;
};
using SeekableReadBufferPtr = std::shared_ptr<SeekableReadBuffer>;
}

View File

@ -36,7 +36,7 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
size_t estimated_size = size.has_value() ? *size : 0;
if (!existing_memory
&& settings.local_fs_method == ReadMethod::mmap
&& settings.local_fs_method == LocalFSReadMethod::mmap
&& settings.mmap_threshold
&& settings.mmap_cache
&& estimated_size >= settings.mmap_threshold)
@ -58,21 +58,21 @@ std::unique_ptr<ReadBufferFromFileBase> createReadBufferFromFileBase(
{
std::unique_ptr<ReadBufferFromFileBase> res;
if (settings.local_fs_method == ReadMethod::read)
if (settings.local_fs_method == LocalFSReadMethod::read)
{
res = std::make_unique<ReadBufferFromFile>(filename, buffer_size, actual_flags, existing_memory, alignment);
}
else if (settings.local_fs_method == ReadMethod::pread || settings.local_fs_method == ReadMethod::mmap)
else if (settings.local_fs_method == LocalFSReadMethod::pread || settings.local_fs_method == LocalFSReadMethod::mmap)
{
res = std::make_unique<ReadBufferFromFilePReadWithDescriptorsCache>(filename, buffer_size, actual_flags, existing_memory, alignment);
}
else if (settings.local_fs_method == ReadMethod::pread_fake_async)
else if (settings.local_fs_method == LocalFSReadMethod::pread_fake_async)
{
static AsynchronousReaderPtr reader = std::make_shared<SynchronousReader>();
res = std::make_unique<AsynchronousReadBufferFromFileWithDescriptorsCache>(
reader, settings.priority, filename, buffer_size, actual_flags, existing_memory, alignment);
}
else if (settings.local_fs_method == ReadMethod::pread_threadpool)
else if (settings.local_fs_method == LocalFSReadMethod::pread_threadpool)
{
static AsynchronousReaderPtr reader = std::make_shared<ThreadPoolReader>(16, 1000000);
res = std::make_unique<AsynchronousReadBufferFromFileWithDescriptorsCache>(

View File

@ -1970,6 +1970,9 @@ zkutil::ZooKeeperPtr Context::getAuxiliaryZooKeeper(const String & name) const
auto zookeeper = shared->auxiliary_zookeepers.find(name);
if (zookeeper == shared->auxiliary_zookeepers.end())
{
if (name.find(':') != std::string::npos || name.find('/') != std::string::npos)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid auxiliary ZooKeeper name {}: ':' and '/' are not allowed", name);
const auto & config = shared->auxiliary_zookeepers_config ? *shared->auxiliary_zookeepers_config : getConfigRef();
if (!config.has("auxiliary_zookeepers." + name))
throw Exception(
@ -3062,16 +3065,23 @@ ReadSettings Context::getReadSettings() const
std::string_view read_method_str = settings.local_filesystem_read_method.value;
if (auto opt_method = magic_enum::enum_cast<ReadMethod>(read_method_str))
if (auto opt_method = magic_enum::enum_cast<LocalFSReadMethod>(read_method_str))
res.local_fs_method = *opt_method;
else
throw Exception(ErrorCodes::UNKNOWN_READ_METHOD, "Unknown read method '{}'", read_method_str);
throw Exception(ErrorCodes::UNKNOWN_READ_METHOD, "Unknown read method '{}' for local filesystem", read_method_str);
read_method_str = settings.remote_filesystem_read_method.value;
if (auto opt_method = magic_enum::enum_cast<RemoteFSReadMethod>(read_method_str))
res.remote_fs_method = *opt_method;
else
throw Exception(ErrorCodes::UNKNOWN_READ_METHOD, "Unknown read method '{}' for remote filesystem", read_method_str);
res.local_fs_prefetch = settings.local_filesystem_read_prefetch;
res.remote_fs_prefetch = settings.remote_filesystem_read_prefetch;
res.remote_fs_backoff_threshold = settings.remote_fs_read_backoff_threshold;
res.remote_fs_backoff_max_tries = settings.remote_fs_read_backoff_max_tries;
res.remote_fs_read_max_backoff_ms = settings.remote_fs_read_max_backoff_ms;
res.remote_fs_read_backoff_max_tries = settings.remote_fs_read_backoff_max_tries;
res.local_fs_buffer_size = settings.max_read_buffer_size;
res.direct_io_threshold = settings.min_bytes_to_use_direct_io;

View File

@ -14,7 +14,7 @@ class ASTStorage;
M(Milliseconds, poll_timeout_ms, 0, "Timeout for single poll from StorageFileLog.", 0) \
M(UInt64, poll_max_batch_size, 0, "Maximum number of messages to be polled in a single StorageFileLog poll.", 0) \
M(UInt64, max_block_size, 0, "Number of rows collected by poll(s) for flushing data from StorageFileLog.", 0) \
M(UInt64, max_threads, 8, "Number of max threads to parse files, default is 8", 0) \
M(UInt64, max_threads, 0, "Number of max threads to parse files, default is 0, which means the number will be max(1, physical_cpu_cores / 4)", 0) \
M(Milliseconds, poll_directory_watch_events_backoff_init, 500, "The initial sleep value for watch directory thread.", 0) \
M(Milliseconds, poll_directory_watch_events_backoff_max, 32000, "The max sleep value for watch directory thread.", 0) \
M(UInt64, poll_directory_watch_events_backoff_factor, 2, "The speed of backoff, exponential by default", 0)

View File

@ -750,7 +750,12 @@ void registerStorageFileLog(StorageFactory & factory)
auto physical_cpu_cores = getNumberOfPhysicalCPUCores();
auto num_threads = filelog_settings->max_threads.value;
if (num_threads > physical_cpu_cores)
if (!num_threads) /// Default
{
num_threads = std::max(unsigned(1), physical_cpu_cores / 4);
filelog_settings->set("max_threads", num_threads);
}
else if (num_threads > physical_cpu_cores)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Number of threads to parse files can not be bigger than {}", physical_cpu_cores);
}
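For example, on a host with 16 physical cores leaving max_threads at 0 yields max(1, 16 / 4) = 4 parsing threads, while explicitly requesting 32 threads on that host is rejected with BAD_ARGUMENTS.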

View File

@ -15,6 +15,7 @@ namespace ErrorCodes
extern const int CANNOT_OPEN_FILE;
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
ReadBufferFromHDFS::~ReadBufferFromHDFS() = default;
@ -33,16 +34,18 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<S
off_t offset = 0;
bool initialized = false;
off_t read_until_position = 0;
explicit ReadBufferFromHDFSImpl(
const std::string & hdfs_uri_,
const std::string & hdfs_file_path_,
const Poco::Util::AbstractConfiguration & config_,
size_t buf_size_)
size_t buf_size_, size_t read_until_position_)
: BufferWithOwnMemory<SeekableReadBuffer>(buf_size_)
, hdfs_uri(hdfs_uri_)
, hdfs_file_path(hdfs_file_path_)
, builder(createHDFSBuilder(hdfs_uri_, config_))
, read_until_position(read_until_position_)
{
std::lock_guard lock(hdfs_init_mutex);
@ -79,7 +82,23 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl : public BufferWithOwnMemory<S
initialized = true;
}
int bytes_read = hdfsRead(fs.get(), fin, internal_buffer.begin(), internal_buffer.size());
size_t num_bytes_to_read;
if (read_until_position)
{
if (read_until_position == offset)
return false;
if (read_until_position < offset)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
num_bytes_to_read = read_until_position - offset;
}
else
{
num_bytes_to_read = internal_buffer.size();
}
int bytes_read = hdfsRead(fs.get(), fin, internal_buffer.begin(), num_bytes_to_read);
if (bytes_read < 0)
throw Exception(ErrorCodes::NETWORK_ERROR,
"Fail to read from HDFS: {}, file path: {}. Error: {}",
@ -125,9 +144,9 @@ ReadBufferFromHDFS::ReadBufferFromHDFS(
const String & hdfs_uri_,
const String & hdfs_file_path_,
const Poco::Util::AbstractConfiguration & config_,
size_t buf_size_)
size_t buf_size_, size_t read_until_position_)
: SeekableReadBuffer(nullptr, 0)
, impl(std::make_unique<ReadBufferFromHDFSImpl>(hdfs_uri_, hdfs_file_path_, config_, buf_size_))
, impl(std::make_unique<ReadBufferFromHDFSImpl>(hdfs_uri_, hdfs_file_path_, config_, buf_size_, read_until_position_))
{
}

View File

@ -25,7 +25,9 @@ struct ReadBufferFromHDFSImpl;
public:
ReadBufferFromHDFS(const String & hdfs_uri_, const String & hdfs_file_path_,
const Poco::Util::AbstractConfiguration & config_, size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE);
const Poco::Util::AbstractConfiguration & config_,
size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE,
size_t read_until_position_ = 0);
~ReadBufferFromHDFS() override;

View File

@ -97,6 +97,8 @@ public:
virtual bool isStoredOnDisk() const = 0;
virtual bool isStoredOnRemoteDisk() const = 0;
virtual bool supportsVerticalMerge() const { return false; }
/// NOTE: Returns zeros if column files are not found in checksums.

View File

@ -30,8 +30,10 @@ public:
const ValueSizeMap & avg_value_size_hints_ = ValueSizeMap{});
/// Returns the number of rows that have been read, or zero if there are no columns to read.
/// If continue_reading is true, continue reading from last state, otherwise seek to from_mark
virtual size_t readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) = 0;
/// If continue_reading is true, continue reading from last state, otherwise seek to from_mark.
/// current_task_last_mark is needed for asynchronous reading (mainly from remote fs).
virtual size_t readRows(size_t from_mark, size_t current_task_last_mark,
bool continue_reading, size_t max_rows_to_read, Columns & res_columns) = 0;
virtual bool canReadIncompleteGranules() const = 0;

View File

@ -0,0 +1,14 @@
#include "MarkRange.h"
namespace DB
{
size_t getLastMark(const MarkRanges & ranges)
{
size_t current_task_last_mark = 0;
for (const auto & mark_range : ranges)
current_task_last_mark = std::max(current_task_last_mark, mark_range.end);
return current_task_last_mark;
}
}
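A tiny illustration of the helper above, with made-up ranges:
DB::MarkRanges ranges{{0, 8}, {16, 24}, {10, 12}};
size_t last_mark = DB::getLastMark(ranges);   /// 24 -- the maximum range.end across the task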

View File

@ -8,7 +8,8 @@ namespace DB
{
/** A pair of marks that defines the range of rows in a part. Specifically, the range has the form [begin * index_granularity, end * index_granularity).
/** A pair of marks that defines the range of rows in a part. Specifically,
* the range has the form [begin * index_granularity, end * index_granularity).
*/
struct MarkRange
{
@ -21,5 +22,8 @@ struct MarkRange
using MarkRanges = std::deque<MarkRange>;
/** Get max range.end from ranges.
*/
size_t getLastMark(const MarkRanges & ranges);
}

View File

@ -182,6 +182,11 @@ void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata) cons
}
}
bool MergeTreeDataPartCompact::isStoredOnRemoteDisk() const
{
return volume->getDisk()->isRemote();
}
MergeTreeDataPartCompact::~MergeTreeDataPartCompact()
{
removeIfNeeded();

View File

@ -56,6 +56,8 @@ public:
bool isStoredOnDisk() const override { return true; }
bool isStoredOnRemoteDisk() const override;
bool hasColumnFiles(const NameAndTypePair & column) const override;
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return DATA_FILE_NAME; }

View File

@ -44,6 +44,7 @@ public:
const MergeTreeIndexGranularity & computed_index_granularity) const override;
bool isStoredOnDisk() const override { return false; }
bool isStoredOnRemoteDisk() const override { return false; }
bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.name); }
String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; }
void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override;

View File

@ -142,6 +142,11 @@ void MergeTreeDataPartWide::loadIndexGranularity()
index_granularity.setInitialized();
}
bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const
{
return volume->getDisk()->isRemote();
}
MergeTreeDataPartWide::~MergeTreeDataPartWide()
{
removeIfNeeded();

View File

@ -50,6 +50,8 @@ public:
bool isStoredOnDisk() const override { return true; }
bool isStoredOnRemoteDisk() const override;
bool supportsVerticalMerge() const override { return true; }
String getFileNameForColumn(const NameAndTypePair & column) const override;

View File

@ -54,6 +54,7 @@ MergeTreeIndexReader::MergeTreeIndexReader(
std::move(settings));
version = index_format.version;
stream->adjustForRange(MarkRange(0, getLastMark(all_mark_ranges_)));
stream->seekToStart();
}

View File

@ -54,9 +54,21 @@ static void filterColumns(Columns & columns, const ColumnPtr & filter)
}
static size_t getLastMark(const MergeTreeRangeReader::ReadResult::RangesInfo & ranges)
{
size_t current_task_last_mark = 0;
for (const auto & mark_range : ranges)
current_task_last_mark = std::max(current_task_last_mark, mark_range.range.end);
return current_task_last_mark;
}
MergeTreeRangeReader::DelayedStream::DelayedStream(
size_t from_mark, IMergeTreeReader * merge_tree_reader_)
size_t from_mark,
size_t current_task_last_mark_,
IMergeTreeReader * merge_tree_reader_)
: current_mark(from_mark), current_offset(0), num_delayed_rows(0)
, current_task_last_mark(current_task_last_mark_)
, merge_tree_reader(merge_tree_reader_)
, index_granularity(&(merge_tree_reader->data_part->index_granularity))
, continue_reading(false), is_finished(false)
@ -73,7 +85,8 @@ size_t MergeTreeRangeReader::DelayedStream::readRows(Columns & columns, size_t n
{
if (num_rows)
{
size_t rows_read = merge_tree_reader->readRows(current_mark, continue_reading, num_rows, columns);
size_t rows_read = merge_tree_reader->readRows(
current_mark, current_task_last_mark, continue_reading, num_rows, columns);
continue_reading = true;
/// Zero rows_read may be either because reading has finished
@ -151,13 +164,13 @@ size_t MergeTreeRangeReader::DelayedStream::finalize(Columns & columns)
MergeTreeRangeReader::Stream::Stream(
size_t from_mark, size_t to_mark, IMergeTreeReader * merge_tree_reader_)
size_t from_mark, size_t to_mark, size_t current_task_last_mark, IMergeTreeReader * merge_tree_reader_)
: current_mark(from_mark), offset_after_current_mark(0)
, last_mark(to_mark)
, merge_tree_reader(merge_tree_reader_)
, index_granularity(&(merge_tree_reader->data_part->index_granularity))
, current_mark_index_granularity(index_granularity->getMarkRows(from_mark))
, stream(from_mark, merge_tree_reader)
, stream(from_mark, current_task_last_mark, merge_tree_reader)
{
size_t marks_count = index_granularity->getMarksCount();
if (from_mark >= marks_count)
@ -280,9 +293,9 @@ void MergeTreeRangeReader::ReadResult::adjustLastGranule()
throw Exception("Can't adjust last granule because no granules were added.", ErrorCodes::LOGICAL_ERROR);
if (num_rows_to_subtract > rows_per_granule.back())
throw Exception("Can't adjust last granule because it has " + toString(rows_per_granule.back())
+ " rows, but try to subtract " + toString(num_rows_to_subtract) + " rows.",
ErrorCodes::LOGICAL_ERROR);
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Can't adjust last granule because it has {} rows, but try to subtract {} rows.",
toString(rows_per_granule.back()), toString(num_rows_to_subtract));
rows_per_granule.back() -= num_rows_to_subtract;
total_rows_per_granule -= num_rows_to_subtract;
@ -750,6 +763,8 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t
ReadResult result;
result.columns.resize(merge_tree_reader->getColumns().size());
size_t current_task_last_mark = getLastMark(ranges);
/// The stream is lazy. result.num_added_rows is the number of rows added to the block, which is not equal to
/// result.num_rows_read until stream.finalize() is called. Also, result.num_added_rows may be less than
/// result.num_rows_read if the last granule in the range is also the last in the part (so we have to adjust the last granule).
@ -760,7 +775,7 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::startReadingChain(size_t
if (stream.isFinished())
{
result.addRows(stream.finalize(result.columns));
stream = Stream(ranges.front().begin, ranges.front().end, merge_tree_reader);
stream = Stream(ranges.front().begin, ranges.front().end, current_task_last_mark, merge_tree_reader);
result.addRange(ranges.front());
ranges.pop_front();
}
@ -807,6 +822,7 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t &
const auto & rows_per_granule = result.rowsPerGranule();
const auto & started_ranges = result.startedRanges();
size_t current_task_last_mark = getLastMark(started_ranges);
size_t next_range_to_start = 0;
auto size = rows_per_granule.size();
@ -818,7 +834,7 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t &
num_rows += stream.finalize(columns);
const auto & range = started_ranges[next_range_to_start].range;
++next_range_to_start;
stream = Stream(range.begin, range.end, merge_tree_reader);
stream = Stream(range.begin, range.end, current_task_last_mark, merge_tree_reader);
}
bool last = i + 1 == size;

View File

@ -62,7 +62,7 @@ public:
{
public:
DelayedStream() = default;
DelayedStream(size_t from_mark, IMergeTreeReader * merge_tree_reader);
DelayedStream(size_t from_mark, size_t current_task_last_mark_, IMergeTreeReader * merge_tree_reader);
/// Read @num_rows rows from @from_mark starting from @offset row
/// Returns the number of rows added to block.
@ -81,6 +81,8 @@ public:
size_t current_offset = 0;
/// Num of rows we have to read
size_t num_delayed_rows = 0;
/// Last mark from all ranges of current task.
size_t current_task_last_mark = 0;
/// Actual reader of data from disk
IMergeTreeReader * merge_tree_reader = nullptr;
@ -99,7 +101,8 @@ public:
{
public:
Stream() = default;
Stream(size_t from_mark, size_t to_mark, IMergeTreeReader * merge_tree_reader);
Stream(size_t from_mark, size_t to_mark,
size_t current_task_last_mark, IMergeTreeReader * merge_tree_reader);
/// Returns the number of rows added to block.
size_t read(Columns & columns, size_t num_rows, bool skip_remaining_rows_in_current_granule);
@ -122,6 +125,7 @@ public:
/// Invariant: offset_after_current_mark + skipped_rows_after_offset < index_granularity
size_t offset_after_current_mark = 0;
/// Last mark in current range.
size_t last_mark = 0;
IMergeTreeReader * merge_tree_reader = nullptr;

View File

@ -88,8 +88,11 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(const size_t min_marks_to_read,
auto & part = parts_with_idx[part_idx];
auto & marks_in_part = thread_tasks.sum_marks_in_parts.back();
/// Get whole part to read if it is small enough.
auto need_marks = std::min(marks_in_part, min_marks_to_read);
size_t need_marks;
if (is_part_on_remote_disk[part_idx]) /// For better performance with remote disks
need_marks = marks_in_part;
else /// Get whole part to read if it is small enough.
need_marks = std::min(marks_in_part, min_marks_to_read);
/// Do not leave too little rows in part for next time.
if (marks_in_part > need_marks &&
@ -190,10 +193,12 @@ std::vector<size_t> MergeTreeReadPool::fillPerPartInfo(const RangesInDataParts &
{
std::vector<size_t> per_part_sum_marks;
Block sample_block = metadata_snapshot->getSampleBlock();
is_part_on_remote_disk.resize(parts.size());
for (const auto i : collections::range(0, parts.size()))
{
const auto & part = parts[i];
is_part_on_remote_disk[i] = part.data_part->isStoredOnRemoteDisk();
/// Read marks for every data part.
size_t sum_marks = 0;

View File

@ -135,6 +135,8 @@ private:
mutable std::mutex mutex;
Poco::Logger * log = &Poco::Logger::get("MergeTreeReadPool");
std::vector<bool> is_part_on_remote_disk;
};
using MergeTreeReadPoolPtr = std::shared_ptr<MergeTreeReadPool>;

View File

@ -120,7 +120,8 @@ MergeTreeReaderCompact::MergeTreeReaderCompact(
}
}
size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
size_t MergeTreeReaderCompact::readRows(
size_t from_mark, size_t current_task_last_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
{
if (continue_reading)
from_mark = next_mark;
@ -156,7 +157,7 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
auto & column = res_columns[pos];
size_t column_size_before_reading = column->size();
readData(column_from_part, column, from_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]);
readData(column_from_part, column, from_mark, current_task_last_mark, *column_positions[pos], rows_to_read, read_only_offsets[pos]);
size_t read_rows_in_column = column->size() - column_size_before_reading;
if (read_rows_in_column != rows_to_read)
@ -191,7 +192,7 @@ size_t MergeTreeReaderCompact::readRows(size_t from_mark, bool continue_reading,
void MergeTreeReaderCompact::readData(
const NameAndTypePair & name_and_type, ColumnPtr & column,
size_t from_mark, size_t column_position, size_t rows_to_read, bool only_offsets)
size_t from_mark, size_t current_task_last_mark, size_t column_position, size_t rows_to_read, bool only_offsets)
{
const auto & [name, type] = name_and_type;
@ -203,6 +204,8 @@ void MergeTreeReaderCompact::readData(
if (only_offsets && (substream_path.size() != 1 || substream_path[0].type != ISerialization::Substream::ArraySizes))
return nullptr;
/// For asynchronous reading from remote fs.
data_buffer->setReadUntilPosition(marks_loader.getMark(current_task_last_mark).offset_in_compressed_file);
return data_buffer;
};

View File

@ -32,7 +32,8 @@ public:
/// Returns the number of rows that have been read, or zero if there are no columns to read.
/// If continue_reading is true, continue reading from last state, otherwise seek to from_mark
size_t readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
size_t readRows(size_t from_mark, size_t current_task_last_mark,
bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
bool canReadIncompleteGranules() const override { return false; }
@ -57,7 +58,7 @@ private:
void seekToMark(size_t row_index, size_t column_index);
void readData(const NameAndTypePair & name_and_type, ColumnPtr & column, size_t from_mark,
size_t column_position, size_t rows_to_read, bool only_offsets);
size_t current_task_last_mark, size_t column_position, size_t rows_to_read, bool only_offsets);
/// Returns maximal value of granule size in compressed file from @mark_ranges.
/// This value is used as size of read buffer.

View File

@ -37,7 +37,8 @@ MergeTreeReaderInMemory::MergeTreeReaderInMemory(
}
}
size_t MergeTreeReaderInMemory::readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
size_t MergeTreeReaderInMemory::readRows(
size_t from_mark, size_t /* current_task_last_mark */, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
{
if (!continue_reading)
total_rows_read = 0;

View File

@ -23,7 +23,8 @@ public:
/// Returns the number of rows that have been read, or zero if there are no columns to read.
/// If continue_reading is true, continue reading from last state, otherwise seek to from_mark
size_t readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
size_t readRows(size_t from_mark, size_t current_task_last_mark,
bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
bool canReadIncompleteGranules() const override { return true; }

View File

@ -1,6 +1,7 @@
#include <Storages/MergeTree/MergeTreeReaderStream.h>
#include <Compression/CachedCompressedReadBuffer.h>
#include <base/getThreadId.h>
#include <utility>
@ -13,18 +14,22 @@ namespace ErrorCodes
extern const int CANNOT_READ_ALL_DATA;
}
MergeTreeReaderStream::MergeTreeReaderStream(
DiskPtr disk_,
const String & path_prefix_, const String & data_file_extension_, size_t marks_count_,
const MarkRanges & all_mark_ranges,
const MergeTreeReaderSettings & settings,
MarkCache * mark_cache_,
UncompressedCache * uncompressed_cache, size_t file_size,
UncompressedCache * uncompressed_cache, size_t file_size_,
const MergeTreeIndexGranularityInfo * index_granularity_info_,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type)
: disk(std::move(disk_)), path_prefix(path_prefix_), data_file_extension(data_file_extension_), marks_count(marks_count_)
, mark_cache(mark_cache_), save_marks_in_cache(settings.save_marks_in_cache)
: disk(std::move(disk_))
, path_prefix(path_prefix_)
, data_file_extension(data_file_extension_)
, marks_count(marks_count_)
, file_size(file_size_)
, mark_cache(mark_cache_)
, save_marks_in_cache(settings.save_marks_in_cache)
, index_granularity_info(index_granularity_info_)
, marks_loader(disk, mark_cache, index_granularity_info->getMarksFilePath(path_prefix),
marks_count, *index_granularity_info, save_marks_in_cache)
@ -37,35 +42,7 @@ MergeTreeReaderStream::MergeTreeReaderStream(
{
size_t left_mark = mark_range.begin;
size_t right_mark = mark_range.end;
/// NOTE: if we are reading the whole file, then right_mark == marks_count
/// and we will use max_read_buffer_size for buffer size, thus avoiding the need to load marks.
/// If the end of range is inside the block, we will need to read it too.
if (right_mark < marks_count && marks_loader.getMark(right_mark).offset_in_decompressed_block > 0)
{
auto indices = collections::range(right_mark, marks_count);
auto it = std::upper_bound(indices.begin(), indices.end(), right_mark, [this](size_t i, size_t j)
{
return marks_loader.getMark(i).offset_in_compressed_file < marks_loader.getMark(j).offset_in_compressed_file;
});
right_mark = (it == indices.end() ? marks_count : *it);
}
size_t mark_range_bytes;
/// If there are no marks after the end of range, just use file size
if (right_mark >= marks_count
|| (right_mark + 1 == marks_count
&& marks_loader.getMark(right_mark).offset_in_compressed_file == marks_loader.getMark(mark_range.end).offset_in_compressed_file))
{
mark_range_bytes = file_size - (left_mark < marks_count ? marks_loader.getMark(left_mark).offset_in_compressed_file : 0);
}
else
{
mark_range_bytes = marks_loader.getMark(right_mark).offset_in_compressed_file - marks_loader.getMark(left_mark).offset_in_compressed_file;
}
auto [right_offset, mark_range_bytes] = getRightOffsetAndBytesRange(left_mark, right_mark);
max_mark_range_bytes = std::max(max_mark_range_bytes, mark_range_bytes);
sum_mark_range_bytes += mark_range_bytes;
@ -78,6 +55,7 @@ MergeTreeReaderStream::MergeTreeReaderStream(
/// Avoid empty buffer. May happen while reading dictionary for DataTypeLowCardinality.
/// For example: part has single dictionary and all marks point to the same position.
ReadSettings read_settings = settings.read_settings;
read_settings.must_read_until_position = true;
if (max_mark_range_bytes != 0)
read_settings = read_settings.adjustBufferSize(max_mark_range_bytes);
@ -128,6 +106,45 @@ MergeTreeReaderStream::MergeTreeReaderStream(
}
std::pair<size_t, size_t> MergeTreeReaderStream::getRightOffsetAndBytesRange(size_t left_mark, size_t right_mark)
{
/// NOTE: if we are reading the whole file, then right_mark == marks_count
/// and we will use max_read_buffer_size for buffer size, thus avoiding the need to load marks.
/// If the end of range is inside the block, we will need to read it too.
size_t result_right_mark = right_mark;
if (right_mark < marks_count && marks_loader.getMark(right_mark).offset_in_decompressed_block > 0)
{
auto indices = collections::range(right_mark, marks_count);
auto it = std::upper_bound(indices.begin(), indices.end(), right_mark, [this](size_t i, size_t j)
{
return marks_loader.getMark(i).offset_in_compressed_file < marks_loader.getMark(j).offset_in_compressed_file;
});
result_right_mark = (it == indices.end() ? marks_count : *it);
}
size_t right_offset;
size_t mark_range_bytes;
/// If there are no marks after the end of range, just use file size
if (result_right_mark >= marks_count
|| (result_right_mark + 1 == marks_count
&& marks_loader.getMark(result_right_mark).offset_in_compressed_file == marks_loader.getMark(right_mark).offset_in_compressed_file))
{
right_offset = file_size;
mark_range_bytes = right_offset - (left_mark < marks_count ? marks_loader.getMark(left_mark).offset_in_compressed_file : 0);
}
else
{
right_offset = marks_loader.getMark(result_right_mark).offset_in_compressed_file;
mark_range_bytes = right_offset - marks_loader.getMark(left_mark).offset_in_compressed_file;
}
return std::make_pair(right_offset, mark_range_bytes);
}
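For illustration, a self-contained sketch of the mark-search logic above, assuming marks are stored in a plain vector and ignoring the sentinel-mark special case; all names here are hypothetical.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

/// Hypothetical simplified mark: offset of the compressed block in the file
/// and offset of the row inside the decompressed block.
struct MarkSketch
{
    size_t offset_in_compressed_file;
    size_t offset_in_decompressed_block;
};

/// Simplified restatement of the logic above: if the end of the range points inside
/// a compressed block, extend it to the first mark that starts a new compressed block;
/// the byte range to read is then [marks[left].offset, right_offset).
std::pair<size_t, size_t> getRightOffsetAndBytesRangeSketch(
    const std::vector<MarkSketch> & marks, size_t file_size, size_t left_mark, size_t right_mark)
{
    size_t marks_count = marks.size();
    size_t result_right_mark = right_mark;

    if (right_mark < marks_count && marks[right_mark].offset_in_decompressed_block > 0)
    {
        auto it = std::upper_bound(
            marks.begin() + right_mark, marks.end(), marks[right_mark].offset_in_compressed_file,
            [](size_t offset, const MarkSketch & m) { return offset < m.offset_in_compressed_file; });
        result_right_mark = it == marks.end() ? marks_count : static_cast<size_t>(it - marks.begin());
    }

    size_t left_offset = left_mark < marks_count ? marks[left_mark].offset_in_compressed_file : 0;
    size_t right_offset = result_right_mark < marks_count
        ? marks[result_right_mark].offset_in_compressed_file
        : file_size;

    return {right_offset, right_offset - left_offset};
}

int main()
{
    /// Three compressed blocks starting at offsets 0, 100 and 220; the second block
    /// holds two granules (marks 1 and 2 share the same compressed-file offset).
    std::vector<MarkSketch> marks = {{0, 0}, {100, 0}, {100, 4096}, {220, 0}};
    auto [right_offset, bytes] = getRightOffsetAndBytesRangeSketch(marks, 300, 0, 2);
    std::cout << right_offset << ' ' << bytes << '\n';  /// 220 220: the whole second block is read
}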
void MergeTreeReaderStream::seekToMark(size_t index)
{
MarkInCompressedFile mark = marks_loader.getMark(index);
@ -172,4 +189,25 @@ void MergeTreeReaderStream::seekToStart()
}
}
void MergeTreeReaderStream::adjustForRange(MarkRange range)
{
auto [right_offset, mark_range_bytes] = getRightOffsetAndBytesRange(range.begin, range.end);
if (!right_offset)
{
if (cached_buffer)
cached_buffer->setReadUntilEnd();
if (non_cached_buffer)
non_cached_buffer->setReadUntilEnd();
}
else if (right_offset > last_right_offset)
{
last_right_offset = right_offset;
if (cached_buffer)
cached_buffer->setReadUntilPosition(last_right_offset);
if (non_cached_buffer)
non_cached_buffer->setReadUntilPosition(last_right_offset);
}
}
}
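A minimal sketch of the monotonic read-until bookkeeping that adjustForRange implements, with hypothetical names and no real buffers.
#include <cstddef>
#include <iostream>

/// Simplified model of the bookkeeping above: the right boundary only ever grows,
/// so a buffer is never asked to shrink the region it was already told to read;
/// a right offset of 0 means "read until the end of the file".
struct ReadUntilTrackerSketch
{
    size_t last_right_offset = 0;
    bool read_until_end = false;

    void adjustForRange(size_t right_offset)
    {
        if (right_offset == 0)
            read_until_end = true;
        else if (right_offset > last_right_offset)
            last_right_offset = right_offset;
    }
};

int main()
{
    ReadUntilTrackerSketch tracker;
    tracker.adjustForRange(220);
    tracker.adjustForRange(100);  /// smaller range: the boundary stays at 220
    std::cout << tracker.last_right_offset << '\n';  /// 220
}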

View File

@ -23,25 +23,36 @@ public:
const MarkRanges & all_mark_ranges,
const MergeTreeReaderSettings & settings_,
MarkCache * mark_cache, UncompressedCache * uncompressed_cache,
size_t file_size, const MergeTreeIndexGranularityInfo * index_granularity_info_,
size_t file_size_, const MergeTreeIndexGranularityInfo * index_granularity_info_,
const ReadBufferFromFileBase::ProfileCallback & profile_callback, clockid_t clock_type);
void seekToMark(size_t index);
void seekToStart();
/**
 * Lets the underlying buffers know the bounds of the mark range that is going to be read
 * (in case of MergeTree* tables). Mostly needed for reading from a remote fs.
 */
void adjustForRange(MarkRange range);
ReadBuffer * data_buffer;
private:
std::pair<size_t, size_t> getRightOffsetAndBytesRange(size_t left_mark, size_t right_mark);
DiskPtr disk;
std::string path_prefix;
std::string data_file_extension;
size_t marks_count;
size_t file_size;
MarkCache * mark_cache;
bool save_marks_in_cache;
size_t last_right_offset = 0;
const MergeTreeIndexGranularityInfo * index_granularity_info;
std::unique_ptr<CachedCompressedReadBuffer> cached_buffer;

View File

@ -61,7 +61,8 @@ MergeTreeReaderWide::MergeTreeReaderWide(
}
size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
size_t MergeTreeReaderWide::readRows(
size_t from_mark, size_t current_task_last_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns)
{
size_t read_rows = 0;
try
@ -83,7 +84,7 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
try
{
auto & cache = caches[column_from_part.getNameInStorage()];
prefetch(column_from_part, from_mark, continue_reading, cache, prefetched_streams);
prefetch(column_from_part, from_mark, continue_reading, current_task_last_mark, cache, prefetched_streams);
}
catch (Exception & e)
{
@ -113,7 +114,7 @@ size_t MergeTreeReaderWide::readRows(size_t from_mark, bool continue_reading, si
auto & cache = caches[column_from_part.getNameInStorage()];
readData(
column_from_part, column, from_mark, continue_reading,
column_from_part, column, from_mark, continue_reading, current_task_last_mark,
max_rows_to_read, cache, /* was_prefetched =*/ !prefetched_streams.empty());
/// For elements of Nested, column_size_before_reading may be greater than column size
@ -195,6 +196,7 @@ static ReadBuffer * getStream(
MergeTreeReaderWide::FileStreams & streams,
const NameAndTypePair & name_and_type,
size_t from_mark, bool seek_to_mark,
size_t current_task_last_mark,
ISerialization::SubstreamsCache & cache)
{
/// If the substream has already been read.
@ -208,6 +210,7 @@ static ReadBuffer * getStream(
return nullptr;
MergeTreeReaderStream & stream = *it->second;
stream.adjustForRange(MarkRange(seek_to_start ? 0 : from_mark, current_task_last_mark));
if (seek_to_start)
stream.seekToStart();
@ -222,6 +225,7 @@ void MergeTreeReaderWide::prefetch(
const NameAndTypePair & name_and_type,
size_t from_mark,
bool continue_reading,
size_t current_task_last_mark,
ISerialization::SubstreamsCache & cache,
std::unordered_set<std::string> & prefetched_streams)
{
@ -235,7 +239,7 @@ void MergeTreeReaderWide::prefetch(
if (!prefetched_streams.count(stream_name))
{
bool seek_to_mark = !continue_reading;
if (ReadBuffer * buf = getStream(false, substream_path, streams, name_and_type, from_mark, seek_to_mark, cache))
if (ReadBuffer * buf = getStream(false, substream_path, streams, name_and_type, from_mark, seek_to_mark, current_task_last_mark, cache))
buf->prefetch();
prefetched_streams.insert(stream_name);
@ -246,8 +250,8 @@ void MergeTreeReaderWide::prefetch(
void MergeTreeReaderWide::readData(
const NameAndTypePair & name_and_type, ColumnPtr & column,
size_t from_mark, bool continue_reading, size_t max_rows_to_read,
ISerialization::SubstreamsCache & cache, bool was_prefetched)
size_t from_mark, bool continue_reading, size_t current_task_last_mark,
size_t max_rows_to_read, ISerialization::SubstreamsCache & cache, bool was_prefetched)
{
double & avg_value_size_hint = avg_value_size_hints[name_and_type.name];
ISerialization::DeserializeBinaryBulkSettings deserialize_settings;
@ -260,7 +264,7 @@ void MergeTreeReaderWide::readData(
{
deserialize_settings.getter = [&](const ISerialization::SubstreamPath & substream_path)
{
return getStream(/* seek_to_start = */true, substream_path, streams, name_and_type, from_mark, /* seek_to_mark = */false, cache);
return getStream(/* seek_to_start = */true, substream_path, streams, name_and_type, from_mark, /* seek_to_mark = */false, current_task_last_mark, cache);
};
serialization->deserializeBinaryBulkStatePrefix(deserialize_settings, deserialize_binary_bulk_state_map[name]);
}
@ -271,7 +275,7 @@ void MergeTreeReaderWide::readData(
return getStream(
/* seek_to_start = */false, substream_path, streams, name_and_type, from_mark,
seek_to_mark, cache);
seek_to_mark, current_task_last_mark, cache);
};
deserialize_settings.continuous_reading = continue_reading;
auto & deserialize_state = deserialize_binary_bulk_state_map[name];

View File

@ -28,7 +28,8 @@ public:
/// Return the number of rows that have been read, or zero if there are no columns to read.
/// If continue_reading is true, continue reading from the last state, otherwise seek to from_mark.
size_t readRows(size_t from_mark, bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
size_t readRows(size_t from_mark, size_t current_task_last_mark,
bool continue_reading, size_t max_rows_to_read, Columns & res_columns) override;
bool canReadIncompleteGranules() const override { return true; }
@ -45,7 +46,7 @@ private:
void readData(
const NameAndTypePair & name_and_type, ColumnPtr & column,
size_t from_mark, bool continue_reading, size_t max_rows_to_read,
size_t from_mark, bool continue_reading, size_t current_task_last_mark, size_t max_rows_to_read,
ISerialization::SubstreamsCache & cache, bool was_prefetched);
/// Make the next readData call simpler by calling 'prefetch' on all related ReadBuffers (column streams).
@ -53,6 +54,7 @@ private:
const NameAndTypePair & name_and_type,
size_t from_mark,
bool continue_reading,
size_t current_task_last_mark,
ISerialization::SubstreamsCache & cache,
std::unordered_set<std::string> & prefetched_streams); /// if stream was already prefetched do nothing
};
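A hedged sketch of how a caller might drive the new readRows signature; the reader stub and the driving loop below are hypothetical, only the parameter contract is taken from the declaration above.
#include <cstddef>

/// Hypothetical reader stub mirroring the signature above, for illustration only.
struct ReaderSketch
{
    /// Pretend implementation: report that all requested rows were read.
    size_t readRows(size_t /*from_mark*/, size_t /*current_task_last_mark*/,
                    bool /*continue_reading*/, size_t max_rows_to_read)
    {
        return max_rows_to_read;
    }
};

/// Sketch of a driving loop: the first call seeks to from_mark, subsequent calls
/// continue from the previous state; current_task_last_mark bounds how far the
/// underlying streams may need to read ahead for this task.
size_t readTaskSketch(ReaderSketch & reader, size_t from_mark, size_t last_mark,
                      size_t rows_per_step, size_t steps)
{
    size_t total = 0;
    bool continue_reading = false;
    for (size_t i = 0; i < steps; ++i)
    {
        total += reader.readRows(from_mark, last_mark, continue_reading, rows_per_step);
        continue_reading = true;
    }
    return total;
}

int main()
{
    ReaderSketch reader;
    /// Read two steps of 8192 rows from a task covering marks [10, 20).
    return readTaskSketch(reader, 10, 20, 8192, 2) == 16384 ? 0 : 1;
}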

View File

@ -78,7 +78,8 @@ try
const auto & sample = reader->getColumns();
Columns columns(sample.size());
size_t rows_read = reader->readRows(current_mark, continue_reading, rows_to_read, columns);
/// TODO: pass stream size instead of zero?
size_t rows_read = reader->readRows(current_mark, 0, continue_reading, rows_to_read, columns);
if (rows_read)
{

View File

@ -192,43 +192,54 @@ zkutil::ZooKeeperPtr StorageReplicatedMergeTree::getZooKeeper() const
return res;
}
static std::string normalizeZooKeeperPath(std::string zookeeper_path)
static std::string normalizeZooKeeperPath(std::string zookeeper_path, bool check_starts_with_slash, Poco::Logger * log = nullptr)
{
if (!zookeeper_path.empty() && zookeeper_path.back() == '/')
zookeeper_path.resize(zookeeper_path.size() - 1);
/// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
if (!zookeeper_path.empty() && zookeeper_path.front() != '/')
{
/// Do not allow this for new tables, print warning for tables created in old versions
if (check_starts_with_slash)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "ZooKeeper path must start with '/', got '{}'", zookeeper_path);
if (log)
LOG_WARNING(log, "ZooKeeper path ('{}') does not start with '/'. It will not be supported in future releases", zookeeper_path);
zookeeper_path = "/" + zookeeper_path;
}
return zookeeper_path;
}
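The normalization rules above can be illustrated with a simplified standalone sketch (hypothetical name, no logging).
#include <iostream>
#include <stdexcept>
#include <string>

/// Simplified stand-in for the normalization above: drop a trailing '/', and either
/// reject or prepend a missing leading '/' depending on whether the table is being
/// created (strict) or attached from an old version (lenient).
std::string normalizeZooKeeperPathSketch(std::string path, bool check_starts_with_slash)
{
    if (!path.empty() && path.back() == '/')
        path.resize(path.size() - 1);

    if (!path.empty() && path.front() != '/')
    {
        if (check_starts_with_slash)
            throw std::invalid_argument("ZooKeeper path must start with '/', got '" + path + "'");
        path = "/" + path;  /// legacy tables: fix up silently (with a warning in the real code)
    }
    return path;
}

int main()
{
    std::cout << normalizeZooKeeperPathSketch("/clickhouse/tables/01/", true) << '\n';  /// /clickhouse/tables/01
    std::cout << normalizeZooKeeperPathSketch("clickhouse/tables/01", false) << '\n';   /// /clickhouse/tables/01
}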
static String extractZooKeeperName(const String & path)
{
static constexpr auto default_zookeeper_name = "default";
if (path.empty())
throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto pos = path.find(':');
if (pos != String::npos)
throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS);
if (path[0] == '/')
return default_zookeeper_name;
auto pos = path.find(":/");
if (pos != String::npos && pos < path.find('/'))
{
auto zookeeper_name = path.substr(0, pos);
if (zookeeper_name.empty())
throw Exception("Zookeeper path should start with '/' or '<auxiliary_zookeeper_name>:/'", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception("Zookeeper path should start with '/' or '<auxiliary_zookeeper_name>:/'", ErrorCodes::BAD_ARGUMENTS);
return zookeeper_name;
}
static constexpr auto default_zookeeper_name = "default";
return default_zookeeper_name;
}
static String extractZooKeeperPath(const String & path)
static String extractZooKeeperPath(const String & path, bool check_starts_with_slash, Poco::Logger * log = nullptr)
{
if (path.empty())
throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
auto pos = path.find(':');
if (pos != String::npos)
throw Exception("ZooKeeper path should not be empty", ErrorCodes::BAD_ARGUMENTS);
if (path[0] == '/')
return normalizeZooKeeperPath(path, check_starts_with_slash, log);
auto pos = path.find(":/");
if (pos != String::npos && pos < path.find('/'))
{
return normalizeZooKeeperPath(path.substr(pos + 1, String::npos));
return normalizeZooKeeperPath(path.substr(pos + 1, String::npos), check_starts_with_slash, log);
}
return normalizeZooKeeperPath(path);
return normalizeZooKeeperPath(path, check_starts_with_slash, log);
}
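Taken together, the two helpers above split an optional auxiliary ZooKeeper name from the path; a simplified sketch under that assumption follows (hypothetical name, normalization omitted).
#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

/// Hedged, simplified model of the two helpers above: split an optional
/// "<auxiliary_zookeeper_name>:" prefix from the path, defaulting to "default".
std::pair<std::string, std::string> splitZooKeeperAddressSketch(const std::string & path)
{
    if (path.empty())
        throw std::invalid_argument("ZooKeeper path should not be empty");

    if (path.front() == '/')
        return {"default", path};

    auto pos = path.find(":/");
    if (pos != std::string::npos && pos < path.find('/'))
    {
        if (pos == 0)
            throw std::invalid_argument("ZooKeeper path should start with '/' or '<auxiliary_zookeeper_name>:/'");
        return {path.substr(0, pos), path.substr(pos + 1)};
    }
    return {"default", path};
}

int main()
{
    auto [name, zk_path] = splitZooKeeperAddressSketch("aux:/clickhouse/tables/01/hits");
    std::cout << name << ' ' << zk_path << '\n';  /// aux /clickhouse/tables/01/hits
}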
static MergeTreePartInfo makeDummyDropRangeForMovePartitionOrAttachPartitionFrom(const String & partition_id)
@ -275,7 +286,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
attach,
[this] (const std::string & name) { enqueuePartForCheck(name); })
, zookeeper_name(extractZooKeeperName(zookeeper_path_))
, zookeeper_path(extractZooKeeperPath(zookeeper_path_))
, zookeeper_path(extractZooKeeperPath(zookeeper_path_, /* check_starts_with_slash */ !attach, log))
, replica_name(replica_name_)
, replica_path(fs::path(zookeeper_path) / "replicas" / replica_name_)
, reader(*this)
@ -5556,7 +5567,7 @@ void StorageReplicatedMergeTree::fetchPartition(
info.table_id.uuid = UUIDHelpers::Nil;
auto expand_from = query_context->getMacros()->expand(from_, info);
String auxiliary_zookeeper_name = extractZooKeeperName(expand_from);
String from = extractZooKeeperPath(expand_from);
String from = extractZooKeeperPath(expand_from, /* check_starts_with_slash */ true);
if (from.empty())
throw Exception("ZooKeeper path should not be empty", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
@ -6621,7 +6632,7 @@ void StorageReplicatedMergeTree::movePartitionToShard(
if (!move_part)
throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED);
if (normalizeZooKeeperPath(zookeeper_path) == normalizeZooKeeperPath(to))
if (normalizeZooKeeperPath(zookeeper_path, /* check_starts_with_slash */ true) == normalizeZooKeeperPath(to, /* check_starts_with_slash */ true))
throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS);
auto zookeeper = getZooKeeper();

View File

@ -234,7 +234,7 @@ bool StorageS3Source::initialize()
file_path = fs::path(bucket) / current_key;
read_buf = wrapReadBufferWithCompressionMethod(
std::make_unique<ReadBufferFromS3>(client, bucket, current_key, max_single_read_retries, DBMS_DEFAULT_BUFFER_SIZE),
std::make_unique<ReadBufferFromS3>(client, bucket, current_key, max_single_read_retries, getContext()->getReadSettings()),
chooseCompressionMethod(current_key, compression_hint));
auto input_format = getContext()->getInputFormat(format, *read_buf, sample_block, max_block_size, format_settings);
QueryPipelineBuilder builder;

View File

@ -124,6 +124,7 @@ namespace
context->getSettingsRef().max_http_get_redirects,
Poco::Net::HTTPBasicCredentials{},
DBMS_DEFAULT_BUFFER_SIZE,
context->getReadSettings(),
headers,
context->getRemoteHostFilter()),
chooseCompressionMethod(request_uri.getPath(), compression_method));

View File

@ -128,7 +128,7 @@ if __name__ == "__main__":
gh = Github(get_best_robot_token())
images_path = os.path.join(temp_path, 'changed_images.json')
images_path = os.path.join(os.getenv("IMAGES_PATH", temp_path), 'changed_images.json')
image_name = get_image_name(build_config)
image_version = 'latest'
if os.path.exists(images_path):

View File

@ -86,7 +86,6 @@ if __name__ == "__main__":
build_reports = []
for root, dirs, files in os.walk(reports_path):
print(files)
for f in files:
if f.startswith("build_urls_") and f.endswith('.json'):
logging.info("Found build report json %s", f)

View File

@ -113,7 +113,7 @@ if __name__ == "__main__":
images = json.load(images_fd)
logging.info("Got images %s", images)
if 'clickhouse/fasttest' in images:
docker_image += ':' + images['clickhouse/pvs-test']
docker_image += ':' + images['clickhouse/fasttest']
logging.info("Got docker image %s", docker_image)
for i in range(10):

Some files were not shown because too many files have changed in this diff.