diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh
index 15590902b68..0c11e0a615d 100755
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@@ -1,5 +1,5 @@
#!/bin/bash
-# shellcheck disable=SC2086,SC2001
+# shellcheck disable=SC2086,SC2001,SC2046
set -eux
set -o pipefail
@@ -13,24 +13,48 @@ script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
repo_dir=ch
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-13_debug_none_bundled_unsplitted_disable_False_binary"}
+BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"}
function clone
{
- # The download() function is dependent on CI binaries anyway, so we can take
- # the repo from the CI as well. For local runs, start directly from the "fuzz"
- # stage.
- rm -rf ch ||:
- mkdir ch ||:
- wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz"
- tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz
+ # For local runs, start directly from the "fuzz" stage.
+ rm -rf "$repo_dir" ||:
+ mkdir "$repo_dir" ||:
+
+ git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$repo_dir" 2>&1 | ts '%Y-%m-%d %H:%M:%S'
+ (
+ cd "$repo_dir"
+ if [ "$PR_TO_TEST" != "0" ]; then
+ if git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/merge"; then
+ git checkout FETCH_HEAD
+ echo "Checked out pull/$PR_TO_TEST/merge ($(git rev-parse FETCH_HEAD))"
+ else
+ git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/head"
+ git checkout "$SHA_TO_TEST"
+ echo "Checked out nominal SHA $SHA_TO_TEST for PR $PR_TO_TEST"
+ fi
+ git diff --name-only master HEAD | tee ci-changed-files.txt
+ else
+ if [ -v COMMIT_SHA ]; then
+ git fetch --depth 2 origin "$SHA_TO_TEST"
+ git checkout "$SHA_TO_TEST"
+ echo "Checked out nominal SHA $SHA_TO_TEST for master"
+ else
+ git fetch --depth 2 origin
+ echo "Using default repository head $(git rev-parse HEAD)"
+ fi
+ git diff --name-only HEAD~1 HEAD | tee ci-changed-files.txt
+ fi
+ cd -
+ )
+
ls -lath ||:
+
}
function download
{
- wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse" &
- wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/ci-changed-files.txt" &
- wait
+ wget -nv -nd -c "$BINARY_URL_TO_DOWNLOAD"
chmod +x clickhouse
ln -s ./clickhouse ./clickhouse-server
@@ -113,7 +137,7 @@ function fuzz
# Obtain the list of newly added tests. They will be fuzzed in more extreme way than other tests.
# Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment.
- NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' ci-changed-files.txt | sort -R)"
+ NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' $repo_dir/ci-changed-files.txt | sort -R)"
# ci-changed-files.txt contains also files that has been deleted/renamed, filter them out.
NEW_TESTS="$(filter_exists_and_template $NEW_TESTS)"
if [[ -n "$NEW_TESTS" ]]
diff --git a/docker/test/performance-comparison/Dockerfile b/docker/test/performance-comparison/Dockerfile
index 88b66d42ecb..73d9454ab7f 100644
--- a/docker/test/performance-comparison/Dockerfile
+++ b/docker/test/performance-comparison/Dockerfile
@@ -33,7 +33,7 @@ RUN apt-get update \
tzdata \
vim \
wget \
- && pip3 --no-cache-dir install 'git+https://github.com/mymarilyn/clickhouse-driver.git' scipy \
+ && pip3 --no-cache-dir install 'clickhouse-driver==0.2.1' scipy \
&& apt-get purge --yes python3-dev g++ \
&& apt-get autoremove --yes \
&& apt-get clean \
diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh
index c97e8a6ed2b..38595d47528 100755
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@@ -196,7 +196,6 @@ function run_tests
test_files=$(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}")
elif [ "$PR_TO_TEST" -ne 0 ] \
&& [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \
- && [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ] \
&& [ "$(wc -l < other-changed-files.txt)" -eq 0 ]
then
# If only the perf tests were changed in the PR, we will run only these
@@ -208,15 +207,15 @@ function run_tests
test_files=$(ls "$test_prefix"/*.xml)
fi
- # For PRs w/o changes in test definitons and scripts, test only a subset of
- # queries, and run them less times. If the corresponding environment variables
- # are already set, keep those values.
- if [ "$PR_TO_TEST" -ne 0 ] \
- && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] \
- && [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ]
+ # For PRs w/o changes in test definitions, test only a subset of queries,
+ # and run them fewer times. If the corresponding environment variables are
+ # already set, keep those values.
+ #
+ # NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout.
+ if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ]
then
CHPC_RUNS=${CHPC_RUNS:-7}
- CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-20}
+ CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10}
else
CHPC_RUNS=${CHPC_RUNS:-13}
CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
@@ -319,14 +318,14 @@ function get_profiles
wait
- clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_log where type = 'QueryFinish' format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
+ clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.metric_log format TSVWithNamesAndTypes" > left-metric-log.tsv ||: &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.asynchronous_metric_log format TSVWithNamesAndTypes" > left-async-metric-log.tsv ||: &
- clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_log where type = 'QueryFinish' format TSVWithNamesAndTypes" > right-query-log.tsv ||: &
+ clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > right-query-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > right-query-thread-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv ||: &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv ||: &
diff --git a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
index a9ae31bf38c..093834943a3 100644
--- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
+++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
@@ -24,6 +24,13 @@
60
+
+
+ 0
+ 0
+
+ 0
diff --git a/docker/test/performance-comparison/entrypoint.sh b/docker/test/performance-comparison/entrypoint.sh
index 1295e5567fb..d87b95b1129 100755
--- a/docker/test/performance-comparison/entrypoint.sh
+++ b/docker/test/performance-comparison/entrypoint.sh
@@ -102,7 +102,6 @@ then
base=$(git -C right/ch merge-base pr origin/master)
git -C right/ch diff --name-only "$base" pr -- . | tee all-changed-files.txt
git -C right/ch diff --name-only "$base" pr -- tests/performance | tee changed-test-definitions.txt
- git -C right/ch diff --name-only "$base" pr -- docker/test/performance-comparison | tee changed-test-scripts.txt
git -C right/ch diff --name-only "$base" pr -- :!tests/performance :!docker/test/performance-comparison | tee other-changed-files.txt
fi
diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py
index a6e7e397e32..301c5cc7d73 100755
--- a/docker/test/performance-comparison/perf.py
+++ b/docker/test/performance-comparison/perf.py
@@ -283,8 +283,11 @@ for query_index in queries_to_run:
# test coverage. We disable profiler for normal runs because
# it makes the results unstable.
res = c.execute(q, query_id = prewarm_id,
- settings = {'max_execution_time': args.max_query_seconds,
- 'query_profiler_real_time_period_ns': 10000000})
+ settings = {
+ 'max_execution_time': args.max_query_seconds,
+ 'query_profiler_real_time_period_ns': 10000000,
+ 'memory_profiler_step': '4Mi',
+ })
except clickhouse_driver.errors.Error as e:
# Add query id to the exception to make debugging easier.
e.args = (prewarm_id, *e.args)
diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md
index 6dd53fffb86..f04d4102138 100644
--- a/docs/en/introduction/adopters.md
+++ b/docs/en/introduction/adopters.md
@@ -170,5 +170,7 @@ toc_title: Adopters
| ЦФТ | Banking, Financial products, Payments | — | — | — | [Meetup in Russian, April 2020](https://team.cft.ru/events/162) |
| Цифровой Рабочий | Industrial IoT, Analytics | — | — | — | [Blog post in Russian, March 2021](https://habr.com/en/company/croc/blog/548018/) |
| ООО «МПЗ Богородский» | Agriculture | — | — | — | [Article in Russian, November 2020](https://cloud.yandex.ru/cases/okraina) |
+| ДомКлик | Real Estate | — | — | — | [Article in Russian, October 2021](https://habr.com/ru/company/domclick/blog/585936/) |
+| DeepL | Machine Learning | — | — | — | [Video, October 2021](https://www.youtube.com/watch?v=WIYJiPwxXdM&t=1182s) |
[Original article](https://clickhouse.com/docs/en/introduction/adopters/)
diff --git a/docs/en/operations/performance-test.md b/docs/en/operations/performance-test.md
index 2880793962a..a220575cb3c 100644
--- a/docs/en/operations/performance-test.md
+++ b/docs/en/operations/performance-test.md
@@ -23,7 +23,7 @@ chmod a+x ./hardware.sh
./hardware.sh
```
-3. Copy the output and send it to clickhouse-feedback@yandex-team.com
+3. Copy the output and send it to feedback@clickhouse.com
All the results are published here: https://clickhouse.com/benchmark/hardware/
diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index cdf49678570..8bb50144180 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -69,6 +69,8 @@ If no conditions met for a data part, ClickHouse uses the `lz4` compression.
```
+
+
## custom_settings_prefixes {#custom_settings_prefixes}
List of prefixes for [custom settings](../../operations/settings/index.md#custom_settings). The prefixes must be separated with commas.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 8eb6101a605..ff47aa96502 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -1751,9 +1751,11 @@ Do not merge aggregation states from different servers for distributed query pro
Possible values:
-- 0 — Disabled (final query processing is done on the initiator node).
-- 1 - Do not merge aggregation states from different servers for distributed query processing (query completelly processed on the shard, initiator only proxy the data), can be used in case it is for certain that there are different keys on different shards.
-- 2 - Same as `1` but applies `ORDER BY` and `LIMIT` (it is not possible when the query processed completelly on the remote node, like for `distributed_group_by_no_merge=1`) on the initiator (can be used for queries with `ORDER BY` and/or `LIMIT`).
+- `0` — Disabled (final query processing is done on the initiator node).
+- `1` - Do not merge aggregation states from different servers for distributed query processing (the query is completely processed on the shard, the initiator only proxies the data); can be used when it is certain that there are different keys on different shards.
+- `2` - Same as `1`, but applies `ORDER BY` and `LIMIT` on the initiator (this is not possible when the query is processed completely on the remote node, as with `distributed_group_by_no_merge=1`); can be used for queries with `ORDER BY` and/or `LIMIT`.
+
+Default value: `0`
**Example**
@@ -1784,29 +1786,27 @@ FORMAT PrettyCompactMonoBlock
└───────┘
```
-Default value: 0
+## distributed_push_down_limit {#distributed-push-down-limit}
-## distributed_push_down_limit (#distributed-push-down-limit}
-
-LIMIT will be applied on each shard separatelly.
+Enables or disables applying [LIMIT](#limit) on each shard separately.
This will allow to avoid:
+- Sending extra rows over the network;
+- Processing rows behind the limit on the initiator.
-- sending extra rows over network,
-- processing rows behind the limit on the initiator.
-
-It is possible if at least one of the following conditions met:
-
-- `distributed_group_by_no_merge` > 0
-- query **does not have `GROUP BY`/`DISTINCT`/`LIMIT BY`**, but it has `ORDER BY`/`LIMIT`.
-- query **has `GROUP BY`/`DISTINCT`/`LIMIT BY`** with `ORDER BY`/`LIMIT` and:
- - `optimize_skip_unused_shards_limit` is enabled
- - `optimize_distributed_group_by_sharding_key` is enabled
+Starting from version 21.9 you cannot get inaccurate results anymore, since `distributed_push_down_limit` changes query execution only if at least one of the following conditions is met:
+- [distributed_group_by_no_merge](#distributed-group-by-no-merge) > 0.
+- Query **does not have** `GROUP BY`/`DISTINCT`/`LIMIT BY`, but it has `ORDER BY`/`LIMIT`.
+- Query **has** `GROUP BY`/`DISTINCT`/`LIMIT BY` with `ORDER BY`/`LIMIT` and:
+ - [optimize_skip_unused_shards](#optimize-skip-unused-shards) is enabled.
+ - [optimize_distributed_group_by_sharding_key](#optimize-distributed-group-by-sharding-key) is enabled.
Possible values:
-- 0 - Disabled
-- 1 - Enabled
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: `1`.
See also:
@@ -1920,6 +1920,7 @@ Default value: 0
See also:
- [distributed_group_by_no_merge](#distributed-group-by-no-merge)
+- [distributed_push_down_limit](#distributed-push-down-limit)
- [optimize_skip_unused_shards](#optimize-skip-unused-shards)
!!! note "Note"
@@ -3831,6 +3832,21 @@ Default value: `0`.
- [optimize_move_to_prewhere](#optimize_move_to_prewhere) setting
+## describe_include_subcolumns {#describe_include_subcolumns}
+
+Enables describing subcolumns for a [DESCRIBE](../../sql-reference/statements/describe-table.md) query. For example, members of a [Tuple](../../sql-reference/data-types/tuple.md) or subcolumns of a [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) or an [Array](../../sql-reference/data-types/array.md#array-size) data type.
+
+Possible values:
+
+- 0 — Subcolumns are not included in `DESCRIBE` queries.
+- 1 — Subcolumns are included in `DESCRIBE` queries.
+
+Default value: `0`.
+
+**Example**
+
+See an example for the [DESCRIBE](../../sql-reference/statements/describe-table.md) statement.
+
## async_insert {#async-insert}
Enables or disables asynchronous inserts. This makes sense only for insertion over HTTP protocol. Note that deduplication isn't working for such inserts.
diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md
index d669b1c8b32..f8736bcc61a 100644
--- a/docs/en/sql-reference/functions/geo/s2.md
+++ b/docs/en/sql-reference/functions/geo/s2.md
@@ -2,13 +2,13 @@
toc_title: S2 Geometry
---
-# Functions for Working with S2 Index {#s2Index}
+# Functions for Working with S2 Index {#s2index}
[S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe).
-In the S2 library points are represented as unit length vectors called S2 point indices (points on the surface of a three dimensional unit sphere) as opposed to traditional (latitude, longitude) pairs.
+In the S2 library, points are represented as the S2 Index, a specific number that internally encodes a point on the surface of a unit sphere, unlike traditional (latitude, longitude) pairs. To get the S2 point index for a given point specified in the format (latitude, longitude), use the [geoToS2](#geotos2) function. You can also use the [s2ToGeo](#s2togeo) function to get the geographical coordinates corresponding to the specified S2 point index.
-## geoToS2 {#geoToS2}
+## geoToS2 {#geotos2}
Returns [S2](#s2index) point index corresponding to the provided coordinates `(longitude, latitude)`.
@@ -34,7 +34,7 @@ Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
-SELECT geoToS2(37.79506683, 55.71290588) as s2Index;
+SELECT geoToS2(37.79506683, 55.71290588) AS s2Index;
```
Result:
@@ -45,7 +45,7 @@ Result:
└─────────────────────┘
```
-## s2ToGeo {#s2ToGeo}
+## s2ToGeo {#s2togeo}
Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2index) point index.
@@ -57,20 +57,20 @@ s2ToGeo(s2index)
**Arguments**
-- `s2Index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2index` — S2 Index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- A tuple consisting of two values: `tuple(lon,lat)`.
-Type: `lon` - [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
+Type: `lon` — [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
**Example**
Query:
``` sql
-SELECT s2ToGeo(4704772434919038107) as s2Coodrinates;
+SELECT s2ToGeo(4704772434919038107) AS s2Coodrinates;
```
Result:
@@ -81,9 +81,9 @@ Result:
└──────────────────────────────────────┘
```
-## s2GetNeighbors {#s2GetNeighbors}
+## s2GetNeighbors {#s2getneighbors}
-Returns S2 neighbor indices corresponding to the provided [S2](#s2index)). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors.
+Returns S2 neighbor indexes corresponding to the provided [S2](#s2index) index. Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors.
**Syntax**
@@ -97,16 +97,16 @@ s2GetNeighbors(s2index)
**Returned values**
-- An array consisting of the 4 neighbor indices: `array[s2index1, s2index3, s2index2, s2index4]`.
+- An array consisting of 4 neighbor indexes: `array[s2index1, s2index3, s2index2, s2index4]`.
-Type: Each S2 index is [UInt64](../../../sql-reference/data-types/int-uint.md).
+Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
- select s2GetNeighbors(5074766849661468672) AS s2Neighbors;
+SELECT s2GetNeighbors(5074766849661468672) AS s2Neighbors;
```
Result:
@@ -117,9 +117,9 @@ Result:
└───────────────────────────────────────────────────────────────────────────────────┘
```
-## s2CellsIntersect {#s2CellsIntersect}
+## s2CellsIntersect {#s2cellsintersect}
-Determines if the two provided [S2](#s2index)) cell indices intersect or not.
+Determines if the two provided [S2](#s2index) cells intersect or not.
**Syntax**
@@ -133,8 +133,8 @@ s2CellsIntersect(s2index1, s2index2)
**Returned values**
-- 1 — If the S2 cell indices intersect.
-- 0 — If the S2 cell indices don't intersect.
+- 1 — If the cells intersect.
+- 0 — If the cells don't intersect.
Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
@@ -143,7 +143,7 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
- select s2CellsIntersect(9926595209846587392, 9926594385212866560) as intersect;
+SELECT s2CellsIntersect(9926595209846587392, 9926594385212866560) AS intersect;
```
Result:
@@ -154,11 +154,9 @@ Result:
└───────────┘
```
-## s2CapContains {#s2CapContains}
+## s2CapContains {#s2capcontains}
-A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
-
-Determines if a cap contains a s2 point index.
+Determines if a cap contains an S2 point. A cap represents a part of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
**Syntax**
@@ -168,9 +166,9 @@ s2CapContains(center, degrees, point)
**Arguments**
-- `center` - S2 point index corresponding to the cap. [UInt64](../../../sql-reference/data-types/int-uint.md).
- - `degrees` - Radius of the cap in degrees. [Float64](../../../sql-reference/data-types/float.md).
- - `point` - S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `center` — S2 point index corresponding to the cap. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `degrees` — Radius of the cap in degrees. [Float64](../../../sql-reference/data-types/float.md).
+- `point` — S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
@@ -184,7 +182,7 @@ Type: [UInt8](../../../sql-reference/data-types/int-uint.md).
Query:
``` sql
-select s2CapContains(1157339245694594829, 1.0, 1157347770437378819) as capContains;
+SELECT s2CapContains(1157339245694594829, 1.0, 1157347770437378819) AS capContains;
```
Result:
@@ -195,11 +193,9 @@ Result:
└─────────────┘
```
-## s2CapUnion {#s2CapUnion}
+## s2CapUnion {#s2capunion}
-A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
-
-Determines the smallest cap that contains the given two input caps.
+Determines the smallest cap that contains the given two input caps. A cap represents a portion of the sphere that has been cut off by a plane. It is defined by a point on a sphere and a radius in degrees.
**Syntax**
@@ -209,13 +205,13 @@ s2CapUnion(center1, radius1, center2, radius2)
**Arguments**
-- `center1`, `center2` - S2 point indices corresponding to the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md).
- - `radius1`, `radius2` - Radii of the two input caps in degrees. [Float64](../../../sql-reference/data-types/float.md).
+- `center1`, `center2` — S2 point indexes corresponding to the two input caps. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `radius1`, `radius2` — Radii of the two input caps in degrees. [Float64](../../../sql-reference/data-types/float.md).
**Returned values**
-- `center` - S2 point index corresponding the center of the smallest cap containing the two input caps. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- - `radius` - Radius of the smallest cap containing the two input caps. Type: [Float64](../../../sql-reference/data-types/float.md).
+- `center` — S2 point index corresponding to the center of the smallest cap containing the two input caps. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `radius` — Radius of the smallest cap containing the two input caps. Type: [Float64](../../../sql-reference/data-types/float.md).
**Example**
@@ -233,11 +229,9 @@ Result:
└────────────────────────────────────────┘
```
-## s2RectAdd{#s2RectAdd}
+## s2RectAdd {#s2rectadd}
-In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
-
-Increases the size of the bounding rectangle to include the given S2 point index.
+Increases the size of the bounding rectangle to include the given S2 point. In the S2 system, a rectangle is represented by a type of S2Region called `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@@ -247,21 +241,21 @@ s2RectAdd(s2pointLow, s2pointHigh, s2Point)
**Arguments**
-- `s2PointLow` - Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2PointHigh` - High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2Point` - Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointLow` — Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Point` — Target S2 point index that the bound rectangle should be grown to include. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
-- `s2PointLow` - Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
- - `s2PointHigh` - Hight S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/float.md).
+- `s2PointLow` — Low S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — High S2 cell id corresponding to the grown rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
-SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) as rectAdd;
+SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) AS rectAdd;
```
Result:
@@ -272,11 +266,9 @@ Result:
└───────────────────────────────────────────┘
```
-## s2RectContains{#s2RectContains}
+## s2RectContains {#s2rectcontains}
-In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
-
-Determines if a given rectangle contains a S2 point index.
+Determines if a given rectangle contains an S2 point. In the S2 system, a rectangle is represented by a type of S2Region called `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@@ -286,9 +278,9 @@ s2RectContains(s2PointLow, s2PointHi, s2Point)
**Arguments**
-- `s2PointLow` - Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2PointHigh` - High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2Point` - Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointLow` — Low S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — High S2 point index corresponding to the rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Point` — Target S2 point index. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
@@ -300,7 +292,7 @@ s2RectContains(s2PointLow, s2PointHi, s2Point)
Query:
``` sql
-SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains
+SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains;
```
Result:
@@ -311,11 +303,9 @@ Result:
└──────────────┘
```
-## s2RectUinion{#s2RectUnion}
+## s2RectUnion {#s2rectunion}
-In the S2 system, a rectangle is represented by a type of S2Region called a S2LatLngRect that represents a rectangle in latitude-longitude space.
-
-Returns the smallest rectangle containing the union of this rectangle and the given rectangle.
+Returns the smallest rectangle containing the union of this rectangle and the given rectangle. In the S2 system, a rectangle is represented by a type of S2Region called `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@@ -325,20 +315,20 @@ s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
**Arguments**
-- `s2Rect1PointLow`, `s2Rect1PointHi` - Low and High S2 point indices corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2Rect2PointLow`, `s2Rect2PointHi` - Low and High S2 point indices corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
-- `s2UnionRect2PointLow` - Low S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2UnionRect2PointHi` - High S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointHi` — High S2 cell id corresponding to the union rectangle. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
-SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion
+SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion;
```
Result:
@@ -349,9 +339,9 @@ Result:
└───────────────────────────────────────────┘
```
-## s2RectIntersection{#s2RectIntersection}
+## s2RectIntersection {#s2rectintersection}
-Returns the smallest Rectangle containing the intersection of this rectangle and the given rectangle.
+Returns the smallest rectangle containing the intersection of this rectangle and the given rectangle. In the S2 system, a rectangle is represented by a type of S2Region called `S2LatLngRect` that represents a rectangle in latitude-longitude space.
**Syntax**
@@ -361,20 +351,20 @@ s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2Poin
**Arguments**
-- `s2Rect1PointLow`, `s2Rect1PointHi` - Low and High S2 point indices corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2Rect2PointLow`, `s2Rect2PointHi` - Low and High S2 point indices corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect1PointLow`, `s2Rect1PointHi` — Low and High S2 point indexes corresponding to the first rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect2PointLow`, `s2Rect2PointHi` — Low and High S2 point indexes corresponding to the second rectangle. [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
-- `s2UnionRect2PointLow` - Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
-- `s2UnionRect2PointHi` - Hi S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointLow` — Low S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointHi` — High S2 cell id corresponding to the rectangle containing the intersection of the given rectangles. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Example**
Query:
``` sql
-SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection
+SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection;
```
Result:
diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md
index 46ce350377c..843cb16f572 100644
--- a/docs/en/sql-reference/functions/tuple-map-functions.md
+++ b/docs/en/sql-reference/functions/tuple-map-functions.md
@@ -22,7 +22,7 @@ map(key1, value1[, key2, value2, ...])
**Returned value**
-- Data structure as `key:value` pairs.
+- Data structure as `key:value` pairs.
Type: [Map(key, value)](../../sql-reference/data-types/map.md).
@@ -165,9 +165,6 @@ Result:
## mapPopulateSeries {#function-mappopulateseries}
Fills missing keys in the maps (key and value array pair), where keys are integers. Also, it supports specifying the max key, which is used to extend the keys array.
-Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represent keys, and the second array contains values for the each key.
-
-For array arguments the number of elements in `keys` and `values` must be the same for each row.
**Syntax**
@@ -178,12 +175,17 @@ mapPopulateSeries(map[, max])
Generates a map (a tuple with two arrays or a value of `Map` type, depending on the arguments), where keys are a series of numbers, from minimum to maximum keys (or `max` argument if it specified) taken from the map with a step size of one, and corresponding values. If the value is not specified for the key, then it uses the default value in the resulting map. For repeated keys, only the first value (in order of appearing) gets associated with the key.
+For array arguments the number of elements in `keys` and `values` must be the same for each row.
+
**Arguments**
+Arguments are [maps](../../sql-reference/data-types/map.md) or two [arrays](../../sql-reference/data-types/array.md#data-type-array), where the first array represents keys, and the second array contains values for each key.
+
Mapped arrays:
- `keys` — Array of keys. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
- `values` — Array of values. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#uint-ranges)).
+- `max` — Maximum key value. Optional. [Int8, Int16, Int32, Int64, Int128, Int256](../../sql-reference/data-types/int-uint.md#int-ranges).
or
@@ -191,14 +193,14 @@ or
**Returned value**
-- Depending on the arguments returns a [map](../../sql-reference/data-types/map.md) or a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order, and values the corresponding keys.
+- Depending on the arguments returns a [map](../../sql-reference/data-types/map.md) or a [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2) of two [arrays](../../sql-reference/data-types/array.md#data-type-array): keys in sorted order, and the values corresponding to the keys.
**Example**
Query with mapped arrays:
```sql
-select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type;
+SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
```
Result:
diff --git a/docs/en/sql-reference/statements/describe-table.md b/docs/en/sql-reference/statements/describe-table.md
index bc197bf0f72..823a31ed313 100644
--- a/docs/en/sql-reference/statements/describe-table.md
+++ b/docs/en/sql-reference/statements/describe-table.md
@@ -3,18 +3,67 @@ toc_priority: 42
toc_title: DESCRIBE
---
-# DESCRIBE TABLE Statement {#misc-describe-table}
+# DESCRIBE TABLE {#misc-describe-table}
+
+Returns information about table columns.
+
+**Syntax**
``` sql
DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]
```
-Returns the following `String` type columns:
+The `DESCRIBE` statement returns a row for each table column with the following [String](../../sql-reference/data-types/string.md) values:
-- `name` — Column name.
-- `type`— Column type.
-- `default_type` — Clause that is used in [default expression](../../sql-reference/statements/create/table.md#create-default-values) (`DEFAULT`, `MATERIALIZED` or `ALIAS`). Column contains an empty string, if the default expression isn’t specified.
-- `default_expression` — Value specified in the `DEFAULT` clause.
-- `comment_expression` — Comment text.
+- `name` — A column name.
+- `type` — A column type.
+- `default_type` — A clause that is used in the column [default expression](../../sql-reference/statements/create/table.md#create-default-values): `DEFAULT`, `MATERIALIZED` or `ALIAS`. If there is no default expression, then an empty string is returned.
+- `default_expression` — An expression specified after the `DEFAULT` clause.
+- `comment` — A [column comment](../../sql-reference/statements/alter/column.md#alter_comment-column).
+- `codec_expression` — A [codec](../../sql-reference/statements/create/table.md#codecs) that is applied to the column.
+- `ttl_expression` — A [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl) expression.
+- `is_subcolumn` — A flag that equals `1` for internal subcolumns. It is included in the result only if subcolumn description is enabled by the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
-Nested data structures are output in “expanded” format. Each column is shown separately, with the name after a dot.
+All columns in [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) data structures are described separately. The name of each column is prefixed with a parent column name and a dot.
+
+To show internal subcolumns of other data types, use the [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
+
+**Example**
+
+Query:
+
+``` sql
+CREATE TABLE describe_example (
+ id UInt64, text String DEFAULT 'unknown' CODEC(ZSTD),
+ user Tuple (name String, age UInt8)
+) ENGINE = MergeTree() ORDER BY id;
+
+DESCRIBE TABLE describe_example;
+DESCRIBE TABLE describe_example SETTINGS describe_include_subcolumns=1;
+```
+
+Result:
+
+``` text
+┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ id │ UInt64 │ │ │ │ │ │
+│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │
+│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │
+└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
+The second query additionally shows subcolumns:
+
+``` text
+┌─name──────┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┬─is_subcolumn─┐
+│ id │ UInt64 │ │ │ │ │ │ 0 │
+│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │ 0 │
+│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │ 0 │
+│ user.name │ String │ │ │ │ │ │ 1 │
+│ user.age │ UInt8 │ │ │ │ │ │ 1 │
+└───────────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┴──────────────┘
+```
+
+**See Also**
+
+- [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns) setting.
diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md
index 6d5bcda8452..e5efa657620 100644
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@@ -1705,6 +1705,32 @@ ClickHouse генерирует исключение
Значение по умолчанию: 0.
+## distributed_push_down_limit {#distributed-push-down-limit}
+
+Включает или отключает [LIMIT](#limit), применяемый к каждому шарду по отдельности.
+
+Это позволяет избежать:
+- отправки дополнительных строк по сети;
+- обработки строк за пределами ограничения для инициатора.
+
+Начиная с версии 21.9 вы больше не сможете получить неточные результаты, так как `distributed_push_down_limit` изменяет выполнение запроса только в том случае, если выполнено хотя бы одно из условий:
+- `distributed_group_by_no_merge` > 0.
+- запрос **не содержит** `GROUP BY`/`DISTINCT`/`LIMIT BY`, но содержит `ORDER BY`/`LIMIT`.
+- запрос **содержит** `GROUP BY`/`DISTINCT`/`LIMIT BY` с `ORDER BY`/`LIMIT` и:
+ - включена настройка [optimize_skip_unused_shards](#optimize-skip-unused-shards).
+ - включена настройка `optimize_distributed_group_by_sharding_key`.
+
+Возможные значения:
+
+- 0 — выключена.
+- 1 — включена.
+
+Значение по умолчанию: `1`.
+
+См. также:
+
+- [optimize_skip_unused_shards](#optimize-skip-unused-shards)
+
## optimize_skip_unused_shards {#optimize-skip-unused-shards}
Включает или отключает пропуск неиспользуемых шардов для запросов [SELECT](../../sql-reference/statements/select/index.md) , в которых условие ключа шардирования задано в секции `WHERE/PREWHERE`. Предполагается, что данные распределены с помощью ключа шардирования, в противном случае запрос выдаст неверный результат.
@@ -3641,6 +3667,21 @@ SELECT * FROM positional_arguments ORDER BY 2,3;
- настройка [optimize_move_to_prewhere](#optimize_move_to_prewhere)
+## describe_include_subcolumns {#describe_include_subcolumns}
+
+Включает или отключает описание подстолбцов при выполнении запроса [DESCRIBE](../../sql-reference/statements/describe-table.md). Настройка действует, например, на элементы [Tuple](../../sql-reference/data-types/tuple.md) или подстолбцы типов [Map](../../sql-reference/data-types/map.md#map-subcolumns), [Nullable](../../sql-reference/data-types/nullable.md#finding-null) или [Array](../../sql-reference/data-types/array.md#array-size).
+
+Возможные значения:
+
+- 0 — подстолбцы не включаются в результат запросов `DESCRIBE`.
+- 1 — подстолбцы включаются в результат запросов `DESCRIBE`.
+
+Значение по умолчанию: `0`.
+
+**Пример**
+
+Смотрите пример запроса [DESCRIBE](../../sql-reference/statements/describe-table.md).
+
## async_insert {#async-insert}
Включает или отключает асинхронные вставки. Работает только для вставок по протоколу HTTP. Обратите внимание, что при таких вставках дедупликация не производится.
diff --git a/docs/ru/sql-reference/functions/geo/s2.md b/docs/ru/sql-reference/functions/geo/s2.md
new file mode 100644
index 00000000000..6b801e1d08f
--- /dev/null
+++ b/docs/ru/sql-reference/functions/geo/s2.md
@@ -0,0 +1,376 @@
+---
+toc_title: "Функции для работы с индексами S2"
+---
+
+# Функции для работы с индексами S2 {#s2index}
+
+[S2](https://s2geometry.io/) — это система геокодирования, в которой все географические данные представлены на трехмерной сфере (аналогично глобусу).
+
+В библиотеке S2 точки представлены в виде индекса S2 — определенного числа, которое внутренне кодирует точку на поверхности трехмерной единичной сферы, в отличие от традиционных пар (широта, долгота). Чтобы получить индекс S2 для точки, заданной в формате (широта, долгота), используйте функцию [geoToS2](#geotos2). Также вы можете использовать функцию [s2ToGeo](#s2togeo) для получения географических координат, соответствующих заданному S2 индексу точки.
+
+## geoToS2 {#geotos2}
+
+Возвращает [S2](#s2index) индекс точки, соответствующий заданным координатам в формате `(долгота, широта)`.
+
+**Синтаксис**
+
+``` sql
+geoToS2(lon, lat)
+```
+
+**Аргументы**
+
+- `lon` — долгота. [Float64](../../../sql-reference/data-types/float.md).
+- `lat` — широта. [Float64](../../../sql-reference/data-types/float.md).
+
+**Возвращаемое значение**
+
+- S2 индекс точки.
+
+Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT geoToS2(37.79506683, 55.71290588) AS s2Index;
+```
+
+Результат:
+
+``` text
+┌─────────────s2Index─┐
+│ 4704772434919038107 │
+└─────────────────────┘
+```
+
+## s2ToGeo {#s2togeo}
+
+Возвращает географические координаты `(долгота, широта)`, соответствующие заданному [S2](#s2index) индексу точки.
+
+**Синтаксис**
+
+``` sql
+s2ToGeo(s2index)
+```
+
+**Аргументы**
+
+- `s2index` — [S2](#s2index) индекс. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- Кортеж из двух значений: `tuple(lon,lat)`.
+
+Тип: `lon` — [Float64](../../../sql-reference/data-types/float.md). `lat` — [Float64](../../../sql-reference/data-types/float.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2ToGeo(4704772434919038107) AS s2Coodrinates;
+```
+
+Результат:
+
+``` text
+┌─s2Coodrinates────────────────────────┐
+│ (37.79506681471008,55.7129059052841) │
+└──────────────────────────────────────┘
+```
+
+## s2GetNeighbors {#s2getneighbors}
+
+Возвращает [S2](#s2index) индексы ячеек, которые являются соседними для заданного S2 индекса. Ячейка в системе S2 представляет собой четырехугольник, ограниченный четырьмя геодезическими линиями. Соответственно, у каждой ячейки есть 4 соседние ячейки.
+
+**Синтаксис**
+
+``` sql
+s2GetNeighbors(s2index)
+```
+
+**Аргументы**
+
+- `s2index` — [S2](#s2index) индекс. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- Массив, содержащий 4 значения — S2 индекса соседних ячеек: `array[s2index1, s2index3, s2index2, s2index4]`.
+
+Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2GetNeighbors(5074766849661468672) AS s2Neighbors;
+```
+
+Результат:
+
+``` text
+┌─s2Neighbors───────────────────────────────────────────────────────────────────────┐
+│ [5074766987100422144,5074766712222515200,5074767536856236032,5074767261978329088] │
+└───────────────────────────────────────────────────────────────────────────────────┘
+```
+
+## s2CellsIntersect {#s2cellsintersect}
+
+Проверяет, пересекаются ли две заданные ячейки или нет.
+
+**Синтаксис**
+
+``` sql
+s2CellsIntersect(s2index1, s2index2)
+```
+
+**Аргументы**
+
+- `s2index1`, `s2index2` — S2 индексы первой и второй ячейки. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- 1 — ячейки пересекаются.
+- 0 — ячейки не пересекаются.
+
+Тип: [UInt8](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2CellsIntersect(9926595209846587392, 9926594385212866560) AS intersect;
+```
+
+Результат:
+
+``` text
+┌─intersect─┐
+│ 1 │
+└───────────┘
+```
+
+## s2CapContains {#s2capcontains}
+
+Определяет, содержит ли заданный купол указанную точку. Купол представляет собой часть сферы, которая была отрезана плоскостью. Купол задается точкой на сфере и радиусом в градусах.
+
+**Синтаксис**
+
+``` sql
+s2CapContains(center, degrees, point)
+```
+
+**Аргументы**
+
+- `center` — S2 индекс точки, определяющей центр купола. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `degrees` — радиус купола в градусах. [Float64](../../../sql-reference/data-types/float.md).
+- `point` — S2 индекс проверяемой точки. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- 1 — купол содержит точку.
+- 0 — купол не содержит точку.
+
+Тип: [UInt8](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2CapContains(1157339245694594829, 1.0, 1157347770437378819) AS capContains;
+```
+
+Результат:
+
+``` text
+┌─capContains─┐
+│ 1 │
+└─────────────┘
+```
+
+## s2CapUnion {#s2capunion}
+
+Определяет наименьший купол, содержащий два заданных купола. Купол представляет собой часть сферы, которая была отрезана плоскостью. Купол задается точкой на сфере и радиусом в градусах.
+
+**Синтаксис**
+
+``` sql
+s2CapUnion(center1, radius1, center2, radius2)
+```
+
+**Аргументы**
+
+- `center1`, `center2` — S2 индексы точек, определяющие два центра куполов. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `radius1`, `radius2` — значения радиусов в градусах, определяющие два радиуса куполов. [Float64](../../../sql-reference/data-types/float.md).
+
+**Возвращаемые значения**
+
+- `center` — S2 индекс точки, соответствующий центру наименьшего купола, содержащего заданные купола. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `radius` — радиус в градусах наименьшего купола, содержащего заданные купола. Тип: [Float64](../../../sql-reference/data-types/float.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2CapUnion(3814912406305146967, 1.0, 1157347770437378819, 1.0) AS capUnion;
+```
+
+Результат:
+
+``` text
+┌─capUnion───────────────────────────────┐
+│ (4534655147792050737,60.2088283994957) │
+└────────────────────────────────────────┘
+```
+
+## s2RectAdd {#s2rectadd}
+
+Увеличивает размер ограничивающего прямоугольника, чтобы включить в себя точку, заданную S2 индексом. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
+
+**Синтаксис**
+
+``` sql
+s2RectAdd(s2pointLow, s2pointHigh, s2Point)
+```
+
+**Аргументы**
+
+- `s2PointLow` — S2 индекс нижней точки, которая задает ограничивающий прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — S2 индекс верхней точки, которая задает ограничивающий прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Point` — S2 индекс целевой точки, которая будет содержаться увеличенным ограничивающим прямоугольником. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- `s2PointLow` — идентификатор нижней S2 ячейки, соответствующий увеличенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — идентификатор верхней S2 ячейки, соответствующий увеличенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) AS rectAdd;
+```
+
+Результат:
+
+``` text
+┌─rectAdd───────────────────────────────────┐
+│ (5179062030687166815,5177056748191934217) │
+└───────────────────────────────────────────┘
+```
+
+## s2RectContains {#s2rectcontains}
+
+Проверяет, содержит ли заданный прямоугольник указанную S2 точку. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
+
+**Синтаксис**
+
+``` sql
+s2RectContains(s2PointLow, s2PointHi, s2Point)
+```
+
+**Аргументы**
+
+- `s2PointLow` — S2 индекс самой низкой точки, которая задает прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2PointHigh` — S2 индекс самой высокой точки, которая задает прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Point` — S2 индекс проверяемой точки. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- 1 — прямоугольник содержит заданную точку.
+- 0 — прямоугольник не содержит заданную точку.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2RectContains(5179062030687166815, 5177056748191934217, 5177914411069187297) AS rectContains;
+```
+
+Результат:
+
+``` text
+┌─rectContains─┐
+│ 0 │
+└──────────────┘
+```
+
+## s2RectUnion {#s2rectunion}
+
+Возвращает наименьший прямоугольник, содержащий объединение двух заданных прямоугольников. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
+
+**Синтаксис**
+
+``` sql
+s2RectUnion(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
+```
+
+**Аргументы**
+
+- `s2Rect1PointLow`, `s2Rect1PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают первый прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect2PointLow`, `s2Rect2PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают второй прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- `s2UnionRect2PointLow` — идентификатор нижней ячейки, соответствующей объединенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointHi` — идентификатор верхней ячейки, соответствующей объединенному прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2RectUnion(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectUnion;
+```
+
+Результат:
+
+``` text
+┌─rectUnion─────────────────────────────────┐
+│ (5179062030687166815,5177056748191934217) │
+└───────────────────────────────────────────┘
+```
+
+## s2RectIntersection {#s2rectintersection}
+
+Возвращает наименьший прямоугольник, содержащий пересечение двух заданных прямоугольников. В системе S2 прямоугольник представлен типом S2Region, называемым `S2LatLngRect`, который задает прямоугольник в пространстве широта-долгота.
+
+**Синтаксис**
+
+``` sql
+s2RectIntersection(s2Rect1PointLow, s2Rect1PointHi, s2Rect2PointLow, s2Rect2PointHi)
+```
+
+**Аргументы**
+
+- `s2Rect1PointLow`, `s2Rect1PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают первый прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2Rect2PointLow`, `s2Rect2PointHi` — значения S2 индекса для самой низкой и самой высокой точек, которые задают второй прямоугольник. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+- `s2UnionRect2PointLow` — идентификатор нижней ячейки, соответствующей результирующему прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+- `s2UnionRect2PointHi` — идентификатор верхней ячейки, соответствующей результирующему прямоугольнику. Тип: [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT s2RectIntersection(5178914411069187297, 5177056748191934217, 5179062030687166815, 5177056748191934217) AS rectIntersection;
+```
+
+Результат:
+
+``` text
+┌─rectIntersection──────────────────────────┐
+│ (5178914411069187297,5177056748191934217) │
+└───────────────────────────────────────────┘
+```
diff --git a/docs/ru/sql-reference/functions/tuple-map-functions.md b/docs/ru/sql-reference/functions/tuple-map-functions.md
index e4cc1fefab4..45a5018500f 100644
--- a/docs/ru/sql-reference/functions/tuple-map-functions.md
+++ b/docs/ru/sql-reference/functions/tuple-map-functions.md
@@ -108,7 +108,7 @@ SELECT mapAdd(([toUInt8(1), 2], [1, 1]), ([toUInt8(1), 2], [1, 1])) as res, toTy
SELECT mapAdd(map(1,1), map(1,1));
```
-Result:
+Результат:
```text
┌─mapAdd(map(1, 1), map(1, 1))─┐
@@ -128,13 +128,13 @@ mapSubtract(Tuple(Array, Array), Tuple(Array, Array) [, ...])
**Аргументы**
-Аргументами являются [кортежи](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array), где элементы в первом массиве представляют ключи, а второй массив содержит значения для каждого ключа.
+Аргументами являются контейнеры [Map](../../sql-reference/data-types/map.md) или [кортежи](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array), где элементы в первом массиве представляют ключи, а второй массив содержит значения для каждого ключа.
Все массивы ключей должны иметь один и тот же тип, а все массивы значений должны содержать элементы, которые можно приводить к одному типу ([Int64](../../sql-reference/data-types/int-uint.md#int-ranges), [UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges) или [Float64](../../sql-reference/data-types/float.md#float32-float64)).
Общий приведенный тип используется в качестве типа для результирующего массива.
**Возвращаемое значение**
-- Возвращает один [tuple](../../sql-reference/data-types/tuple.md#tuplet1-t2), в котором первый массив содержит отсортированные ключи, а второй - значения.
+- В зависимости от аргумента возвращает один [Map](../../sql-reference/data-types/map.md) или [кортеж](../../sql-reference/data-types/tuple.md#tuplet1-t2), в котором первый массив содержит отсортированные ключи, а второй — значения.
**Пример**
@@ -152,6 +152,20 @@ SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt3
└────────────────┴───────────────────────────────────┘
```
+Запрос с контейнером `Map`:
+
+```sql
+SELECT mapSubtract(map(1,1), map(1,1));
+```
+
+Результат:
+
+```text
+┌─mapSubtract(map(1, 1), map(1, 1))─┐
+│ {1:0} │
+└───────────────────────────────────┘
+```
+
## mapPopulateSeries {#function-mappopulateseries}
Заполняет недостающие ключи в контейнере map (пара массивов ключей и значений), где ключи являются целыми числами. Кроме того, он поддерживает указание максимального ключа, который используется для расширения массива ключей.
@@ -160,6 +174,7 @@ SELECT mapSubtract(([toUInt8(1), 2], [toInt32(1), 1]), ([toUInt8(1), 2], [toInt3
``` sql
mapPopulateSeries(keys, values[, max])
+mapPopulateSeries(map[, max])
```
Генерирует контейнер map, где ключи - это серия чисел, от минимального до максимального ключа (или аргумент `max`, если он указан), взятых из массива `keys` с размером шага один, и соответствующие значения, взятые из массива `values`. Если значение не указано для ключа, то в результирующем контейнере используется значение по умолчанию.
@@ -168,19 +183,28 @@ mapPopulateSeries(keys, values[, max])
**Аргументы**
-- `keys` — массив ключей [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
+Аргументами являются контейнер [Map](../../sql-reference/data-types/map.md) или два [массива](../../sql-reference/data-types/array.md#data-type-array), где первый массив представляет ключи, а второй массив содержит значения для каждого ключа.
+
+Сопоставленные массивы:
+
+- `keys` — массив ключей. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
- `values` — массив значений. [Array](../../sql-reference/data-types/array.md#data-type-array)([Int](../../sql-reference/data-types/int-uint.md#int-ranges)).
+- `max` — максимальное значение ключа. Необязательный параметр. [Int8, Int16, Int32, Int64, Int128, Int256](../../sql-reference/data-types/int-uint.md#int-ranges).
+
+или
+
+- `map` — контейнер `Map` с целочисленными ключами. [Map](../../sql-reference/data-types/map.md).
**Возвращаемое значение**
-- Возвращает [кортеж](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array): ключи отсортированные по порядку и значения соответствующих ключей.
+- В зависимости от аргумента возвращает контейнер [Map](../../sql-reference/data-types/map.md) или [кортеж](../../sql-reference/data-types/tuple.md#tuplet1-t2) из двух [массивов](../../sql-reference/data-types/array.md#data-type-array): ключи, отсортированные по порядку, и значения соответствующих ключей.
**Пример**
-Запрос:
+Запрос с сопоставленными массивами:
```sql
-select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type;
+SELECT mapPopulateSeries([1,2,4], [11,22,44], 5) AS res, toTypeName(res) AS type;
```
Результат:
@@ -191,6 +215,20 @@ select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type
└──────────────────────────────┴───────────────────────────────────┘
```
+Запрос с контейнером `Map`:
+
+```sql
+SELECT mapPopulateSeries(map(1, 10, 5, 20), 6);
+```
+
+Результат:
+
+```text
+┌─mapPopulateSeries(map(1, 10, 5, 20), 6)─┐
+│ {1:10,2:0,3:0,4:0,5:20,6:0} │
+└─────────────────────────────────────────┘
+```
+
## mapContains {#mapcontains}
Определяет, содержит ли контейнер `map` ключ `key`.
@@ -319,4 +357,3 @@ SELECT mapValues(a) FROM test;
│ ['twelve','6.0'] │
└──────────────────┘
```
-
diff --git a/docs/ru/sql-reference/statements/describe-table.md b/docs/ru/sql-reference/statements/describe-table.md
index c66dbb66521..56c778f7c76 100644
--- a/docs/ru/sql-reference/statements/describe-table.md
+++ b/docs/ru/sql-reference/statements/describe-table.md
@@ -3,21 +3,66 @@ toc_priority: 42
toc_title: DESCRIBE
---
-# DESCRIBE TABLE Statement {#misc-describe-table}
+# DESCRIBE TABLE {#misc-describe-table}
+
+Возвращает описание столбцов таблицы.
+
+**Синтаксис**
``` sql
DESC|DESCRIBE TABLE [db.]table [INTO OUTFILE filename] [FORMAT format]
```
-Возвращает описание столбцов таблицы.
+Запрос `DESCRIBE` для каждого столбца таблицы возвращает строку со следующими значениями типа [String](../../sql-reference/data-types/string.md):
-Результат запроса содержит столбцы (все столбцы имеют тип String):
-
-- `name` — имя столбца таблицы;
-- `type`— тип столбца;
-- `default_type` — в каком виде задано [выражение для значения по умолчанию](../../sql-reference/statements/create/table.md#create-default-values): `DEFAULT`, `MATERIALIZED` или `ALIAS`. Столбец содержит пустую строку, если значение по умолчанию не задано.
+- `name` — имя столбца;
+- `type` — тип столбца;
+- `default_type` — вид [выражения для значения по умолчанию](../../sql-reference/statements/create/table.md#create-default-values): `DEFAULT`, `MATERIALIZED` или `ALIAS`. Если значение по умолчанию не задано, то возвращается пустая строка;
- `default_expression` — значение, заданное в секции `DEFAULT`;
-- `comment_expression` — комментарий к столбцу.
+- `comment` — [комментарий](../../sql-reference/statements/alter/column.md#alter_comment-column);
+- `codec_expression` — [кодек](../../sql-reference/statements/create/table.md#codecs), который применяется к столбцу;
+- `ttl_expression` — выражение [TTL](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-ttl);
+- `is_subcolumn` — флаг, который равен `1` для внутренних подстолбцов. Он появляется в результате, только если описание подстолбцов разрешено настройкой [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns).
-Вложенные структуры данных выводятся в «развёрнутом» виде. То есть, каждый столбец - по отдельности, с именем через точку.
+Каждый столбец [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) структур описывается отдельно. Перед его именем ставится имя родительского столбца с точкой.
+Чтобы отобразить внутренние подстолбцы других типов данных, нужно включить настройку [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns).
+**Пример**
+
+Запрос:
+
+``` sql
+CREATE TABLE describe_example (
+ id UInt64, text String DEFAULT 'unknown' CODEC(ZSTD),
+ user Tuple (name String, age UInt8)
+) ENGINE = MergeTree() ORDER BY id;
+
+DESCRIBE TABLE describe_example;
+DESCRIBE TABLE describe_example SETTINGS describe_include_subcolumns=1;
+```
+
+Результат:
+
+``` text
+┌─name─┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ id │ UInt64 │ │ │ │ │ │
+│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │
+│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │
+└──────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
+Второй запрос дополнительно выводит информацию о подстолбцах:
+
+``` text
+┌─name──────┬─type──────────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┬─is_subcolumn─┐
+│ id │ UInt64 │ │ │ │ │ │ 0 │
+│ text │ String │ DEFAULT │ 'unknown' │ │ ZSTD(1) │ │ 0 │
+│ user │ Tuple(name String, age UInt8) │ │ │ │ │ │ 0 │
+│ user.name │ String │ │ │ │ │ │ 1 │
+│ user.age │ UInt8 │ │ │ │ │ │ 1 │
+└───────────┴───────────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┴──────────────┘
+```
+
+**См. также**
+
+- настройка [describe_include_subcolumns](../../operations/settings/settings.md#describe_include_subcolumns).
diff --git a/src/Bridge/LibraryBridgeHelper.cpp b/src/Bridge/LibraryBridgeHelper.cpp
index e5c6c09ba62..bd0604ec8e0 100644
--- a/src/Bridge/LibraryBridgeHelper.cpp
+++ b/src/Bridge/LibraryBridgeHelper.cpp
@@ -258,6 +258,7 @@ Pipe LibraryBridgeHelper::loadBase(const Poco::URI & uri, ReadWriteBufferFromHTT
0,
Poco::Net::HTTPBasicCredentials{},
DBMS_DEFAULT_BUFFER_SIZE,
+ getContext()->getReadSettings(),
ReadWriteBufferFromHTTP::HTTPHeaderEntries{});
auto source = FormatFactory::instance().getInput(LibraryBridgeHelper::DEFAULT_FORMAT, *read_buf_ptr, sample_block, getContext(), DEFAULT_BLOCK_SIZE);
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 9e1f25c21d9..87e6cc86d94 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -77,6 +77,7 @@ add_headers_and_sources(clickhouse_common_io IO)
add_headers_and_sources(clickhouse_common_io IO/S3)
list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)
+add_headers_and_sources(dbms Disks/IO)
if (USE_SQLITE)
add_headers_and_sources(dbms Databases/SQLite)
endif()
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index 44adbccc089..ee44b9eb927 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -250,6 +250,17 @@
M(S3WriteRequestsRedirects, "Number of redirects in POST, DELETE, PUT and PATCH requests to S3 storage.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\
+ M(RemoteFSReadMicroseconds, "Time of reading from remote filesystem.") \
+ M(RemoteFSReadBytes, "Read bytes from remote filesystem.") \
+ \
+ M(RemoteFSSeeks, "Total number of seeks for async buffer.") \
+ M(RemoteFSPrefetches, "Number of prefetches made with asynchronous reading from remote filesystem.") \
+ M(RemoteFSCancelledPrefetches, "Number of cancelled prefetches (because of seek).") \
+ M(RemoteFSUnusedPrefetches, "Number of prefetches pending at buffer destruction.") \
+ M(RemoteFSPrefetchedReads, "Number of reads from prefetched buffer.") \
+ M(RemoteFSUnprefetchedReads, "Number of reads from unprefetched buffer.") \
+ M(RemoteFSBuffers, "Number of buffers created for asynchronous reading from remote filesystem.") \
+ \
M(SleepFunctionCalls, "Number of times a sleep function (sleep, sleepEachRow) has been called.") \
M(SleepFunctionMicroseconds, "Time spent sleeping due to a sleep function call.") \
\
diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp
index 4a583773b4b..c19e854dd45 100644
--- a/src/Compression/CachedCompressedReadBuffer.cpp
+++ b/src/Compression/CachedCompressedReadBuffer.cpp
@@ -28,6 +28,12 @@ void CachedCompressedReadBuffer::initInput()
}
+void CachedCompressedReadBuffer::prefetch()
+{
+ file_in->prefetch();
+}
+
+
bool CachedCompressedReadBuffer::nextImpl()
{
/// Let's check for the presence of a decompressed block in the cache, grab the ownership of this block, if it exists.
diff --git a/src/Compression/CachedCompressedReadBuffer.h b/src/Compression/CachedCompressedReadBuffer.h
index bb24f699eed..16770e343cc 100644
--- a/src/Compression/CachedCompressedReadBuffer.h
+++ b/src/Compression/CachedCompressedReadBuffer.h
@@ -33,8 +33,11 @@ private:
UncompressedCache::MappedPtr owned_cell;
void initInput();
+
bool nextImpl() override;
+ void prefetch() override;
+
/// Passed into file_in.
ReadBufferFromFileBase::ProfileCallback profile_callback;
clockid_t clock_type {};
@@ -55,6 +58,18 @@ public:
profile_callback = profile_callback_;
clock_type = clock_type_;
}
+
+ void setReadUntilPosition(size_t position) override
+ {
+ if (file_in)
+ file_in->setReadUntilPosition(position);
+ }
+
+ void setReadUntilEnd() override
+ {
+ if (file_in)
+ file_in->setReadUntilEnd();
+ }
};
}
diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp
index b6da105cd88..1a70b27e9f4 100644
--- a/src/Compression/CompressedReadBufferFromFile.cpp
+++ b/src/Compression/CompressedReadBufferFromFile.cpp
@@ -44,12 +44,6 @@ bool CompressedReadBufferFromFile::nextImpl()
}
-void CompressedReadBufferFromFile::prefetch()
-{
- file_in.prefetch();
-}
-
-
CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_)
: BufferWithOwnMemory(0), p_file_in(std::move(buf)), file_in(*p_file_in)
{
@@ -58,6 +52,12 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_ = false);
+ explicit CompressedReadBufferFromFile(std::unique_ptr buf, bool allow_different_codecs_ = false);
/// Seek is lazy in some sense. We move position in compressed file_in to offset_in_compressed_file, but don't
/// read data into working_buffer and don't shift our position to offset_in_decompressed_block. Instead
@@ -58,6 +59,10 @@ public:
{
file_in.setProfileCallback(profile_callback_, clock_type_);
}
+
+ void setReadUntilPosition(size_t position) override { file_in.setReadUntilPosition(position); }
+
+ void setReadUntilEnd() override { file_in.setReadUntilEnd(); }
};
}
diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp
index 96d264fea1e..c3f14fe1449 100644
--- a/src/Compression/CompressionCodecEncrypted.cpp
+++ b/src/Compression/CompressionCodecEncrypted.cpp
@@ -1,3 +1,4 @@
+#include
#include
#include
#include
@@ -81,9 +82,11 @@ namespace ErrorCodes
namespace
{
-constexpr size_t tag_size = 16; /// AES-GCM-SIV always uses a tag of 16 bytes length
-constexpr size_t key_id_max_size = 8; /// Max size of varint.
-constexpr size_t nonce_max_size = 13; /// Nonce size and one byte to show if nonce in in text
+constexpr size_t tag_size = 16; /// AES-GCM-SIV always uses a tag of 16 bytes length
+constexpr size_t key_id_max_size = 8; /// Max size of varint.
+constexpr size_t nonce_max_size = 13; /// Nonce size and one byte to show if the nonce is in the text
+constexpr size_t actual_nonce_size = 12; /// Actual nonce size
+const String empty_nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", actual_nonce_size};
/// Get encryption/decryption algorithms.
auto getMethod(EncryptionMethod Method)
@@ -137,7 +140,7 @@ size_t encrypt(const std::string_view & plaintext, char * ciphertext_and_tag, En
EVP_AEAD_CTX_zero(&encrypt_ctx);
const int ok_init = EVP_AEAD_CTX_init(&encrypt_ctx, getMethod(method)(),
reinterpret_cast(key.data()), key.size(),
- 16 /* tag size */, nullptr);
+ tag_size, nullptr);
if (!ok_init)
throw Exception(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
@@ -145,7 +148,7 @@ size_t encrypt(const std::string_view & plaintext, char * ciphertext_and_tag, En
size_t out_len;
const int ok_open = EVP_AEAD_CTX_seal(&encrypt_ctx,
reinterpret_cast(ciphertext_and_tag),
- &out_len, plaintext.size() + 16,
+ &out_len, plaintext.size() + tag_size,
reinterpret_cast(nonce.data()), nonce.size(),
reinterpret_cast(plaintext.data()), plaintext.size(),
nullptr, 0);
@@ -167,7 +170,7 @@ size_t decrypt(const std::string_view & ciphertext, char * plaintext, Encryption
const int ok_init = EVP_AEAD_CTX_init(&decrypt_ctx, getMethod(method)(),
reinterpret_cast(key.data()), key.size(),
- 16 /* tag size */, nullptr);
+ tag_size, nullptr);
if (!ok_init)
throw Exception(lastErrorString(), ErrorCodes::OPENSSL_ERROR);
@@ -221,7 +224,7 @@ inline char* writeNonce(const String& nonce, char* dest)
{
/// If nonce consists of nul bytes, it shouldn't be in dest. Zero byte is the only byte that should be written.
/// Otherwise, 1 is written and data from nonce is copied
- if (nonce != String("\0\0\0\0\0\0\0\0\0\0\0\0", 12))
+ if (nonce != empty_nonce)
{
*dest = 1;
++dest;
@@ -246,15 +249,15 @@ inline const char* readNonce(String& nonce, const char* source)
/// If first is zero byte: move source and set zero-bytes nonce
if (!*source)
{
- nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", 12};
+ nonce = empty_nonce;
return ++source;
}
/// Move to next byte. Nonce will begin from there
++source;
/// Otherwise, use data from source in nonce
- nonce = {source, 12};
- source += 12;
+ nonce = {source, actual_nonce_size};
+ source += actual_nonce_size;
return source;
}
@@ -332,14 +335,14 @@ void CompressionCodecEncrypted::Configuration::loadImpl(
if (!new_params->keys_storage[method].contains(new_params->current_key_id[method]))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Not found a key with the current ID {}", new_params->current_key_id[method]);
- /// Read nonce (in hex or in string). Its length should be 12 bytes.
+ /// Read nonce (in hex or in string). Its length should be 12 bytes (actual_nonce_size).
if (config.has(config_prefix + ".nonce_hex"))
new_params->nonce[method] = unhexKey(config.getString(config_prefix + ".nonce_hex"));
else
new_params->nonce[method] = config.getString(config_prefix + ".nonce", "");
- if (new_params->nonce[method].size() != 12 && !new_params->nonce[method].empty())
- throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got nonce with unexpected size {}, the size should be 12", new_params->nonce[method].size());
+ if (new_params->nonce[method].size() != actual_nonce_size && !new_params->nonce[method].empty())
+ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Got nonce with unexpected size {}, the size should be {}", new_params->nonce[method].size(), actual_nonce_size);
}
bool CompressionCodecEncrypted::Configuration::tryLoad(const Poco::Util::AbstractConfiguration & config, const String & config_prefix)
@@ -399,7 +402,7 @@ void CompressionCodecEncrypted::Configuration::getCurrentKeyAndNonce(EncryptionM
/// This will lead to data loss.
nonce = current_params->nonce[method];
if (nonce.empty())
- nonce = {"\0\0\0\0\0\0\0\0\0\0\0\0", 12};
+ nonce = empty_nonce;
}
String CompressionCodecEncrypted::Configuration::getKey(EncryptionMethod method, const UInt64 & key_id) const
@@ -448,8 +451,10 @@ UInt32 CompressionCodecEncrypted::getMaxCompressedDataSize(UInt32 uncompressed_s
UInt32 CompressionCodecEncrypted::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
- // Generate an IV out of the data block and the key-generation
- // key. It is completely deterministic, but does not leak any
+ // The nonce, key and plaintext are used to generate the authentication tag
+ // and the message encryption key. AES-GCM-SIV authenticates the encoded additional data and the plaintext;
+ // the message_authentication_key is used for this purpose.
+ // The algorithm is completely deterministic, but does not leak any
// information about the data block except for equivalence of
// identical blocks (under the same key).
@@ -470,8 +475,7 @@ UInt32 CompressionCodecEncrypted::doCompressData(const char * source, UInt32 sou
char* ciphertext = writeNonce(nonce, ciphertext_with_nonce);
UInt64 nonce_size = ciphertext - ciphertext_with_nonce;
- // The IV will be used as an authentication tag. The ciphertext and the
- // tag will be written directly in the dest buffer.
+ // The ciphertext and the authentication tag will be written directly in the dest buffer.
size_t out_len = encrypt(plaintext, ciphertext, encryption_method, current_key, nonce);
/// Length of encrypted text should be equal to text length plus tag_size (which was added by algorithm).
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 7aade8e2d0f..e6f685a0650 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -75,7 +75,6 @@ class IColumn;
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \
M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \
- M(UInt64, http_max_single_read_retries, 4, "The maximum number of retries during single http read.", 0) \
M(UInt64, hsts_max_age, 0, "Expired time for hsts. 0 means disable HSTS.", 0) \
M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \
M(Bool, use_uncompressed_cache, false, "Whether to use the cache of uncompressed blocks.", 0) \
@@ -508,6 +507,7 @@ class IColumn;
M(ShortCircuitFunctionEvaluation, short_circuit_function_evaluation, ShortCircuitFunctionEvaluation::ENABLE, "Setting for short-circuit function evaluation configuration. Possible values: 'enable' - use short-circuit function evaluation for functions that are suitable for it, 'disable' - disable short-circuit function evaluation, 'force_enable' - use short-circuit function evaluation for all functions.", 0) \
\
M(String, local_filesystem_read_method, "pread", "Method of reading data from local filesystem, one of: read, pread, mmap, pread_threadpool.", 0) \
+ M(String, remote_filesystem_read_method, "read", "Method of reading data from remote filesystem, one of: read, read_threadpool.", 0) \
M(Bool, local_filesystem_read_prefetch, false, "Should use prefetching when reading data from local filesystem.", 0) \
M(Bool, remote_filesystem_read_prefetch, true, "Should use prefetching when reading data from remote filesystem.", 0) \
M(Int64, read_priority, 0, "Priority to read data from local filesystem. Only supported for 'pread_threadpool' method.", 0) \
@@ -520,7 +520,7 @@ class IColumn;
M(Milliseconds, async_insert_busy_timeout_ms, 200, "Maximum time to wait before dumping collected data per query since the first data appeared", 0) \
M(Milliseconds, async_insert_stale_timeout_ms, 0, "Maximum time to wait before dumping collected data per query since the last data appeared. Zero means no timeout at all", 0) \
\
- M(Int64, remote_fs_read_backoff_threshold, 10000, "Max wait time when trying to read data for remote disk", 0) \
+ M(Int64, remote_fs_read_max_backoff_ms, 10000, "Max wait time when trying to read data for remote disk", 0) \
M(Int64, remote_fs_read_backoff_max_tries, 5, "Max attempts to read with backoff", 0) \
\
M(Bool, force_remove_data_recursively_on_drop, false, "Recursively remove data on DROP query. Avoids 'Directory not empty' error, but may silently remove detached data", 0) \
diff --git a/src/Dictionaries/DictionarySource.cpp b/src/Dictionaries/DictionarySource.cpp
index c3f0ecf3cde..fa25dab6115 100644
--- a/src/Dictionaries/DictionarySource.cpp
+++ b/src/Dictionaries/DictionarySource.cpp
@@ -7,7 +7,7 @@ namespace DB
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
- extern const int UNSUPPORTED_METHOD;
+ extern const int NO_SUCH_COLUMN_IN_TABLE;
}
bool DictionarySourceCoordinator::getKeyColumnsNextRangeToRead(ColumnsWithTypeAndName & key_columns, ColumnsWithTypeAndName & data_columns)
@@ -69,12 +69,6 @@ void DictionarySourceCoordinator::initialize(const Names & column_names)
}
}
}
- else
- {
- throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "No such column name {} in dictionary {}",
- column_name,
- dictionary->getDictionaryID().getNameForLogs());
- }
}
else
{
@@ -86,6 +80,11 @@ void DictionarySourceCoordinator::initialize(const Names & column_names)
column_with_type.type = attribute.type;
}
+ if (!column_with_type.type)
+ throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "No such column name {} in dictionary {}",
+ column_name,
+ dictionary->getDictionaryID().getNameForLogs());
+
column_with_type.column = column_with_type.type->createColumn();
columns_with_type.emplace_back(std::move(column_with_type));
}
diff --git a/src/Dictionaries/HTTPDictionarySource.cpp b/src/Dictionaries/HTTPDictionarySource.cpp
index 26ebde36f7d..29f503fc160 100644
--- a/src/Dictionaries/HTTPDictionarySource.cpp
+++ b/src/Dictionaries/HTTPDictionarySource.cpp
@@ -101,6 +101,7 @@ Pipe HTTPDictionarySource::loadAll()
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
+ context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@@ -119,6 +120,7 @@ Pipe HTTPDictionarySource::loadUpdatedAll()
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
+ context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@@ -146,6 +148,7 @@ Pipe HTTPDictionarySource::loadIds(const std::vector & ids)
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
+ context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
@@ -173,6 +176,7 @@ Pipe HTTPDictionarySource::loadKeys(const Columns & key_columns, const std::vect
0,
credentials,
DBMS_DEFAULT_BUFFER_SIZE,
+ context->getReadSettings(),
configuration.header_entries);
return createWrappedBuffer(std::move(in_ptr));
diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp
index dfb64cac0ef..f2b27c2c876 100644
--- a/src/Disks/DiskRestartProxy.cpp
+++ b/src/Disks/DiskRestartProxy.cpp
@@ -20,6 +20,10 @@ public:
RestartAwareReadBuffer(const DiskRestartProxy & disk, std::unique_ptr impl_)
: ReadBufferFromFileDecorator(std::move(impl_)), lock(disk.mutex) { }
+ void prefetch() override { impl->prefetch(); }
+
+ void setReadUntilPosition(size_t position) override { impl->setReadUntilPosition(position); }
+
private:
ReadLock lock;
};
diff --git a/src/Disks/DiskWebServer.cpp b/src/Disks/DiskWebServer.cpp
index 320b4d179f3..55ea91c40c9 100644
--- a/src/Disks/DiskWebServer.cpp
+++ b/src/Disks/DiskWebServer.cpp
@@ -3,15 +3,18 @@
#include
#include
-#include
-#include
-#include
-
#include
#include
#include
#include
+#include
+#include
+#include
+#include
+#include
+#include
+
#include
#include
@@ -105,39 +108,6 @@ private:
};
-class ReadBufferFromWebServer final : public ReadIndirectBufferFromRemoteFS
-{
-public:
- ReadBufferFromWebServer(
- const String & uri_,
- RemoteMetadata metadata_,
- ContextPtr context_,
- size_t buf_size_,
- size_t backoff_threshold_,
- size_t max_tries_)
- : ReadIndirectBufferFromRemoteFS(metadata_)
- , uri(uri_)
- , context(context_)
- , buf_size(buf_size_)
- , backoff_threshold(backoff_threshold_)
- , max_tries(max_tries_)
- {
- }
-
- std::unique_ptr createReadBuffer(const String & path) override
- {
- return std::make_unique(fs::path(uri) / path, context, buf_size, backoff_threshold, max_tries);
- }
-
-private:
- String uri;
- ContextPtr context;
- size_t buf_size;
- size_t backoff_threshold;
- size_t max_tries;
-};
-
-
DiskWebServer::DiskWebServer(
const String & disk_name_,
const String & url_,
@@ -196,9 +166,20 @@ std::unique_ptr DiskWebServer::readFile(const String & p
RemoteMetadata meta(path, remote_path);
meta.remote_fs_objects.emplace_back(std::make_pair(remote_path, iter->second.size));
- auto reader = std::make_unique(url, meta, getContext(),
- read_settings.remote_fs_buffer_size, read_settings.remote_fs_backoff_threshold, read_settings.remote_fs_backoff_max_tries);
- return std::make_unique(std::move(reader), min_bytes_for_seek);
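+ /// Either hand the gather buffer to the shared thread-pool reader (asynchronous reads)
+ /// or wrap it in the plain synchronous indirect buffer.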
+ bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool;
+
+ auto web_impl = std::make_unique(path, url, meta, getContext(), threadpool_read, read_settings);
+
+ if (threadpool_read)
+ {
+ auto reader = IDiskRemote::getThreadPoolReader();
+ return std::make_unique(reader, read_settings, std::move(web_impl), min_bytes_for_seek);
+ }
+ else
+ {
+ auto buf = std::make_unique(std::move(web_impl));
+ return std::make_unique(std::move(buf), min_bytes_for_seek);
+ }
}
diff --git a/src/Disks/HDFS/DiskHDFS.cpp b/src/Disks/HDFS/DiskHDFS.cpp
index 3e137056377..9071ce1d139 100644
--- a/src/Disks/HDFS/DiskHDFS.cpp
+++ b/src/Disks/HDFS/DiskHDFS.cpp
@@ -1,10 +1,14 @@
#include
-#include
-#include
#include
-#include
-#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
#include
#include
@@ -48,37 +52,6 @@ private:
};
-/// Reads data from HDFS using stored paths in metadata.
-class ReadIndirectBufferFromHDFS final : public ReadIndirectBufferFromRemoteFS
-{
-public:
- ReadIndirectBufferFromHDFS(
- const Poco::Util::AbstractConfiguration & config_,
- const String & hdfs_uri_,
- DiskHDFS::Metadata metadata_,
- size_t buf_size_)
- : ReadIndirectBufferFromRemoteFS(metadata_)
- , config(config_)
- , buf_size(buf_size_)
- {
- const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
- hdfs_directory = hdfs_uri_.substr(begin_of_path);
- hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
- }
-
- std::unique_ptr createReadBuffer(const String & path) override
- {
- return std::make_unique(hdfs_uri, hdfs_directory + path, config, buf_size);
- }
-
-private:
- const Poco::Util::AbstractConfiguration & config;
- String hdfs_uri;
- String hdfs_directory;
- size_t buf_size;
-};
-
-
DiskHDFS::DiskHDFS(
const String & disk_name_,
const String & hdfs_root_path_,
@@ -102,8 +75,18 @@ std::unique_ptr DiskHDFS::readFile(const String & path,
"Read from file by path: {}. Existing HDFS objects: {}",
backQuote(metadata_path + path), metadata.remote_fs_objects.size());
- auto reader = std::make_unique(config, remote_fs_root_path, metadata, read_settings.remote_fs_buffer_size);
- return std::make_unique(std::move(reader), settings->min_bytes_for_seek);
+ auto hdfs_impl = std::make_unique(path, config, remote_fs_root_path, metadata, read_settings.remote_fs_buffer_size);
+
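+ /// Same choice as for other remote disks: asynchronous thread-pool reads or a plain synchronous indirect buffer.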
+ if (read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool)
+ {
+ auto reader = getThreadPoolReader();
+ return std::make_unique(reader, read_settings, std::move(hdfs_impl));
+ }
+ else
+ {
+ auto buf = std::make_unique(std::move(hdfs_impl));
+ return std::make_unique(std::move(buf), settings->min_bytes_for_seek);
+ }
}
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index 1a6e4eb73a1..cf1baafce6c 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -12,6 +12,7 @@
#include
#include
#include
+#include
namespace DB
@@ -496,4 +497,13 @@ String IDiskRemote::getUniqueId(const String & path) const
return id;
}
+
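+/// Returns a process-wide reader shared by all remote disks:
+/// a lazily created thread pool of 50 threads with a queue of up to 1000000 pending read requests.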
+AsynchronousReaderPtr IDiskRemote::getThreadPoolReader()
+{
+ constexpr size_t pool_size = 50;
+ constexpr size_t queue_size = 1000000;
+ static AsynchronousReaderPtr reader = std::make_shared(pool_size, queue_size);
+ return reader;
+}
+
}
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index 7453fce747e..50c8d73c048 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -3,8 +3,8 @@
#include
#include
-#include "Disks/DiskFactory.h"
-#include "Disks/Executor.h"
+#include
+#include
#include
#include
#include
@@ -33,6 +33,10 @@ protected:
using RemoteFSPathKeeperPtr = std::shared_ptr;
+class IAsynchronousReader;
+using AsynchronousReaderPtr = std::shared_ptr;
+
+
/// Base Disk class for remote FS's, which are not posix-compatible (DiskS3 and DiskHDFS)
class IDiskRemote : public IDisk
{
@@ -125,6 +129,8 @@ public:
virtual RemoteFSPathKeeperPtr createFSPathKeeper() const = 0;
+ static AsynchronousReaderPtr getThreadPoolReader();
+
protected:
Poco::Logger * log;
const String name;
diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp
new file mode 100644
index 00000000000..c283e0ea159
--- /dev/null
+++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp
@@ -0,0 +1,262 @@
+#include "AsynchronousReadIndirectBufferFromRemoteFS.h"
+
+#include
+#include
+#include
+#include
+#include
+
+
+namespace CurrentMetrics
+{
+ extern const Metric AsynchronousReadWait;
+}
+
+namespace ProfileEvents
+{
+ extern const Event AsynchronousReadWaitMicroseconds;
+ extern const Event RemoteFSSeeks;
+ extern const Event RemoteFSPrefetches;
+ extern const Event RemoteFSCancelledPrefetches;
+ extern const Event RemoteFSUnusedPrefetches;
+ extern const Event RemoteFSPrefetchedReads;
+ extern const Event RemoteFSUnprefetchedReads;
+ extern const Event RemoteFSBuffers;
+}
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int LOGICAL_ERROR;
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+}
+
+
+AsynchronousReadIndirectBufferFromRemoteFS::AsynchronousReadIndirectBufferFromRemoteFS(
+ AsynchronousReaderPtr reader_,
+ const ReadSettings & settings_,
+ std::shared_ptr impl_,
+ size_t min_bytes_for_seek_)
+ : ReadBufferFromFileBase(settings_.remote_fs_buffer_size, nullptr, 0)
+ , reader(reader_)
+ , priority(settings_.priority)
+ , impl(impl_)
+ , prefetch_buffer(settings_.remote_fs_buffer_size)
+ , min_bytes_for_seek(min_bytes_for_seek_)
+ , must_read_until_position(settings_.must_read_until_position)
+{
+ ProfileEvents::increment(ProfileEvents::RemoteFSBuffers);
+}
+
+
+String AsynchronousReadIndirectBufferFromRemoteFS::getFileName() const
+{
+ return impl->getFileName();
+}
+
+
+bool AsynchronousReadIndirectBufferFromRemoteFS::hasPendingDataToRead()
+{
+ /// Position is set only for MergeTree tables.
+ if (read_until_position)
+ {
+ /// Everything is already read.
+ if (file_offset_of_buffer_end == read_until_position)
+ return false;
+
+ if (file_offset_of_buffer_end > read_until_position)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Read beyond last offset ({} > {})",
+ file_offset_of_buffer_end, read_until_position);
+ }
+ else if (must_read_until_position)
+ throw Exception(ErrorCodes::LOGICAL_ERROR,
+ "Reading for MergeTree family tables must be done with last position boundary");
+
+ return true;
+}
+
+
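+/// Build an asynchronous read request: read `size` bytes into `data` starting from the current
+/// file offset, skipping any bytes deferred by a lazy seek, and submit it to the reader.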
+std::future AsynchronousReadIndirectBufferFromRemoteFS::readInto(char * data, size_t size)
+{
+ IAsynchronousReader::Request request;
+ request.descriptor = std::make_shared(impl);
+ request.buf = data;
+ request.size = size;
+ request.offset = file_offset_of_buffer_end;
+ request.priority = priority;
+
+ if (bytes_to_ignore)
+ {
+ request.ignore = bytes_to_ignore;
+ bytes_to_ignore = 0;
+ }
+ return reader->submit(request);
+}
+
+
+void AsynchronousReadIndirectBufferFromRemoteFS::prefetch()
+{
+ if (prefetch_future.valid())
+ return;
+
+ /// Check the boundary, which was set in setReadUntilPosition().
+ if (!hasPendingDataToRead())
+ return;
+
+ /// Prefetch even in case hasPendingData() == true.
+ prefetch_future = readInto(prefetch_buffer.data(), prefetch_buffer.size());
+ ProfileEvents::increment(ProfileEvents::RemoteFSPrefetches);
+}
+
+
+void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilPosition(size_t position)
+{
+ if (prefetch_future.valid())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Prefetch is still in progress, cannot set read-until position");
+
+ read_until_position = position;
+ impl->setReadUntilPosition(read_until_position);
+}
+
+
+void AsynchronousReadIndirectBufferFromRemoteFS::setReadUntilEnd()
+{
+ if (prefetch_future.valid())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Prefetch is still in progress, cannot set read-until-end");
+
+ read_until_position = impl->getFileSize();
+ impl->setReadUntilPosition(read_until_position);
+}
+
+
+bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
+{
+ if (!hasPendingDataToRead())
+ return false;
+
+ size_t size = 0;
+
+ if (prefetch_future.valid())
+ {
+ ProfileEvents::increment(ProfileEvents::RemoteFSPrefetchedReads);
+
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::AsynchronousReadWait};
+ Stopwatch watch;
+ {
+ size = prefetch_future.get();
+ if (size)
+ {
+ memory.swap(prefetch_buffer);
+ set(memory.data(), memory.size());
+ working_buffer.resize(size);
+ file_offset_of_buffer_end += size;
+ }
+ }
+
+ watch.stop();
+ ProfileEvents::increment(ProfileEvents::AsynchronousReadWaitMicroseconds, watch.elapsedMicroseconds());
+ }
+ else
+ {
+ ProfileEvents::increment(ProfileEvents::RemoteFSUnprefetchedReads);
+ size = readInto(memory.data(), memory.size()).get();
+
+ if (size)
+ {
+ set(memory.data(), memory.size());
+ working_buffer.resize(size);
+ file_offset_of_buffer_end += size;
+ }
+ }
+
+ prefetch_future = {};
+ return size;
+}
+
+
+off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
+{
+ ProfileEvents::increment(ProfileEvents::RemoteFSSeeks);
+
+ if (whence == SEEK_CUR)
+ {
+ /// If position within current working buffer - shift pos.
+ if (!working_buffer.empty() && static_cast(getPosition() + offset_) < file_offset_of_buffer_end)
+ {
+ pos += offset_;
+ return getPosition();
+ }
+ else
+ {
+ file_offset_of_buffer_end += offset_;
+ }
+ }
+ else if (whence == SEEK_SET)
+ {
+ /// If position is within current working buffer - shift pos.
+ if (!working_buffer.empty()
+ && static_cast(offset_) >= file_offset_of_buffer_end - working_buffer.size()
+ && size_t(offset_) < file_offset_of_buffer_end)
+ {
+ pos = working_buffer.end() - (file_offset_of_buffer_end - offset_);
+
+ assert(pos >= working_buffer.begin());
+ assert(pos <= working_buffer.end());
+
+ return getPosition();
+ }
+ else
+ {
+ file_offset_of_buffer_end = offset_;
+ }
+ }
+ else
+ throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+
+ if (prefetch_future.valid())
+ {
+ ProfileEvents::increment(ProfileEvents::RemoteFSCancelledPrefetches);
+ prefetch_future.wait();
+ prefetch_future = {};
+ }
+
+ pos = working_buffer.end();
+
+ /// Note: we read in range [file_offset_of_buffer_end, read_until_position).
+ if (file_offset_of_buffer_end < read_until_position
+ && static_cast(file_offset_of_buffer_end) >= getPosition()
+ && static_cast(file_offset_of_buffer_end) < getPosition() + static_cast(min_bytes_for_seek))
+ {
+ /**
+ * Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
+ */
+ bytes_to_ignore = file_offset_of_buffer_end - getPosition();
+ }
+ else
+ {
+ impl->reset();
+ }
+
+ return file_offset_of_buffer_end;
+}
+
+
+void AsynchronousReadIndirectBufferFromRemoteFS::finalize()
+{
+ if (prefetch_future.valid())
+ {
+ ProfileEvents::increment(ProfileEvents::RemoteFSUnusedPrefetches);
+ prefetch_future.wait();
+ prefetch_future = {};
+ }
+}
+
+
+AsynchronousReadIndirectBufferFromRemoteFS::~AsynchronousReadIndirectBufferFromRemoteFS()
+{
+ finalize();
+}
+
+}
diff --git a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h
new file mode 100644
index 00000000000..d8fad08bc8a
--- /dev/null
+++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.h
@@ -0,0 +1,84 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+#include
+#endif
+
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+class ReadBufferFromRemoteFSGather;
+struct ReadSettings;
+
+/**
+ * Reads data from S3/HDFS/Web using stored paths in metadata.
+ * This class is an asynchronous version of ReadIndirectBufferFromRemoteFS.
+ *
+ * Buffers chain for diskS3:
+ * AsynchronousReadIndirectBufferFromRemoteFS -> ReadBufferFromRemoteFSGather ->
+ * -> ReadBufferFromS3 -> ReadBufferFromIStream.
+ *
+ * Buffers chain for diskWeb:
+ * AsynchronousReadIndirectBufferFromRemoteFS -> ReadBufferFromRemoteFSGather ->
+ * -> ReadBufferFromWebServer -> ReadWriteBufferFromHTTP -> ReadBufferFromIStream.
+ *
+ * We pass either `memory` or `prefetch_buffer` through this whole chain and get it back.
+ */
+class AsynchronousReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
+{
+public:
+ explicit AsynchronousReadIndirectBufferFromRemoteFS(
+ AsynchronousReaderPtr reader_, const ReadSettings & settings_,
+ std::shared_ptr impl_,
+ size_t min_bytes_for_seek = 1024 * 1024);
+
+ ~AsynchronousReadIndirectBufferFromRemoteFS() override;
+
+ off_t seek(off_t offset_, int whence) override;
+
+ off_t getPosition() override { return file_offset_of_buffer_end - available(); }
+
+ String getFileName() const override;
+
+ void prefetch() override;
+
+ void setReadUntilPosition(size_t position) override;
+
+ void setReadUntilEnd() override;
+
+private:
+ bool nextImpl() override;
+
+ void finalize();
+
+ bool hasPendingDataToRead();
+
+ std::future readInto(char * data, size_t size);
+
+ AsynchronousReaderPtr reader;
+
+ Int32 priority;
+
+ std::shared_ptr impl;
+
+ std::future prefetch_future;
+
+ size_t file_offset_of_buffer_end = 0;
+
+ Memory<> prefetch_buffer;
+
+ size_t min_bytes_for_seek;
+
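+ /// Number of bytes to skip before the next read; set by a short forward seek and applied lazily.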
+ size_t bytes_to_ignore = 0;
+
+ size_t read_until_position = 0;
+
+ bool must_read_until_position;
+};
+
+}
diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
new file mode 100644
index 00000000000..a9a94fa63e2
--- /dev/null
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
@@ -0,0 +1,182 @@
+#include "ReadBufferFromRemoteFSGather.h"
+
+#include
+#include
+#include
+
+#if USE_AWS_S3
+#include
+#endif
+
+#if USE_HDFS
+#include
+#endif
+
+#include
+#include
+#include
+
+namespace fs = std::filesystem;
+
+namespace DB
+{
+
+#if USE_AWS_S3
+SeekableReadBufferPtr ReadBufferFromS3Gather::createImplementationBuffer(const String & path, size_t read_until_position_) const
+{
+ return std::make_unique(client_ptr, bucket,
+ fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, settings, threadpool_read, read_until_position_);
+}
+#endif
+
+
+SeekableReadBufferPtr ReadBufferFromWebServerGather::createImplementationBuffer(const String & path, size_t read_until_position_) const
+{
+ return std::make_unique(fs::path(uri) / path, context, settings, threadpool_read, read_until_position_);
+}
+
+
+#if USE_HDFS
+SeekableReadBufferPtr ReadBufferFromHDFSGather::createImplementationBuffer(const String & path, size_t read_until_position_) const
+{
+ return std::make_unique(hdfs_uri, fs::path(hdfs_directory) / path, config, buf_size, read_until_position_);
+}
+#endif
+
+
+ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(const RemoteMetadata & metadata_, const String & path_)
+ : ReadBuffer(nullptr, 0)
+ , metadata(metadata_)
+ , canonical_path(path_)
+{
+}
+
+
+size_t ReadBufferFromRemoteFSGather::readInto(char * data, size_t size, size_t offset, size_t ignore)
+{
+ /**
+ * Set `data` to current working and internal buffers.
+ * Internal buffer with size `size`. Working buffer with size 0.
+ */
+ set(data, size);
+
+ absolute_position = offset;
+ bytes_to_ignore = ignore;
+
+ auto result = nextImpl();
+ bytes_to_ignore = 0;
+
+ if (result)
+ return working_buffer.size();
+
+ return 0;
+}
+
+
+void ReadBufferFromRemoteFSGather::initialize()
+{
+ /// One clickhouse file can be split into multiple files in remote fs.
+ auto current_buf_offset = absolute_position;
+ for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
+ {
+ const auto & [file_path, size] = metadata.remote_fs_objects[i];
+
+ if (size > current_buf_offset)
+ {
+ /// Do not create a new buffer if we already have what we need.
+ if (!current_buf || current_buf_idx != i)
+ {
+ current_buf = createImplementationBuffer(file_path, read_until_position);
+ current_buf_idx = i;
+ }
+
+ current_buf->seek(current_buf_offset, SEEK_SET);
+ return;
+ }
+
+ current_buf_offset -= size;
+ }
+ current_buf_idx = metadata.remote_fs_objects.size();
+ current_buf = nullptr;
+}
+
+
+bool ReadBufferFromRemoteFSGather::nextImpl()
+{
+ /// Find first available buffer that fits to given offset.
+ if (!current_buf)
+ initialize();
+
+ /// If current buffer has remaining data - use it.
+ if (current_buf)
+ {
+ if (readImpl())
+ return true;
+ }
+ else
+ return false;
+
+ /// If there is no available buffers - nothing to read.
+ if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
+ return false;
+
+ ++current_buf_idx;
+
+ const auto & current_path = metadata.remote_fs_objects[current_buf_idx].first;
+ current_buf = createImplementationBuffer(current_path, read_until_position);
+
+ return readImpl();
+}
+
+
+bool ReadBufferFromRemoteFSGather::readImpl()
+{
+ swap(*current_buf);
+
+ /**
+ * Lazy seek is performed here.
+ * In asynchronous buffer when seeking to offset in range [pos, pos + min_bytes_for_seek]
+ * we save how many bytes need to be ignored (new_offset - position() bytes).
+ */
+ if (bytes_to_ignore)
+ current_buf->ignore(bytes_to_ignore);
+
+ auto result = current_buf->next();
+
+ swap(*current_buf);
+
+ if (result)
+ absolute_position += working_buffer.size();
+
+ return result;
+}
+
+
+void ReadBufferFromRemoteFSGather::setReadUntilPosition(size_t position)
+{
+ read_until_position = position;
+ reset();
+}
+
+
+void ReadBufferFromRemoteFSGather::reset()
+{
+ current_buf.reset();
+}
+
+
+String ReadBufferFromRemoteFSGather::getFileName() const
+{
+ return canonical_path;
+}
+
+
+size_t ReadBufferFromRemoteFSGather::getFileSize() const
+{
+ size_t size = 0;
+ for (const auto & object : metadata.remote_fs_objects)
+ size += object.second;
+ return size;
+}
+
+}
diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h
new file mode 100644
index 00000000000..5bc7d4e4819
--- /dev/null
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h
@@ -0,0 +1,161 @@
+#pragma once
+
+#if !defined(ARCADIA_BUILD)
+#include
+#endif
+
+#include
+#include
+#include
+
+namespace Aws
+{
+namespace S3
+{
+class S3Client;
+}
+}
+
+namespace DB
+{
+
+/**
+ * Remote disk might need to split one clickhouse file into multiple files in remote fs.
+ * This class works like a proxy to allow transition from one file into multiple.
+ */
+class ReadBufferFromRemoteFSGather : public ReadBuffer
+{
+friend class ReadIndirectBufferFromRemoteFS;
+
+public:
+ explicit ReadBufferFromRemoteFSGather(const RemoteMetadata & metadata_, const String & path_);
+
+ String getFileName() const;
+
+ void reset();
+
+ void setReadUntilPosition(size_t position) override;
+
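+ /// Read up to `size` bytes into the external buffer `data`, starting at absolute `offset`
+ /// and skipping `ignore` bytes first. Returns the number of bytes read, 0 if nothing is left.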
+ size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
+
+ size_t getFileSize() const;
+
+protected:
+ virtual SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const = 0;
+
+ RemoteMetadata metadata;
+
+private:
+ bool nextImpl() override;
+
+ void initialize();
+
+ bool readImpl();
+
+ SeekableReadBufferPtr current_buf;
+
+ size_t current_buf_idx = 0;
+
+ size_t absolute_position = 0;
+
+ size_t bytes_to_ignore = 0;
+
+ size_t read_until_position = 0;
+
+ String canonical_path;
+};
+
+
+#if USE_AWS_S3
+/// Reads data from S3 using stored paths in metadata.
+class ReadBufferFromS3Gather final : public ReadBufferFromRemoteFSGather
+{
+public:
+ ReadBufferFromS3Gather(
+ const String & path_,
+ std::shared_ptr client_ptr_,
+ const String & bucket_,
+ IDiskRemote::Metadata metadata_,
+ size_t max_single_read_retries_,
+ const ReadSettings & settings_,
+ bool threadpool_read_ = false)
+ : ReadBufferFromRemoteFSGather(metadata_, path_)
+ , client_ptr(std::move(client_ptr_))
+ , bucket(bucket_)
+ , max_single_read_retries(max_single_read_retries_)
+ , settings(settings_)
+ , threadpool_read(threadpool_read_)
+ {
+ }
+
+ SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
+
+private:
+ std::shared_ptr client_ptr;
+ String bucket;
+ UInt64 max_single_read_retries;
+ ReadSettings settings;
+ bool threadpool_read;
+};
+#endif
+
+
+class ReadBufferFromWebServerGather final : public ReadBufferFromRemoteFSGather
+{
+public:
+ ReadBufferFromWebServerGather(
+ const String & path_,
+ const String & uri_,
+ RemoteMetadata metadata_,
+ ContextPtr context_,
+ size_t threadpool_read_,
+ const ReadSettings & settings_)
+ : ReadBufferFromRemoteFSGather(metadata_, path_)
+ , uri(uri_)
+ , context(context_)
+ , threadpool_read(threadpool_read_)
+ , settings(settings_)
+ {
+ }
+
+ SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
+
+private:
+ String uri;
+ ContextPtr context;
+ bool threadpool_read;
+ ReadSettings settings;
+};
+
+
+#if USE_HDFS
+/// Reads data from HDFS using stored paths in metadata.
+class ReadBufferFromHDFSGather final : public ReadBufferFromRemoteFSGather
+{
+public:
+ ReadBufferFromHDFSGather(
+ const String & path_,
+ const Poco::Util::AbstractConfiguration & config_,
+ const String & hdfs_uri_,
+ IDiskRemote::Metadata metadata_,
+ size_t buf_size_)
+ : ReadBufferFromRemoteFSGather(metadata_, path_)
+ , config(config_)
+ , buf_size(buf_size_)
+ {
+ const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
+ hdfs_directory = hdfs_uri_.substr(begin_of_path);
+ hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
+ }
+
+ SeekableReadBufferPtr createImplementationBuffer(const String & path, size_t read_until_position) const override;
+
+private:
+ const Poco::Util::AbstractConfiguration & config;
+ String hdfs_uri;
+ String hdfs_directory;
+ size_t buf_size;
+};
+#endif
+
+}
diff --git a/src/Disks/IO/ReadBufferFromWebServer.cpp b/src/Disks/IO/ReadBufferFromWebServer.cpp
new file mode 100644
index 00000000000..bda20f78e79
--- /dev/null
+++ b/src/Disks/IO/ReadBufferFromWebServer.cpp
@@ -0,0 +1,198 @@
+#include "ReadBufferFromWebServer.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+ extern const int SEEK_POSITION_OUT_OF_BOUND;
+ extern const int LOGICAL_ERROR;
+}
+
+
+static constexpr size_t HTTP_MAX_TRIES = 10;
+static constexpr size_t WAIT_INIT = 100;
+
+ReadBufferFromWebServer::ReadBufferFromWebServer(
+ const String & url_,
+ ContextPtr context_,
+ const ReadSettings & settings_,
+ bool use_external_buffer_,
+ size_t last_offset_)
+ : SeekableReadBuffer(nullptr, 0)
+ , log(&Poco::Logger::get("ReadBufferFromWebServer"))
+ , context(context_)
+ , url(url_)
+ , buf_size(settings_.remote_fs_buffer_size)
+ , read_settings(settings_)
+ , use_external_buffer(use_external_buffer_)
+ , last_offset(last_offset_)
+{
+}
+
+
+std::unique_ptr ReadBufferFromWebServer::initialize()
+{
+ Poco::URI uri(url);
+
+ ReadWriteBufferFromHTTP::HTTPHeaderEntries headers;
+
+ if (last_offset)
+ {
+ if (last_offset < offset)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, last_offset - 1);
+
+ headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-{}", offset, last_offset - 1)));
+ LOG_DEBUG(log, "Reading with range: {}-{}", offset, last_offset);
+ }
+ else
+ {
+ headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-", offset)));
+ LOG_DEBUG(log, "Reading from offset: {}", offset);
+ }
+
+ const auto & settings = context->getSettingsRef();
+ const auto & config = context->getConfigRef();
+ Poco::Timespan http_keep_alive_timeout{config.getUInt("keep_alive_timeout", 20), 0};
+
+ return std::make_unique(
+ uri,
+ Poco::Net::HTTPRequest::HTTP_GET,
+ ReadWriteBufferFromHTTP::OutStreamCallback(),
+ ConnectionTimeouts(std::max(Poco::Timespan(settings.http_connection_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
+ settings.http_send_timeout,
+ std::max(Poco::Timespan(settings.http_receive_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
+ settings.tcp_keep_alive_timeout,
+ http_keep_alive_timeout),
+ 0,
+ Poco::Net::HTTPBasicCredentials{},
+ buf_size,
+ read_settings,
+ headers,
+ context->getRemoteHostFilter(),
+ use_external_buffer);
+}
+
+
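+/// Open the HTTP read buffer with up to HTTP_MAX_TRIES attempts,
+/// doubling the wait between attempts starting from WAIT_INIT milliseconds.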
+void ReadBufferFromWebServer::initializeWithRetry()
+{
+ /// Initialize impl with retry.
+ size_t milliseconds_to_wait = WAIT_INIT;
+ for (size_t i = 0; i < HTTP_MAX_TRIES; ++i)
+ {
+ try
+ {
+ impl = initialize();
+
+ if (use_external_buffer)
+ {
+ /**
+ * See comment 30 lines lower.
+ */
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+
+ break;
+ }
+ catch (Poco::Exception & e)
+ {
+ if (i == HTTP_MAX_TRIES - 1)
+ throw;
+
+ LOG_ERROR(&Poco::Logger::get("ReadBufferFromWeb"), "Error: {}, code: {}", e.what(), e.code());
+ sleepForMilliseconds(milliseconds_to_wait);
+ milliseconds_to_wait *= 2;
+ }
+ }
+}
+
+
+bool ReadBufferFromWebServer::nextImpl()
+{
+ if (last_offset)
+ {
+ if (last_offset == offset)
+ return false;
+
+ if (last_offset < offset)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, last_offset - 1);
+ }
+
+ if (impl)
+ {
+ if (use_external_buffer)
+ {
+ /**
+ * use_external_buffer -- means we read into the buffer which
+ * was passed to us from somewhere else. We do not check whether
+ * previously returned buffer was read or not, because this branch
+ * means we are prefetching data, each nextImpl() call we can fill
+ * a different buffer.
+ */
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+ else
+ {
+ /**
+ * impl was initialized before, pass position() to it to make
+ * sure there is no pending data which was not read, because
+ * this branch means we read sequentially.
+ */
+ impl->position() = position();
+ assert(!impl->hasPendingData());
+ }
+ }
+ else
+ {
+ initializeWithRetry();
+ }
+
+ auto result = impl->next();
+ if (result)
+ {
+ BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
+ offset += working_buffer.size();
+ }
+
+ return result;
+}
+
+
+off_t ReadBufferFromWebServer::seek(off_t offset_, int whence)
+{
+ if (impl)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Seek is allowed only before first read attempt from the buffer");
+
+ if (whence != SEEK_SET)
+ throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed");
+
+ if (offset_ < 0)
+ throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", std::to_string(offset_));
+
+ offset = offset_;
+
+ return offset;
+}
+
+
+off_t ReadBufferFromWebServer::getPosition()
+{
+ return offset - available();
+}
+
+}
diff --git a/src/Disks/ReadIndirectBufferFromWebServer.h b/src/Disks/IO/ReadBufferFromWebServer.h
similarity index 56%
rename from src/Disks/ReadIndirectBufferFromWebServer.h
rename to src/Disks/IO/ReadBufferFromWebServer.h
index 04bb155f83b..1ffb8589392 100644
--- a/src/Disks/ReadIndirectBufferFromWebServer.h
+++ b/src/Disks/IO/ReadBufferFromWebServer.h
@@ -2,6 +2,7 @@
#include
#include
+#include
#include
@@ -11,15 +12,16 @@ namespace DB
/* Read buffer, which reads via http, but is used as ReadBufferFromFileBase.
* Used to read files, hosted on a web server with static files.
*
- * Usage: ReadIndirectBufferFromRemoteFS -> SeekAvoidingReadBuffer -> ReadIndirectBufferFromWebServer -> ReadWriteBufferFromHTTP.
+ * Usage: ReadIndirectBufferFromRemoteFS -> SeekAvoidingReadBuffer -> ReadBufferFromWebServer -> ReadWriteBufferFromHTTP.
*/
-class ReadIndirectBufferFromWebServer : public BufferWithOwnMemory
+class ReadBufferFromWebServer : public SeekableReadBuffer
{
public:
- explicit ReadIndirectBufferFromWebServer(const String & url_,
- ContextPtr context_,
- size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE,
- size_t backoff_threshold_ = 10000, size_t max_tries_ = 4);
+ explicit ReadBufferFromWebServer(
+ const String & url_, ContextPtr context_,
+ const ReadSettings & settings_ = {},
+ bool use_external_buffer_ = false,
+ size_t last_offset = 0);
bool nextImpl() override;
@@ -30,6 +32,8 @@ public:
private:
std::unique_ptr initialize();
+ void initializeWithRetry();
+
Poco::Logger * log;
ContextPtr context;
@@ -40,8 +44,11 @@ private:
off_t offset = 0;
- size_t backoff_threshold_ms;
- size_t max_tries;
+ ReadSettings read_settings;
+
+ bool use_external_buffer;
+
+ off_t last_offset = 0;
};
}
diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp
new file mode 100644
index 00000000000..112124d9fd7
--- /dev/null
+++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.cpp
@@ -0,0 +1,85 @@
+#include "ReadIndirectBufferFromRemoteFS.h"
+
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int CANNOT_SEEK_THROUGH_FILE;
+}
+
+
+ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS(
+ std::shared_ptr impl_) : impl(std::move(impl_))
+{
+}
+
+
+off_t ReadIndirectBufferFromRemoteFS::getPosition()
+{
+ return impl->absolute_position - available();
+}
+
+
+String ReadIndirectBufferFromRemoteFS::getFileName() const
+{
+ return impl->getFileName();
+}
+
+
+off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
+{
+ if (whence == SEEK_CUR)
+ {
+ /// If position within current working buffer - shift pos.
+ if (!working_buffer.empty() && size_t(getPosition() + offset_) < impl->absolute_position)
+ {
+ pos += offset_;
+ return getPosition();
+ }
+ else
+ {
+ impl->absolute_position += offset_;
+ }
+ }
+ else if (whence == SEEK_SET)
+ {
+ /// If position within current working buffer - shift pos.
+ if (!working_buffer.empty()
+ && size_t(offset_) >= impl->absolute_position - working_buffer.size()
+ && size_t(offset_) < impl->absolute_position)
+ {
+ pos = working_buffer.end() - (impl->absolute_position - offset_);
+ return getPosition();
+ }
+ else
+ {
+ impl->absolute_position = offset_;
+ }
+ }
+ else
+ throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
+
+ impl->reset();
+ pos = working_buffer.end();
+
+ return impl->absolute_position;
+}
+
+
+bool ReadIndirectBufferFromRemoteFS::nextImpl()
+{
+ /// Transfer current position and working_buffer to actual ReadBuffer
+ swap(*impl);
+ /// Position and working_buffer will be updated in next() call
+ auto result = impl->next();
+ /// and assigned to current buffer.
+ swap(*impl);
+
+ return result;
+}
+
+}
diff --git a/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h
new file mode 100644
index 00000000000..0c8b1b4dd21
--- /dev/null
+++ b/src/Disks/IO/ReadIndirectBufferFromRemoteFS.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+class ReadBufferFromRemoteFSGather;
+
+/**
+ * Reads data from S3/HDFS/Web using stored paths in metadata.
+ * There is an asynchronous version of this class -- AsynchronousReadIndirectBufferFromRemoteFS.
+ */
+class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
+{
+
+public:
+ explicit ReadIndirectBufferFromRemoteFS(std::shared_ptr impl_);
+
+ off_t seek(off_t offset_, int whence) override;
+
+ off_t getPosition() override;
+
+ String getFileName() const override;
+
+private:
+ bool nextImpl() override;
+
+ std::shared_ptr<ReadBufferFromRemoteFSGather> impl;
+};
+
+}
diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp
new file mode 100644
index 00000000000..945b2d3eb7e
--- /dev/null
+++ b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp
@@ -0,0 +1,68 @@
+#include "ThreadPoolRemoteFSReader.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+
+namespace ProfileEvents
+{
+ extern const Event RemoteFSReadMicroseconds;
+ extern const Event RemoteFSReadBytes;
+}
+
+namespace CurrentMetrics
+{
+ extern const Metric Read;
+}
+
+namespace DB
+{
+
+size_t ThreadPoolRemoteFSReader::RemoteFSFileDescriptor::readInto(char * data, size_t size, size_t offset, size_t ignore)
+{
+ return reader->readInto(data, size, offset, ignore);
+}
+
+
+ThreadPoolRemoteFSReader::ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_)
+ : pool(pool_size, pool_size, queue_size_)
+{
+}
+
+
+std::future ThreadPoolRemoteFSReader::submit(Request request)
+{
+ auto task = std::make_shared>([request]
+ {
+ setThreadName("ThreadPoolRemoteFSRead");
+ CurrentMetrics::Increment metric_increment{CurrentMetrics::Read};
+ auto * remote_fs_fd = assert_cast<RemoteFSFileDescriptor *>(request.descriptor.get());
+
+ Stopwatch watch(CLOCK_MONOTONIC);
+ auto bytes_read = remote_fs_fd->readInto(request.buf, request.size, request.offset, request.ignore);
+ watch.stop();
+
+ ProfileEvents::increment(ProfileEvents::RemoteFSReadMicroseconds, watch.elapsedMicroseconds());
+ ProfileEvents::increment(ProfileEvents::RemoteFSReadBytes, bytes_read);
+
+ return bytes_read;
+ });
+
+ auto future = task->get_future();
+
+ /// ThreadPool uses "bigger value -- higher priority", while Request::priority uses "lower value -- higher priority", hence the negation.
+ pool.scheduleOrThrow([task]{ (*task)(); }, -request.priority);
+
+ return future;
+}
+}
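
submit() above follows a common pattern: wrap the job in a std::packaged_task, return its future to the caller, and negate the priority because the pool and the reader interface rank priorities in opposite directions. A self-contained sketch of that shape, assuming only the standard library — `ToyPool` is a hypothetical stand-in for ClickHouse's ThreadPool and simply runs jobs inline:

    #include <cstddef>
    #include <cstdint>
    #include <functional>
    #include <future>
    #include <iostream>
    #include <memory>

    // Hypothetical stand-in for ThreadPool::scheduleOrThrow: runs the job inline.
    // A real pool would enqueue jobs and run them by descending `pool_priority`.
    struct ToyPool
    {
        void scheduleOrThrow(std::function<void()> job, int64_t /*pool_priority*/) { job(); }
    };

    // Wrap a read job in a packaged_task so the caller gets a future, and flip
    // the sign of the priority: the request side says "lower value = more urgent",
    // the pool side says "bigger value = more urgent".
    std::future<size_t> submitRead(ToyPool & pool, std::function<size_t()> read_job, int64_t request_priority)
    {
        auto task = std::make_shared<std::packaged_task<size_t()>>(std::move(read_job));
        auto future = task->get_future();
        pool.scheduleOrThrow([task] { (*task)(); }, -request_priority);
        return future;
    }

    int main()
    {
        ToyPool pool;
        auto bytes = submitRead(pool, [] { return size_t{42}; }, /*request_priority=*/1);
        std::cout << "bytes read: " << bytes.get() << '\n';
        return 0;
    }
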
diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h
new file mode 100644
index 00000000000..c300162e214
--- /dev/null
+++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+class ReadBufferFromRemoteFSGather;
+
+class ThreadPoolRemoteFSReader : public IAsynchronousReader
+{
+
+private:
+ ThreadPool pool;
+
+public:
+ ThreadPoolRemoteFSReader(size_t pool_size, size_t queue_size_);
+
+ std::future submit(Request request) override;
+
+ struct RemoteFSFileDescriptor;
+};
+
+
+struct ThreadPoolRemoteFSReader::RemoteFSFileDescriptor : public IFileDescriptor
+{
+public:
+ RemoteFSFileDescriptor(std::shared_ptr<ReadBufferFromRemoteFSGather> reader_) : reader(reader_) {}
+
+ size_t readInto(char * data, size_t size, size_t offset, size_t ignore = 0);
+
+private:
+ std::shared_ptr<ReadBufferFromRemoteFSGather> reader;
+};
+
+}
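
RemoteFSFileDescriptor exists so the gather buffer can travel through the generic IAsynchronousReader::Request as an opaque file descriptor and be cast back inside the worker (the patch uses assert_cast for this). A toy round trip of that type erasure, using dynamic_cast and placeholder types rather than the real ones:

    #include <cassert>
    #include <cstddef>
    #include <memory>

    // Generic "file descriptor" the asynchronous-reader interface knows about.
    struct IFileDescriptor { virtual ~IFileDescriptor() = default; };

    // Hypothetical remote reader that can fill a caller-provided buffer.
    struct RemoteReader
    {
        size_t readInto(char * data, size_t size, size_t /*offset*/, size_t /*ignore*/)
        {
            for (size_t i = 0; i < size; ++i)
                data[i] = 'r';
            return size;
        }
    };

    // Concrete descriptor: a handle that keeps the reader alive until the job runs.
    struct RemoteFSDescriptor : IFileDescriptor
    {
        explicit RemoteFSDescriptor(std::shared_ptr<RemoteReader> reader_) : reader(std::move(reader_)) {}
        std::shared_ptr<RemoteReader> reader;
    };

    int main()
    {
        // Caller side: hide the concrete reader behind the generic descriptor type.
        std::shared_ptr<IFileDescriptor> fd = std::make_shared<RemoteFSDescriptor>(std::make_shared<RemoteReader>());

        // Worker side: recover the concrete type and delegate the actual read.
        char buf[16];
        auto * remote_fd = dynamic_cast<RemoteFSDescriptor *>(fd.get());
        assert(remote_fd != nullptr);
        assert(remote_fd->reader->readInto(buf, sizeof(buf), /*offset=*/0, /*ignore=*/0) == sizeof(buf));
        return 0;
    }
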
diff --git a/src/Disks/WriteIndirectBufferFromRemoteFS.cpp b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp
similarity index 100%
rename from src/Disks/WriteIndirectBufferFromRemoteFS.cpp
rename to src/Disks/IO/WriteIndirectBufferFromRemoteFS.cpp
diff --git a/src/Disks/WriteIndirectBufferFromRemoteFS.h b/src/Disks/IO/WriteIndirectBufferFromRemoteFS.h
similarity index 100%
rename from src/Disks/WriteIndirectBufferFromRemoteFS.h
rename to src/Disks/IO/WriteIndirectBufferFromRemoteFS.h
diff --git a/src/Disks/ReadIndirectBufferFromRemoteFS.cpp b/src/Disks/ReadIndirectBufferFromRemoteFS.cpp
deleted file mode 100644
index 3bc22167f50..00000000000
--- a/src/Disks/ReadIndirectBufferFromRemoteFS.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-#include "ReadIndirectBufferFromRemoteFS.h"
-
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int CANNOT_SEEK_THROUGH_FILE;
-}
-
-
-template
-ReadIndirectBufferFromRemoteFS::ReadIndirectBufferFromRemoteFS(
- RemoteMetadata metadata_)
- : metadata(std::move(metadata_))
-{
-}
-
-
-template
-off_t ReadIndirectBufferFromRemoteFS::seek(off_t offset_, int whence)
-{
- if (whence == SEEK_CUR)
- {
- /// If position within current working buffer - shift pos.
- if (!working_buffer.empty() && size_t(getPosition() + offset_) < absolute_position)
- {
- pos += offset_;
- return getPosition();
- }
- else
- {
- absolute_position += offset_;
- }
- }
- else if (whence == SEEK_SET)
- {
- /// If position within current working buffer - shift pos.
- if (!working_buffer.empty() && size_t(offset_) >= absolute_position - working_buffer.size()
- && size_t(offset_) < absolute_position)
- {
- pos = working_buffer.end() - (absolute_position - offset_);
- return getPosition();
- }
- else
- {
- absolute_position = offset_;
- }
- }
- else
- throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
-
- current_buf = initialize();
- pos = working_buffer.end();
-
- return absolute_position;
-}
-
-
-template
-std::unique_ptr ReadIndirectBufferFromRemoteFS::initialize()
-{
- size_t offset = absolute_position;
- for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
- {
- current_buf_idx = i;
- const auto & [file_path, size] = metadata.remote_fs_objects[i];
- if (size > offset)
- {
- auto buf = createReadBuffer(file_path);
- buf->seek(offset, SEEK_SET);
- return buf;
- }
- offset -= size;
- }
- return nullptr;
-}
-
-
-template
-bool ReadIndirectBufferFromRemoteFS::nextImpl()
-{
- /// Find first available buffer that fits to given offset.
- if (!current_buf)
- current_buf = initialize();
-
- /// If current buffer has remaining data - use it.
- if (current_buf)
- {
- bool result = nextAndShiftPosition();
- if (result)
- return true;
- }
-
- /// If there is no available buffers - nothing to read.
- if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
- return false;
-
- ++current_buf_idx;
- const auto & path = metadata.remote_fs_objects[current_buf_idx].first;
-
- current_buf = createReadBuffer(path);
-
- return nextAndShiftPosition();
-}
-
-template
-bool ReadIndirectBufferFromRemoteFS::nextAndShiftPosition()
-{
- /// Transfer current position and working_buffer to actual ReadBuffer
- swap(*current_buf);
- /// Position and working_buffer will be updated in next() call
- auto result = current_buf->next();
- /// and assigned to current buffer.
- swap(*current_buf);
-
- /// absolute position is shifted by a data size that was read in next() call above.
- if (result)
- absolute_position += working_buffer.size();
-
- return result;
-}
-
-
-#if USE_AWS_S3
-template
-class ReadIndirectBufferFromRemoteFS;
-#endif
-
-#if USE_HDFS
-template
-class ReadIndirectBufferFromRemoteFS;
-#endif
-
-template
-class ReadIndirectBufferFromRemoteFS;
-
-}
diff --git a/src/Disks/ReadIndirectBufferFromRemoteFS.h b/src/Disks/ReadIndirectBufferFromRemoteFS.h
deleted file mode 100644
index bf7f95c7987..00000000000
--- a/src/Disks/ReadIndirectBufferFromRemoteFS.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#pragma once
-
-#include
-
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-/// Reads data from S3/HDFS using stored paths in metadata.
-template
-class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
-{
-public:
- explicit ReadIndirectBufferFromRemoteFS(RemoteMetadata metadata_);
-
- off_t seek(off_t offset_, int whence) override;
-
- off_t getPosition() override { return absolute_position - available(); }
-
- String getFileName() const override { return metadata.metadata_file_path; }
-
- virtual std::unique_ptr createReadBuffer(const String & path) = 0;
-
-protected:
- RemoteMetadata metadata;
-
-private:
- std::unique_ptr initialize();
-
- bool nextAndShiftPosition();
-
- bool nextImpl() override;
-
- size_t absolute_position = 0;
-
- size_t current_buf_idx = 0;
-
- std::unique_ptr current_buf;
-};
-
-}
diff --git a/src/Disks/ReadIndirectBufferFromWebServer.cpp b/src/Disks/ReadIndirectBufferFromWebServer.cpp
deleted file mode 100644
index f4c01784542..00000000000
--- a/src/Disks/ReadIndirectBufferFromWebServer.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-#include "ReadIndirectBufferFromWebServer.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int CANNOT_SEEK_THROUGH_FILE;
- extern const int SEEK_POSITION_OUT_OF_BOUND;
- extern const int NETWORK_ERROR;
-}
-
-static const auto WAIT_MS = 10;
-
-
-ReadIndirectBufferFromWebServer::ReadIndirectBufferFromWebServer(
- const String & url_, ContextPtr context_, size_t buf_size_, size_t backoff_threshold_, size_t max_tries_)
- : BufferWithOwnMemory(buf_size_)
- , log(&Poco::Logger::get("ReadIndirectBufferFromWebServer"))
- , context(context_)
- , url(url_)
- , buf_size(buf_size_)
- , backoff_threshold_ms(backoff_threshold_)
- , max_tries(max_tries_)
-{
-}
-
-
-std::unique_ptr ReadIndirectBufferFromWebServer::initialize()
-{
- Poco::URI uri(url);
-
- ReadWriteBufferFromHTTP::HTTPHeaderEntries headers;
- headers.emplace_back(std::make_pair("Range", fmt::format("bytes={}-", offset)));
- const auto & settings = context->getSettingsRef();
- LOG_DEBUG(log, "Reading from offset: {}", offset);
- const auto & config = context->getConfigRef();
- Poco::Timespan http_keep_alive_timeout{config.getUInt("keep_alive_timeout", 20), 0};
-
- return std::make_unique(
- uri,
- Poco::Net::HTTPRequest::HTTP_GET,
- ReadWriteBufferFromHTTP::OutStreamCallback(),
- ConnectionTimeouts(std::max(Poco::Timespan(settings.http_connection_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
- settings.http_send_timeout,
- std::max(Poco::Timespan(settings.http_receive_timeout.totalSeconds(), 0), Poco::Timespan(20, 0)),
- settings.tcp_keep_alive_timeout,
- http_keep_alive_timeout),
- 0,
- Poco::Net::HTTPBasicCredentials{},
- buf_size,
- headers);
-}
-
-
-bool ReadIndirectBufferFromWebServer::nextImpl()
-{
- bool next_result = false, successful_read = false;
- UInt16 milliseconds_to_wait = WAIT_MS;
-
- if (impl)
- {
- /// Restore correct position at the needed offset.
- impl->position() = position();
- assert(!impl->hasPendingData());
- }
-
- WriteBufferFromOwnString error_msg;
- for (size_t i = 0; (i < max_tries) && !successful_read && !next_result; ++i)
- {
- while (milliseconds_to_wait < backoff_threshold_ms)
- {
- try
- {
- if (!impl)
- {
- impl = initialize();
- next_result = impl->hasPendingData();
- if (next_result)
- break;
- }
-
- next_result = impl->next();
- successful_read = true;
- break;
- }
- catch (const Poco::Exception & e)
- {
- LOG_WARNING(log, "Read attempt failed for url: {}. Error: {}", url, e.what());
- error_msg << fmt::format("Error: {}\n", e.what());
-
- sleepForMilliseconds(milliseconds_to_wait);
- milliseconds_to_wait *= 2;
- impl.reset();
- }
- }
- milliseconds_to_wait = WAIT_MS;
- }
-
- if (!successful_read)
- throw Exception(ErrorCodes::NETWORK_ERROR,
- "All read attempts failed for url: {}. Reason:\n{}", url, error_msg.str());
-
- if (next_result)
- {
- BufferBase::set(impl->buffer().begin(), impl->buffer().size(), impl->offset());
- offset += working_buffer.size();
- }
-
- return next_result;
-}
-
-
-off_t ReadIndirectBufferFromWebServer::seek(off_t offset_, int whence)
-{
- if (impl)
- throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Seek is allowed only before first read attempt from the buffer");
-
- if (whence != SEEK_SET)
- throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed");
-
- if (offset_ < 0)
- throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", std::to_string(offset_));
-
- offset = offset_;
-
- return offset;
-}
-
-
-off_t ReadIndirectBufferFromWebServer::getPosition()
-{
- return offset - available();
-}
-
-}
diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp
index c8e248538c0..ef8bb8e0feb 100644
--- a/src/Disks/S3/DiskS3.cpp
+++ b/src/Disks/S3/DiskS3.cpp
@@ -17,11 +17,7 @@
#include
#include
-#include
-#include
-
#include
-
#include
#include
#include
@@ -29,6 +25,12 @@
#include
#include
+#include
+#include
+#include
+#include
+#include
+
#include
#include
#include
@@ -127,47 +129,19 @@ void throwIfError(const Aws::Utils::Outcome & response)
}
}
-/// Reads data from S3 using stored paths in metadata.
-class ReadIndirectBufferFromS3 final : public ReadIndirectBufferFromRemoteFS
-{
-public:
- ReadIndirectBufferFromS3(
- std::shared_ptr client_ptr_,
- const String & bucket_,
- DiskS3::Metadata metadata_,
- size_t max_single_read_retries_,
- size_t buf_size_)
- : ReadIndirectBufferFromRemoteFS(metadata_)
- , client_ptr(std::move(client_ptr_))
- , bucket(bucket_)
- , max_single_read_retries(max_single_read_retries_)
- , buf_size(buf_size_)
- {
- }
-
- std::unique_ptr createReadBuffer(const String & path) override
- {
- return std::make_unique(client_ptr, bucket, fs::path(metadata.remote_fs_root_path) / path, max_single_read_retries, buf_size);
- }
-
-private:
- std::shared_ptr client_ptr;
- const String & bucket;
- UInt64 max_single_read_retries;
- size_t buf_size;
-};
-
DiskS3::DiskS3(
String name_,
String bucket_,
String s3_root_path_,
String metadata_path_,
+ ContextPtr context_,
SettingsPtr settings_,
GetDiskSettings settings_getter_)
: IDiskRemote(name_, s3_root_path_, metadata_path_, "DiskS3", settings_->thread_pool_size)
, bucket(std::move(bucket_))
, current_settings(std::move(settings_))
, settings_getter(settings_getter_)
+ , context(context_)
{
}
@@ -230,9 +204,23 @@ std::unique_ptr DiskS3::readFile(const String & path, co
LOG_TRACE(log, "Read from file by path: {}. Existing S3 objects: {}",
backQuote(metadata_path + path), metadata.remote_fs_objects.size());
- auto reader = std::make_unique(
- settings->client, bucket, metadata, settings->s3_max_single_read_retries, read_settings.remote_fs_buffer_size);
- return std::make_unique(std::move(reader), settings->min_bytes_for_seek);
+ bool threadpool_read = read_settings.remote_fs_method == RemoteFSReadMethod::read_threadpool;
+
+ auto s3_impl = std::make_unique(
+ path,
+ settings->client, bucket, metadata,
+ settings->s3_max_single_read_retries, read_settings, threadpool_read);
+
+ if (threadpool_read)
+ {
+ auto reader = getThreadPoolReader();
+ return std::make_unique(reader, read_settings, std::move(s3_impl));
+ }
+ else
+ {
+ auto buf = std::make_unique(std::move(s3_impl));
+ return std::make_unique(std::move(buf), settings->min_bytes_for_seek);
+ }
}
std::unique_ptr DiskS3::writeFile(const String & path, size_t buf_size, WriteMode mode)
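
The rewritten readFile() boils down to a two-way dispatch: build the S3 gather buffer, then either hand it to the threadpool-backed asynchronous wrapper or keep the old synchronous path with seek avoidance. A condensed sketch of that dispatch shape — the buffer classes below are simplified placeholders, not the actual ClickHouse types:

    #include <iostream>
    #include <memory>
    #include <string>

    enum class RemoteFSReadMethod { read, read_threadpool };

    // Simplified placeholders for the two kinds of buffers readFile() can return.
    struct IRemoteReadBuffer { virtual ~IRemoteReadBuffer() = default; virtual std::string describe() const = 0; };
    struct SyncRemoteBuffer : IRemoteReadBuffer { std::string describe() const override { return "synchronous gather read"; } };
    struct AsyncRemoteBuffer : IRemoteReadBuffer { std::string describe() const override { return "threadpool-backed asynchronous read"; } };

    // Shape of the dispatch: pick the asynchronous wrapper only when the setting asks for it.
    std::unique_ptr<IRemoteReadBuffer> openRemoteFile(RemoteFSReadMethod method)
    {
        if (method == RemoteFSReadMethod::read_threadpool)
            return std::make_unique<AsyncRemoteBuffer>();
        return std::make_unique<SyncRemoteBuffer>();
    }

    int main()
    {
        std::cout << openRemoteFile(RemoteFSReadMethod::read)->describe() << '\n';
        std::cout << openRemoteFile(RemoteFSReadMethod::read_threadpool)->describe() << '\n';
        return 0;
    }
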
@@ -378,7 +366,7 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc
source_bucket,
source_path + SCHEMA_VERSION_OBJECT,
settings->s3_max_single_read_retries,
- DBMS_DEFAULT_BUFFER_SIZE);
+ context->getReadSettings());
readIntText(version, buffer);
@@ -1033,9 +1021,9 @@ void DiskS3::onFreeze(const String & path)
revision_file_buf.finalize();
}
-void DiskS3::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context, const String &, const DisksMap &)
+void DiskS3::applyNewSettings(const Poco::Util::AbstractConfiguration & config, ContextPtr context_, const String &, const DisksMap &)
{
- auto new_settings = settings_getter(config, "storage_configuration.disks." + name, context);
+ auto new_settings = settings_getter(config, "storage_configuration.disks." + name, context_);
current_settings.set(std::move(new_settings));
diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h
index b8f83b4763d..19bcb925cb4 100644
--- a/src/Disks/S3/DiskS3.h
+++ b/src/Disks/S3/DiskS3.h
@@ -69,6 +69,7 @@ public:
String bucket_,
String s3_root_path_,
String metadata_path_,
+ ContextPtr context_,
SettingsPtr settings_,
GetDiskSettings settings_getter_);
@@ -175,6 +176,8 @@ private:
static constexpr int RESTORABLE_SCHEMA_VERSION = 1;
/// Directories with data.
const std::vector data_roots {"data", "store"};
+
+ ContextPtr context;
};
}
diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp
index 57c2cf17239..8174ccea330 100644
--- a/src/Disks/S3/registerDiskS3.cpp
+++ b/src/Disks/S3/registerDiskS3.cpp
@@ -184,6 +184,7 @@ void registerDiskS3(DiskFactory & factory)
uri.bucket,
uri.key,
metadata_path,
+ context,
getSettings(config, config_prefix, context),
getSettings);
diff --git a/src/IO/AsynchronousReader.h b/src/IO/AsynchronousReader.h
index 77b4a2f5b22..e4a81623205 100644
--- a/src/IO/AsynchronousReader.h
+++ b/src/IO/AsynchronousReader.h
@@ -46,6 +46,7 @@ public:
size_t size = 0;
char * buf = nullptr;
int64_t priority = 0;
+ size_t ignore = 0;
};
/// Less than the requested amount of data can be returned.
diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h
index 609065feb7e..be456ea398c 100644
--- a/src/IO/ReadBuffer.h
+++ b/src/IO/ReadBuffer.h
@@ -202,6 +202,12 @@ public:
*/
virtual void prefetch() {}
+ /**
+ * For reading from a remote filesystem, when it matters exactly how much data we are going to read.
+ */
+ virtual void setReadUntilPosition(size_t /* position */) {}
+ virtual void setReadUntilEnd() {}
+
protected:
/// The number of bytes to ignore from the initial position of `working_buffer`
/// buffer. Apparently this is an additional out-parameter for nextImpl(),
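
setReadUntilPosition()/setReadUntilEnd() let a caller announce the exact right bound it will need, so a remote implementation can request precisely that byte range instead of streaming to the end of the object. A toy reader honouring such a bound, with plain standard C++ types standing in for the real buffers:

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <string>

    // Toy source: a string pretending to be a remote object.
    class BoundedReader
    {
    public:
        explicit BoundedReader(std::string data_) : data(std::move(data_)), until(data.size()) {}

        // Caller promises it will not need bytes at or after `position`.
        void setReadUntilPosition(size_t position) { until = std::min(position, data.size()); }
        void setReadUntilEnd() { until = data.size(); }

        // Read at most `size` bytes, never crossing the announced bound.
        size_t read(char * out, size_t size)
        {
            size_t n = std::min(size, until - offset);
            std::copy_n(data.data() + offset, n, out);
            offset += n;
            return n;
        }

    private:
        std::string data;
        size_t offset = 0;
        size_t until = 0;
    };

    int main()
    {
        BoundedReader reader("0123456789");
        reader.setReadUntilPosition(4);               // only the first 4 bytes are needed
        char buf[8];
        assert(reader.read(buf, sizeof(buf)) == 4);
        assert(reader.read(buf, sizeof(buf)) == 0);   // bound reached, nothing more to fetch
        return 0;
    }
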
diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp
index b1c4cdc3bb9..78d296be60e 100644
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@@ -2,15 +2,17 @@
#if USE_AWS_S3
-# include
-# include
-# include
+#include
+#include
+#include
-# include
-# include
-# include
+#include
+#include
-# include
+#include
+#include
+
+#include
namespace ProfileEvents
@@ -27,43 +29,81 @@ namespace ErrorCodes
extern const int S3_ERROR;
extern const int CANNOT_SEEK_THROUGH_FILE;
extern const int SEEK_POSITION_OUT_OF_BOUND;
+ extern const int LOGICAL_ERROR;
}
ReadBufferFromS3::ReadBufferFromS3(
- std::shared_ptr client_ptr_, const String & bucket_, const String & key_, UInt64 max_single_read_retries_, size_t buffer_size_)
+ std::shared_ptr client_ptr_, const String & bucket_, const String & key_,
+ UInt64 max_single_read_retries_, const ReadSettings & settings_, bool use_external_buffer_, size_t read_until_position_)
: SeekableReadBuffer(nullptr, 0)
, client_ptr(std::move(client_ptr_))
, bucket(bucket_)
, key(key_)
, max_single_read_retries(max_single_read_retries_)
- , buffer_size(buffer_size_)
+ , read_settings(settings_)
+ , use_external_buffer(use_external_buffer_)
+ , read_until_position(read_until_position_)
{
}
bool ReadBufferFromS3::nextImpl()
{
+ if (read_until_position)
+ {
+ if (read_until_position == offset)
+ return false;
+
+ if (read_until_position < offset)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
+ }
+
bool next_result = false;
if (impl)
{
- /// `impl` has been initialized earlier and now we're at the end of the current portion of data.
- impl->position() = position();
- assert(!impl->hasPendingData());
- }
- else
- {
- /// `impl` is not initialized and we're about to read the first portion of data.
- impl = initialize();
- next_result = impl->hasPendingData();
+ if (use_external_buffer)
+ {
+ /**
+ * use_external_buffer means we read into a buffer that
+ * was passed to us from somewhere else. We do not check whether
+ * the previously returned buffer has been fully read (no hasPendingData() check is needed),
+ * because this branch means we are prefetching data:
+ * each nextImpl() call may fill a different buffer.
+ */
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+ else
+ {
+ /**
+ * impl was initialized earlier; pass position() back to it to make
+ * sure there is no pending data left unread.
+ */
+ impl->position() = position();
+ assert(!impl->hasPendingData());
+ }
}
- auto sleep_time_with_backoff_milliseconds = std::chrono::milliseconds(100);
+ size_t sleep_time_with_backoff_milliseconds = 100;
for (size_t attempt = 0; (attempt < max_single_read_retries) && !next_result; ++attempt)
{
Stopwatch watch;
try
{
+ if (!impl)
+ {
+ impl = initialize();
+
+ if (use_external_buffer)
+ {
+ impl->set(internal_buffer.begin(), internal_buffer.size());
+ assert(working_buffer.begin() != nullptr);
+ assert(!internal_buffer.empty());
+ }
+ }
+
/// Try to read a next portion of data.
next_result = impl->next();
watch.stop();
@@ -83,13 +123,11 @@ bool ReadBufferFromS3::nextImpl()
throw;
/// Pause before next attempt.
- std::this_thread::sleep_for(sleep_time_with_backoff_milliseconds);
+ sleepForMilliseconds(sleep_time_with_backoff_milliseconds);
sleep_time_with_backoff_milliseconds *= 2;
/// Try to reinitialize `impl`.
impl.reset();
- impl = initialize();
- next_result = impl->hasPendingData();
}
}
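
The retry loop in nextImpl() now sleeps between attempts with a doubling delay and resets `impl` so the next attempt reopens the stream, instead of reinitializing inside the catch block. A generic sketch of that retry-with-exponential-backoff shape (no S3 client involved; the rethrow-on-last-attempt policy here is a simplification of the real error handling):

    #include <chrono>
    #include <cstddef>
    #include <functional>
    #include <iostream>
    #include <stdexcept>
    #include <thread>

    // Retry `attempt_read` up to `max_tries` times, doubling the pause after each
    // failure, and rethrow the last error if every attempt fails.
    bool readWithRetries(const std::function<bool()> & attempt_read, size_t max_tries)
    {
        size_t sleep_ms = 100;                        // mirrors the initial backoff above
        for (size_t attempt = 0; attempt < max_tries; ++attempt)
        {
            try
            {
                return attempt_read();                // success: stop retrying
            }
            catch (const std::exception & e)
            {
                std::cerr << "attempt " << attempt + 1 << " failed: " << e.what() << '\n';
                if (attempt + 1 == max_tries)
                    throw;                            // out of attempts
                std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
                sleep_ms *= 2;                        // exponential backoff
            }
        }
        return false;
    }

    int main()
    {
        size_t calls = 0;
        bool ok = readWithRetries([&]
        {
            if (++calls < 3)
                throw std::runtime_error("transient network error");
            return true;                              // third attempt succeeds
        }, /*max_tries=*/4);
        std::cout << (ok ? "read succeeded" : "no data") << " after " << calls << " attempts\n";
        return 0;
    }
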
@@ -127,19 +165,34 @@ off_t ReadBufferFromS3::getPosition()
std::unique_ptr ReadBufferFromS3::initialize()
{
- LOG_TRACE(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset);
-
Aws::S3::Model::GetObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
- req.SetRange(fmt::format("bytes={}-", offset));
+
+ /**
+ * If remote_filesystem_read_method = 'read_threadpool', then for MergeTree family tables
+ * exact byte ranges to read are always passed here.
+ */
+ if (read_until_position)
+ {
+ if (offset >= read_until_position)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to read beyond right offset ({} > {})", offset, read_until_position - 1);
+
+ req.SetRange(fmt::format("bytes={}-{}", offset, read_until_position - 1));
+ LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Range: {}-{}", bucket, key, offset, read_until_position - 1);
+ }
+ else
+ {
+ req.SetRange(fmt::format("bytes={}-", offset));
+ LOG_DEBUG(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, offset);
+ }
Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req);
if (outcome.IsSuccess())
{
read_result = outcome.GetResultWithOwnership();
- return std::make_unique(read_result.GetBody(), buffer_size);
+ return std::make_unique(read_result.GetBody(), read_settings.remote_fs_buffer_size);
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
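
initialize() now derives the Range header from both the current offset and the optional exclusive right bound set via read_until_position. A small helper showing the same computation and bounds check; `makeRangeHeader` is an illustrative function, not part of the patch:

    #include <cassert>
    #include <cstddef>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Build the value of an HTTP Range header: a closed byte interval.
    // `read_until_position` is exclusive (0 means "until the end of the object"),
    // matching how the patch derives "bytes=offset-(read_until_position - 1)".
    std::string makeRangeHeader(size_t offset, size_t read_until_position)
    {
        std::ostringstream range;
        if (read_until_position)
        {
            if (offset >= read_until_position)
                throw std::logic_error("attempt to read beyond the requested right bound");
            range << "bytes=" << offset << "-" << read_until_position - 1;
        }
        else
        {
            range << "bytes=" << offset << "-";       // open-ended range
        }
        return range.str();
    }

    int main()
    {
        assert(makeRangeHeader(0, 1024) == "bytes=0-1023");
        assert(makeRangeHeader(512, 0) == "bytes=512-");
        return 0;
    }
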
diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h
index e557e3bc324..d0df5c59eb3 100644
--- a/src/IO/ReadBufferFromS3.h
+++ b/src/IO/ReadBufferFromS3.h
@@ -4,12 +4,14 @@
#if USE_AWS_S3
-# include
+#include
-# include
-# include
-# include
-# include "SeekableReadBuffer.h"
+#include
+#include
+#include
+#include
+
+#include
namespace Aws::S3
{
@@ -28,7 +30,6 @@ private:
String bucket;
String key;
UInt64 max_single_read_retries;
- size_t buffer_size;
off_t offset = 0;
Aws::S3::Model::GetObjectResult read_result;
std::unique_ptr impl;
@@ -41,7 +42,9 @@ public:
const String & bucket_,
const String & key_,
UInt64 max_single_read_retries_,
- size_t buffer_size_);
+ const ReadSettings & settings_,
+ bool use_external_buffer = false,
+ size_t read_until_position_ = 0);
bool nextImpl() override;
@@ -50,6 +53,10 @@ public:
private:
std::unique_ptr initialize();
+
+ ReadSettings read_settings;
+ bool use_external_buffer;
+ off_t read_until_position = 0;
};
}
diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h
index 379b7bc2216..be328e28118 100644
--- a/src/IO/ReadSettings.h
+++ b/src/IO/ReadSettings.h
@@ -6,7 +6,7 @@
namespace DB
{
-enum class ReadMethod
+enum class LocalFSReadMethod
{
/**
* Simple synchronous reads with 'read'.
@@ -43,12 +43,20 @@ enum class ReadMethod
pread_fake_async
};
+enum class RemoteFSReadMethod
+{
+ read,
+ read_threadpool,
+};
+
class MMappedFileCache;
struct ReadSettings
{
/// Method to use reading from local filesystem.
- ReadMethod local_fs_method = ReadMethod::pread;
+ LocalFSReadMethod local_fs_method = LocalFSReadMethod::pread;
+ /// Method to use reading from remote filesystem.
+ RemoteFSReadMethod remote_fs_method = RemoteFSReadMethod::read;
size_t local_fs_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
size_t remote_fs_buffer_size = DBMS_DEFAULT_BUFFER_SIZE;
@@ -66,8 +74,14 @@ struct ReadSettings
/// For 'pread_threadpool' method. A lower value means a higher priority.
size_t priority = 0;
- size_t remote_fs_backoff_threshold = 10000;
- size_t remote_fs_backoff_max_tries = 4;
+ size_t remote_fs_read_max_backoff_ms = 10000;
+ size_t remote_fs_read_backoff_max_tries = 4;
+
+ /// Set to true for MergeTree tables to make sure
+ /// that the last position (offset in the compressed file) is always passed,
+ /// otherwise asynchronous reading from a remote fs is not efficient.
+ /// If reading is done without the final position set, a LOGICAL_ERROR is thrown.
+ bool must_read_until_position = false;
ReadSettings adjustBufferSize(size_t file_size) const
{
diff --git a/src/IO/ReadWriteBufferFromHTTP.h b/src/IO/ReadWriteBufferFromHTTP.h
index 55fcdca89b3..5254b140679 100644
--- a/src/IO/ReadWriteBufferFromHTTP.h
+++ b/src/IO/ReadWriteBufferFromHTTP.h
@@ -2,10 +2,12 @@
#include