diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index b0380b939bb..c8c6ba30b0b 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -273,5 +273,5 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} python3 merge_pr.py diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 91dcb6a4968..f5c78a6b6a1 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -173,4 +173,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index c8b2452829b..cfa01b0e8f3 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -99,7 +99,7 @@ jobs: ################################# Stage Final ################################# # FinishCheck: - if: ${{ !failure() && !cancelled() }} + if: ${{ !cancelled() }} needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Tests_1] runs-on: [self-hosted, style-checker-aarch64] steps: @@ -112,4 +112,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py ${{ (contains(needs.*.result, 'failure') && github.event_name == 'merge_group') && '--pipeline-failure' || '' }} + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index e4deaf9f35e..66ca3381a40 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -191,7 +191,7 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} ############################################################################################# ###################################### JEPSEN TESTS ######################################### diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 4d45c8d8d4b..f9b8a4fa764 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -496,4 +496,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index e4fc9f0b1d3..2aa7694bc41 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -46,9 +46,10 @@ jobs: ./utils/list-versions/list-versions.sh > ./utils/list-versions/version_date.tsv ./utils/list-versions/update-docker-version.sh GID=$(id -g "${UID}") - docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \ + # --network=host and CI=1 are required for the S3 access from a container + docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 -e CI=1 --network=host \ --volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \ - /ClickHouse/utils/changelog/changelog.py -v --debug-helpers \ + /ClickHouse/tests/ci/changelog.py -v --debug-helpers \ 
--gh-user-or-token="$GITHUB_TOKEN" --jobs=5 \ --output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" "${GITHUB_TAG}" git add "./docs/changelogs/${GITHUB_TAG}.md" diff --git a/.gitmodules b/.gitmodules index 6d64c52ce00..12d865307d8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -91,13 +91,13 @@ [submodule "contrib/aws"] path = contrib/aws url = https://github.com/ClickHouse/aws-sdk-cpp -[submodule "aws-c-event-stream"] +[submodule "contrib/aws-c-event-stream"] path = contrib/aws-c-event-stream url = https://github.com/awslabs/aws-c-event-stream -[submodule "aws-c-common"] +[submodule "contrib/aws-c-common"] path = contrib/aws-c-common url = https://github.com/awslabs/aws-c-common.git -[submodule "aws-checksums"] +[submodule "contrib/aws-checksums"] path = contrib/aws-checksums url = https://github.com/awslabs/aws-checksums [submodule "contrib/curl"] @@ -161,7 +161,7 @@ [submodule "contrib/xz"] path = contrib/xz url = https://github.com/xz-mirror/xz -[submodule "abseil"] +[submodule "contrib/abseil-cpp"] path = contrib/abseil-cpp url = https://github.com/ClickHouse/abseil-cpp.git [submodule "contrib/dragonbox"] diff --git a/CMakeLists.txt b/CMakeLists.txt index 455adc24182..c4f093b1c99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -399,7 +399,7 @@ option (ENABLE_GWP_ASAN "Enable Gwp-Asan" ON) # but GWP-ASan also wants to use mmap frequently, # and due to a large number of memory mappings, # it does not work together well. -if ((NOT OS_LINUX AND NOT OS_ANDROID) OR (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")) +if ((NOT OS_LINUX AND NOT OS_ANDROID) OR (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") OR SANITIZE) set(ENABLE_GWP_ASAN OFF) endif () diff --git a/contrib/aws b/contrib/aws index deeaa9e7c5f..1c2946bfcb7 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit deeaa9e7c5fe690e3dacc4005d7ecfa7a66a32bb +Subproject commit 1c2946bfcb7f1e3ae0a858de0b59d4f1a7b4ccaf diff --git a/contrib/openssl b/contrib/openssl index f7b8721dfc6..67c0b63e578 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit f7b8721dfc66abb147f24ca07b9c9d1d64f40f71 +Subproject commit 67c0b63e578e4c751ac9edf490f5a96124fff8dc diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index b3271d94184..24f38740ff5 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 3f3b880c8f3..c71319a2a7e 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 5fd22ee9b51..ed8cf3d657d 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] 
https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" #docker-official-library:off diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index b8f967ed9c2..6191aeaf304 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -208,6 +208,7 @@ handle SIGPIPE nostop noprint pass handle SIGTERM nostop noprint pass handle SIGUSR1 nostop noprint pass handle SIGUSR2 nostop noprint pass +handle SIGSEGV nostop pass handle SIG$RTMIN nostop noprint pass info signals continue diff --git a/docker/test/stateless/attach_gdb.lib b/docker/test/stateless/attach_gdb.lib index d288288bb17..eb54f920b98 100644 --- a/docker/test/stateless/attach_gdb.lib +++ b/docker/test/stateless/attach_gdb.lib @@ -20,6 +20,7 @@ handle SIGPIPE nostop noprint pass handle SIGTERM nostop noprint pass handle SIGUSR1 nostop noprint pass handle SIGUSR2 nostop noprint pass +handle SIGSEGV nostop pass handle SIG$RTMIN nostop noprint pass info signals continue diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 91768c8328d..6ad03852b66 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -10,14 +10,15 @@ RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ aspell \ curl \ - git \ - gh \ file \ + gh \ + git \ libxml2-utils \ + locales \ moreutils \ python3-pip \ yamllint \ - locales \ + zstd \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* @@ -33,6 +34,7 @@ RUN pip3 install \ flake8==4.0.1 \ requests \ thefuzz \ + tqdm==4.66.4 \ types-requests \ unidiff \ && rm -rf /root/.cache/pip diff --git a/docs/changelogs/v24.1.6.52-stable.md b/docs/changelogs/v24.1.6.52-stable.md new file mode 100644 index 00000000000..341561e9a64 --- /dev/null +++ b/docs/changelogs/v24.1.6.52-stable.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.6.52-stable (fa09f677bc9) FIXME as compared to v24.1.5.6-stable (7f67181ff31) + +#### Improvement +* Backported in [#60292](https://github.com/ClickHouse/ClickHouse/issues/60292): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60832](https://github.com/ClickHouse/ClickHouse/issues/60832): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#60413](https://github.com/ClickHouse/ClickHouse/issues/60413): Fix segmentation fault in KQL parser when the input query exceeds the `max_query_size`. Also re-enable the KQL dialect. Fixes [#59036](https://github.com/ClickHouse/ClickHouse/issues/59036) and [#59037](https://github.com/ClickHouse/ClickHouse/issues/59037). [#59626](https://github.com/ClickHouse/ClickHouse/pull/59626) ([Yong Wang](https://github.com/kashwy)). +* Backported in [#60074](https://github.com/ClickHouse/ClickHouse/issues/60074): Fix error `Read beyond last offset` for `AsynchronousBoundedReadBuffer`. 
[#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)). +* Backported in [#60299](https://github.com/ClickHouse/ClickHouse/issues/60299): Fix having neither acked nor nacked messages. If an exception happens during the read-write phase, messages will be nacked. [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#60066](https://github.com/ClickHouse/ClickHouse/issues/60066): Fix optimize_uniq_to_count removing the column alias. [#60026](https://github.com/ClickHouse/ClickHouse/pull/60026) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60638](https://github.com/ClickHouse/ClickHouse/issues/60638): Fixed a bug in parallel optimization for queries with `FINAL`, which could give an incorrect result in rare cases. [#60041](https://github.com/ClickHouse/ClickHouse/pull/60041) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60177](https://github.com/ClickHouse/ClickHouse/issues/60177): Fix cosineDistance crash with Nullable. [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60279](https://github.com/ClickHouse/ClickHouse/issues/60279): Hide sensitive info for `S3Queue` table engine. [#60233](https://github.com/ClickHouse/ClickHouse/pull/60233) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#61000](https://github.com/ClickHouse/ClickHouse/issues/61000): Reduce the number of read rows from `system.numbers`. Fixes [#59418](https://github.com/ClickHouse/ClickHouse/issues/59418). [#60546](https://github.com/ClickHouse/ClickHouse/pull/60546) ([JackyWoo](https://github.com/JackyWoo)). +* Backported in [#60791](https://github.com/ClickHouse/ClickHouse/issues/60791): Fix buffer overflow that can happen if the attacker asks the HTTP server to decompress data with a composition of codecs and size triggering numeric overflow. Fix buffer overflow that can happen inside codec NONE on wrong input data. This was submitted by TIANGONG research team through our [Bug Bounty program](https://github.com/ClickHouse/ClickHouse/issues/38986). [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60783](https://github.com/ClickHouse/ClickHouse/issues/60783): Functions for SQL/JSON were able to read uninitialized memory. This closes [#60017](https://github.com/ClickHouse/ClickHouse/issues/60017). Found by Fuzzer. [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60803](https://github.com/ClickHouse/ClickHouse/issues/60803): Do not set aws custom metadata `x-amz-meta-*` headers on UploadPart & CompleteMultipartUpload calls. [#60748](https://github.com/ClickHouse/ClickHouse/pull/60748) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)). +* Backported in [#60820](https://github.com/ClickHouse/ClickHouse/issues/60820): Fix crash in arrayEnumerateRanked. [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60841](https://github.com/ClickHouse/ClickHouse/issues/60841): Fix crash when using input() in INSERT SELECT JOIN. Closes [#60035](https://github.com/ClickHouse/ClickHouse/issues/60035). [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)). 
+* Backported in [#60904](https://github.com/ClickHouse/ClickHouse/issues/60904): Avoid segfault if too many keys are skipped when reading from S3. [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)). + +#### NO CL CATEGORY + +* Backported in [#60186](https://github.com/ClickHouse/ClickHouse/issues/60186):. [#60181](https://github.com/ClickHouse/ClickHouse/pull/60181) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#60333](https://github.com/ClickHouse/ClickHouse/issues/60333): CI: Fix job failures due to jepsen artifacts. [#59890](https://github.com/ClickHouse/ClickHouse/pull/59890) ([Max K.](https://github.com/maxknv)). +* Backported in [#60034](https://github.com/ClickHouse/ClickHouse/issues/60034): Fix mark release ready. [#59994](https://github.com/ClickHouse/ClickHouse/pull/59994) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60326](https://github.com/ClickHouse/ClickHouse/issues/60326): Ability to detect undead ZooKeeper sessions. [#60044](https://github.com/ClickHouse/ClickHouse/pull/60044) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#60363](https://github.com/ClickHouse/ClickHouse/issues/60363): CI: hot fix for gh statuses. [#60201](https://github.com/ClickHouse/ClickHouse/pull/60201) ([Max K.](https://github.com/maxknv)). +* Backported in [#60648](https://github.com/ClickHouse/ClickHouse/issues/60648): Detect io_uring in tests. [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#60569](https://github.com/ClickHouse/ClickHouse/issues/60569): Remove broken test while we fix it. [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60756](https://github.com/ClickHouse/ClickHouse/issues/60756): Update shellcheck. [#60553](https://github.com/ClickHouse/ClickHouse/pull/60553) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60584](https://github.com/ClickHouse/ClickHouse/issues/60584): CI: fix docker build job name. [#60554](https://github.com/ClickHouse/ClickHouse/pull/60554) ([Max K.](https://github.com/maxknv)). + diff --git a/docs/changelogs/v24.3.4.147-lts.md b/docs/changelogs/v24.3.4.147-lts.md new file mode 100644 index 00000000000..7d77fb29977 --- /dev/null +++ b/docs/changelogs/v24.3.4.147-lts.md @@ -0,0 +1,100 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.3.4.147-lts (31a7bdc346d) FIXME as compared to v24.3.3.102-lts (7e7f3bdd9be) + +#### Improvement +* Backported in [#63465](https://github.com/ClickHouse/ClickHouse/issues/63465): Make rabbitmq nack broken messages. Closes [#45350](https://github.com/ClickHouse/ClickHouse/issues/45350). [#60312](https://github.com/ClickHouse/ClickHouse/pull/60312) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#64290](https://github.com/ClickHouse/ClickHouse/issues/64290): Fix logical-error when undoing quorum insert transaction. [#61953](https://github.com/ClickHouse/ClickHouse/pull/61953) ([Han Fei](https://github.com/hanfei1991)). + +#### Build/Testing/Packaging Improvement +* Backported in [#63610](https://github.com/ClickHouse/ClickHouse/issues/63610): The Dockerfile is reviewed by the docker official library in https://github.com/docker-library/official-images/pull/15846. 
[#63400](https://github.com/ClickHouse/ClickHouse/pull/63400) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#65128](https://github.com/ClickHouse/ClickHouse/issues/65128): Decrease the `unit-test` image a few times. [#65102](https://github.com/ClickHouse/ClickHouse/pull/65102) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#64277](https://github.com/ClickHouse/ClickHouse/issues/64277): Fix queries with FINAL give wrong result when table does not use adaptive granularity. [#62432](https://github.com/ClickHouse/ClickHouse/pull/62432) ([Duc Canh Le](https://github.com/canhld94)). +* Backported in [#63716](https://github.com/ClickHouse/ClickHouse/issues/63716): Fix excessive memory usage for queries with nested lambdas. Fixes [#62036](https://github.com/ClickHouse/ClickHouse/issues/62036). [#62462](https://github.com/ClickHouse/ClickHouse/pull/62462) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63247](https://github.com/ClickHouse/ClickHouse/issues/63247): Fix size checks when updating materialized nested columns ( fixes [#62731](https://github.com/ClickHouse/ClickHouse/issues/62731) ). [#62773](https://github.com/ClickHouse/ClickHouse/pull/62773) ([Eliot Hautefeuille](https://github.com/hileef)). +* Backported in [#62984](https://github.com/ClickHouse/ClickHouse/issues/62984): Fix the `Unexpected return type` error for queries that read from `StorageBuffer` with `PREWHERE` when the source table has different types. Fixes [#62545](https://github.com/ClickHouse/ClickHouse/issues/62545). [#62916](https://github.com/ClickHouse/ClickHouse/pull/62916) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63185](https://github.com/ClickHouse/ClickHouse/issues/63185): Sanity check: Clamp values instead of throwing. [#63119](https://github.com/ClickHouse/ClickHouse/pull/63119) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#63293](https://github.com/ClickHouse/ClickHouse/issues/63293): Fix crash with untuple and unresolved lambda. [#63131](https://github.com/ClickHouse/ClickHouse/pull/63131) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#63411](https://github.com/ClickHouse/ClickHouse/issues/63411): Fix a misbehavior when SQL security defaults don't load for old tables during server startup. [#63209](https://github.com/ClickHouse/ClickHouse/pull/63209) ([pufit](https://github.com/pufit)). +* Backported in [#63616](https://github.com/ClickHouse/ClickHouse/issues/63616): Fix bug which could potentially lead to rare LOGICAL_ERROR during SELECT query with message: `Unexpected return type from materialize. Expected type_XXX. Got type_YYY.` Introduced in [#59379](https://github.com/ClickHouse/ClickHouse/issues/59379). [#63353](https://github.com/ClickHouse/ClickHouse/pull/63353) ([alesapin](https://github.com/alesapin)). +* Backported in [#63455](https://github.com/ClickHouse/ClickHouse/issues/63455): Fix `X-ClickHouse-Timezone` header returning wrong timezone when using `session_timezone` as query level setting. [#63377](https://github.com/ClickHouse/ClickHouse/pull/63377) ([Andrey Zvonov](https://github.com/zvonand)). +* Backported in [#63603](https://github.com/ClickHouse/ClickHouse/issues/63603): Fix backup of projection part in case projection was removed from table metadata, but part still has projection. 
[#63426](https://github.com/ClickHouse/ClickHouse/pull/63426) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#63508](https://github.com/ClickHouse/ClickHouse/issues/63508): Fix 'Every derived table must have its own alias' error for MYSQL dictionary source, close [#63341](https://github.com/ClickHouse/ClickHouse/issues/63341). [#63481](https://github.com/ClickHouse/ClickHouse/pull/63481) ([vdimir](https://github.com/vdimir)). +* Backported in [#63595](https://github.com/ClickHouse/ClickHouse/issues/63595): Avoid segfault in `MergeTreePrefetchedReadPool` while fetching projection parts. [#63513](https://github.com/ClickHouse/ClickHouse/pull/63513) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63748](https://github.com/ClickHouse/ClickHouse/issues/63748): Read only the necessary columns from VIEW (new analyzer). Closes [#62594](https://github.com/ClickHouse/ClickHouse/issues/62594). [#63688](https://github.com/ClickHouse/ClickHouse/pull/63688) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#63770](https://github.com/ClickHouse/ClickHouse/issues/63770): Fix [#63539](https://github.com/ClickHouse/ClickHouse/issues/63539). Forbid WINDOW redefinition in new analyzer. [#63694](https://github.com/ClickHouse/ClickHouse/pull/63694) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64189](https://github.com/ClickHouse/ClickHouse/issues/64189): Fix `Not found column` and `CAST AS Map from array requires nested tuple of 2 elements` exceptions for distributed queries which use `Map(Nothing, Nothing)` type. Fixes [#63637](https://github.com/ClickHouse/ClickHouse/issues/63637). [#63753](https://github.com/ClickHouse/ClickHouse/pull/63753) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63845](https://github.com/ClickHouse/ClickHouse/issues/63845): Fix possible `ILLEGAL_COLUMN` error in `partial_merge` join, close [#37928](https://github.com/ClickHouse/ClickHouse/issues/37928). [#63755](https://github.com/ClickHouse/ClickHouse/pull/63755) ([vdimir](https://github.com/vdimir)). +* Backported in [#63906](https://github.com/ClickHouse/ClickHouse/issues/63906): `query_plan_remove_redundant_distinct` can break queries with WINDOW FUNCTIONS (when `allow_experimental_analyzer` is on). Fixes [#62820](https://github.com/ClickHouse/ClickHouse/issues/62820). [#63776](https://github.com/ClickHouse/ClickHouse/pull/63776) ([Igor Nikonov](https://github.com/devcrafter)). +* Backported in [#63989](https://github.com/ClickHouse/ClickHouse/issues/63989): Fix incorrect select query result when parallel replicas were used to read from a Materialized View. [#63861](https://github.com/ClickHouse/ClickHouse/pull/63861) ([Nikita Taranov](https://github.com/nickitat)). +* Backported in [#64031](https://github.com/ClickHouse/ClickHouse/issues/64031): Fix an error `Database name is empty` for remote queries with lambdas over the cluster with modified default database. Fixes [#63471](https://github.com/ClickHouse/ClickHouse/issues/63471). [#63864](https://github.com/ClickHouse/ClickHouse/pull/63864) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64559](https://github.com/ClickHouse/ClickHouse/issues/64559): Fix SIGSEGV due to CPU/Real (`query_profiler_real_time_period_ns`/`query_profiler_cpu_time_period_ns`) profiler (has been an issue since 2022, that leads to periodic server crashes, especially if you were using distributed engine). 
[#63865](https://github.com/ClickHouse/ClickHouse/pull/63865) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#64009](https://github.com/ClickHouse/ClickHouse/issues/64009): Fix analyzer - IN function with arbitrary deep sub-selects in materialized view to use insertion block. [#63930](https://github.com/ClickHouse/ClickHouse/pull/63930) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64236](https://github.com/ClickHouse/ClickHouse/issues/64236): Fix resolve of unqualified COLUMNS matcher. Preserve the input columns order and forbid usage of unknown identifiers. [#63962](https://github.com/ClickHouse/ClickHouse/pull/63962) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64106](https://github.com/ClickHouse/ClickHouse/issues/64106): Deserialize untrusted binary inputs in a safer way. [#64024](https://github.com/ClickHouse/ClickHouse/pull/64024) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64168](https://github.com/ClickHouse/ClickHouse/issues/64168): Add missing settings to recoverLostReplica. [#64040](https://github.com/ClickHouse/ClickHouse/pull/64040) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64320](https://github.com/ClickHouse/ClickHouse/issues/64320): This fix will use a proper redefined context with the correct definer for each individual view in the query pipeline Closes [#63777](https://github.com/ClickHouse/ClickHouse/issues/63777). [#64079](https://github.com/ClickHouse/ClickHouse/pull/64079) ([pufit](https://github.com/pufit)). +* Backported in [#64380](https://github.com/ClickHouse/ClickHouse/issues/64380): Fix analyzer: "Not found column" error is fixed when using INTERPOLATE. [#64096](https://github.com/ClickHouse/ClickHouse/pull/64096) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64567](https://github.com/ClickHouse/ClickHouse/issues/64567): Fix creating backups to S3 buckets with different credentials from the disk containing the file. [#64153](https://github.com/ClickHouse/ClickHouse/pull/64153) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64270](https://github.com/ClickHouse/ClickHouse/issues/64270): Prevent LOGICAL_ERROR on CREATE TABLE as MaterializedView. [#64174](https://github.com/ClickHouse/ClickHouse/pull/64174) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64339](https://github.com/ClickHouse/ClickHouse/issues/64339): The query cache now considers two identical queries against different databases as different. The previous behavior could be used to bypass missing privileges to read from a table. [#64199](https://github.com/ClickHouse/ClickHouse/pull/64199) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64259](https://github.com/ClickHouse/ClickHouse/issues/64259): Ignore `text_log` config when using Keeper. [#64218](https://github.com/ClickHouse/ClickHouse/pull/64218) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64688](https://github.com/ClickHouse/ClickHouse/issues/64688): Fix Query Tree size validation. Closes [#63701](https://github.com/ClickHouse/ClickHouse/issues/63701). [#64377](https://github.com/ClickHouse/ClickHouse/pull/64377) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64725](https://github.com/ClickHouse/ClickHouse/issues/64725): Fixed `CREATE TABLE AS` queries for tables with default expressions. [#64455](https://github.com/ClickHouse/ClickHouse/pull/64455) ([Anton Popov](https://github.com/CurtizJ)). 
+* Backported in [#64621](https://github.com/ClickHouse/ClickHouse/issues/64621): Fix an error `Cannot find column` in distributed queries with constant CTE in the `GROUP BY` key. [#64519](https://github.com/ClickHouse/ClickHouse/pull/64519) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64678](https://github.com/ClickHouse/ClickHouse/issues/64678): Fix [#64612](https://github.com/ClickHouse/ClickHouse/issues/64612). Do not rewrite aggregation if `-If` combinator is already used. [#64638](https://github.com/ClickHouse/ClickHouse/pull/64638) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64831](https://github.com/ClickHouse/ClickHouse/issues/64831): Fix bug which could lead to non-working TTLs with expressions. Fixes [#63700](https://github.com/ClickHouse/ClickHouse/issues/63700). [#64694](https://github.com/ClickHouse/ClickHouse/pull/64694) ([alesapin](https://github.com/alesapin)). +* Backported in [#64940](https://github.com/ClickHouse/ClickHouse/issues/64940): Fix OrderByLimitByDuplicateEliminationVisitor across subqueries. [#64766](https://github.com/ClickHouse/ClickHouse/pull/64766) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64869](https://github.com/ClickHouse/ClickHouse/issues/64869): Fixed possible incorrect memory tracking in several kinds of queries: queries that read any data from S3, queries via http protocol, asynchronous inserts. [#64844](https://github.com/ClickHouse/ClickHouse/pull/64844) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#64980](https://github.com/ClickHouse/ClickHouse/issues/64980): Fix the `Block structure mismatch` error for queries reading with `PREWHERE` from the materialized view when the materialized view has columns of different types than the source table. Fixes [#64611](https://github.com/ClickHouse/ClickHouse/issues/64611). [#64855](https://github.com/ClickHouse/ClickHouse/pull/64855) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64972](https://github.com/ClickHouse/ClickHouse/issues/64972): Fix rare crash when table has TTL with subquery + database replicated + parallel replicas + analyzer. It's really rare, but please don't use TTLs with subqueries. [#64858](https://github.com/ClickHouse/ClickHouse/pull/64858) ([alesapin](https://github.com/alesapin)). +* Backported in [#65070](https://github.com/ClickHouse/ClickHouse/issues/65070): Fix `ALTER MODIFY COMMENT` query that was broken for parameterized VIEWs in https://github.com/ClickHouse/ClickHouse/pull/54211. [#65031](https://github.com/ClickHouse/ClickHouse/pull/65031) ([Nikolay Degterinsky](https://github.com/evillique)). +* Backported in [#65175](https://github.com/ClickHouse/ClickHouse/issues/65175): Fix the `Unknown expression identifier` error for remote queries with `INTERPOLATE (alias)` (new analyzer). Fixes [#64636](https://github.com/ClickHouse/ClickHouse/issues/64636). [#65090](https://github.com/ClickHouse/ClickHouse/pull/65090) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### Critical Bug Fix (crash, LOGICAL_ERROR, data loss, RBAC) + +* Backported in [#64587](https://github.com/ClickHouse/ClickHouse/issues/64587): Disabled `enable_vertical_final` setting by default. This feature should not be used because it has a bug: [#64543](https://github.com/ClickHouse/ClickHouse/issues/64543). [#64544](https://github.com/ClickHouse/ClickHouse/pull/64544) ([Alexander Tokmakov](https://github.com/tavplubix)). 
+* Backported in [#64878](https://github.com/ClickHouse/ClickHouse/issues/64878): This PR fixes an error when a user in a specific situation can escalate their privileges on the default database without necessary grants. [#64769](https://github.com/ClickHouse/ClickHouse/pull/64769) ([pufit](https://github.com/pufit)). + +#### NO CL CATEGORY + +* Backported in [#63304](https://github.com/ClickHouse/ClickHouse/issues/63304):. [#63297](https://github.com/ClickHouse/ClickHouse/pull/63297) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#63708](https://github.com/ClickHouse/ClickHouse/issues/63708):. [#63415](https://github.com/ClickHouse/ClickHouse/pull/63415) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Backport [#64363](https://github.com/ClickHouse/ClickHouse/issues/64363) to 24.3: Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts"'. [#64907](https://github.com/ClickHouse/ClickHouse/pull/64907) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#63751](https://github.com/ClickHouse/ClickHouse/issues/63751): group_by_use_nulls strikes back. [#62922](https://github.com/ClickHouse/ClickHouse/pull/62922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63558](https://github.com/ClickHouse/ClickHouse/issues/63558): Try fix segfault in `MergeTreeReadPoolBase::createTask`. [#63323](https://github.com/ClickHouse/ClickHouse/pull/63323) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63336](https://github.com/ClickHouse/ClickHouse/issues/63336): The commit url has different pattern. [#63331](https://github.com/ClickHouse/ClickHouse/pull/63331) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#63374](https://github.com/ClickHouse/ClickHouse/issues/63374): Add tags for the test 03000_traverse_shadow_system_data_paths.sql to make it stable. [#63366](https://github.com/ClickHouse/ClickHouse/pull/63366) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Backported in [#63625](https://github.com/ClickHouse/ClickHouse/issues/63625): Workaround for `oklch()` inside canvas bug for firefox. [#63404](https://github.com/ClickHouse/ClickHouse/pull/63404) ([Sergei Trifonov](https://github.com/serxa)). +* Backported in [#63569](https://github.com/ClickHouse/ClickHouse/issues/63569): Add `jwcrypto` to integration tests runner. [#63551](https://github.com/ClickHouse/ClickHouse/pull/63551) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#63649](https://github.com/ClickHouse/ClickHouse/issues/63649): Fix `02362_part_log_merge_algorithm` flaky test. [#63635](https://github.com/ClickHouse/ClickHouse/pull/63635) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Backported in [#63762](https://github.com/ClickHouse/ClickHouse/issues/63762): Cancel S3 reads properly when parallel reads are used. [#63687](https://github.com/ClickHouse/ClickHouse/pull/63687) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63741](https://github.com/ClickHouse/ClickHouse/issues/63741): Userspace page cache: don't collect stats if cache is unused. [#63730](https://github.com/ClickHouse/ClickHouse/pull/63730) ([Michael Kolupaev](https://github.com/al13n321)). +* Backported in [#63826](https://github.com/ClickHouse/ClickHouse/issues/63826): Fix `test_odbc_interaction` for arm64 on linux. 
[#63787](https://github.com/ClickHouse/ClickHouse/pull/63787) ([alesapin](https://github.com/alesapin)). +* Backported in [#63895](https://github.com/ClickHouse/ClickHouse/issues/63895): Fix `test_catboost_evaluate` for aarch64. [#63789](https://github.com/ClickHouse/ClickHouse/pull/63789) ([alesapin](https://github.com/alesapin)). +* Backported in [#63887](https://github.com/ClickHouse/ClickHouse/issues/63887): Fix `test_disk_types` for aarch64. [#63832](https://github.com/ClickHouse/ClickHouse/pull/63832) ([alesapin](https://github.com/alesapin)). +* Backported in [#63879](https://github.com/ClickHouse/ClickHouse/issues/63879): Fix `test_short_strings_aggregation` for arm. [#63836](https://github.com/ClickHouse/ClickHouse/pull/63836) ([alesapin](https://github.com/alesapin)). +* Backported in [#63916](https://github.com/ClickHouse/ClickHouse/issues/63916): Disable `test_non_default_compression/test.py::test_preconfigured_deflateqpl_codec` on arm. [#63839](https://github.com/ClickHouse/ClickHouse/pull/63839) ([alesapin](https://github.com/alesapin)). +* Backported in [#63969](https://github.com/ClickHouse/ClickHouse/issues/63969): fix 02124_insert_deduplication_token_multiple_blocks. [#63950](https://github.com/ClickHouse/ClickHouse/pull/63950) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#64047](https://github.com/ClickHouse/ClickHouse/issues/64047): Do not create new release in release branch automatically. [#64039](https://github.com/ClickHouse/ClickHouse/pull/64039) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64076](https://github.com/ClickHouse/ClickHouse/issues/64076): Files without shebang have mime 'text/plain' or 'inode/x-empty'. [#64062](https://github.com/ClickHouse/ClickHouse/pull/64062) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64142](https://github.com/ClickHouse/ClickHouse/issues/64142): Fix sanitizers. [#64090](https://github.com/ClickHouse/ClickHouse/pull/64090) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#64159](https://github.com/ClickHouse/ClickHouse/issues/64159): Add retries in `git submodule update`. [#64125](https://github.com/ClickHouse/ClickHouse/pull/64125) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#64473](https://github.com/ClickHouse/ClickHouse/issues/64473): Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts. [#64363](https://github.com/ClickHouse/ClickHouse/pull/64363) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#65113](https://github.com/ClickHouse/ClickHouse/issues/65113): Adjust the `version_helper` and script to a new release scheme. [#64759](https://github.com/ClickHouse/ClickHouse/pull/64759) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64999](https://github.com/ClickHouse/ClickHouse/issues/64999): Fix crash with DISTINCT and window functions. [#64767](https://github.com/ClickHouse/ClickHouse/pull/64767) ([Igor Nikonov](https://github.com/devcrafter)). 
+ diff --git a/docs/changelogs/v24.5.2.34-stable.md b/docs/changelogs/v24.5.2.34-stable.md new file mode 100644 index 00000000000..2db05a5f5dc --- /dev/null +++ b/docs/changelogs/v24.5.2.34-stable.md @@ -0,0 +1,38 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.5.2.34-stable (45589aeee49) FIXME as compared to v24.5.1.1763-stable (647c154a94d) + +#### Improvement +* Backported in [#65096](https://github.com/ClickHouse/ClickHouse/issues/65096): The setting `allow_experimental_join_condition` was accidentally marked as important which may prevent distributed queries in a mixed versions cluster from being executed successfully. [#65008](https://github.com/ClickHouse/ClickHouse/pull/65008) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#65132](https://github.com/ClickHouse/ClickHouse/issues/65132): Decrease the `unit-test` image a few times. [#65102](https://github.com/ClickHouse/ClickHouse/pull/65102) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#64729](https://github.com/ClickHouse/ClickHouse/issues/64729): Fixed `CREATE TABLE AS` queries for tables with default expressions. [#64455](https://github.com/ClickHouse/ClickHouse/pull/64455) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#65061](https://github.com/ClickHouse/ClickHouse/issues/65061): Fix the `Expression nodes list expected 1 projection names` and `Unknown expression or identifier` errors for queries with aliases to `GLOBAL IN`. Fixes [#64445](https://github.com/ClickHouse/ClickHouse/issues/64445). [#64517](https://github.com/ClickHouse/ClickHouse/pull/64517) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#65088](https://github.com/ClickHouse/ClickHouse/issues/65088): Fix removing the `WHERE` and `PREWHERE` expressions, which are always true (for the new analyzer). Fixes [#64575](https://github.com/ClickHouse/ClickHouse/issues/64575). [#64695](https://github.com/ClickHouse/ClickHouse/pull/64695) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64944](https://github.com/ClickHouse/ClickHouse/issues/64944): Fix OrderByLimitByDuplicateEliminationVisitor across subqueries. [#64766](https://github.com/ClickHouse/ClickHouse/pull/64766) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64873](https://github.com/ClickHouse/ClickHouse/issues/64873): Fixed possible incorrect memory tracking in several kinds of queries: queries that read any data from S3, queries via http protocol, asynchronous inserts. [#64844](https://github.com/ClickHouse/ClickHouse/pull/64844) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#64984](https://github.com/ClickHouse/ClickHouse/issues/64984): Fix the `Block structure mismatch` error for queries reading with `PREWHERE` from the materialized view when the materialized view has columns of different types than the source table. Fixes [#64611](https://github.com/ClickHouse/ClickHouse/issues/64611). [#64855](https://github.com/ClickHouse/ClickHouse/pull/64855) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64976](https://github.com/ClickHouse/ClickHouse/issues/64976): Fix rare crash when table has TTL with subquery + database replicated + parallel replicas + analyzer. It's really rare, but please don't use TTLs with subqueries. 
[#64858](https://github.com/ClickHouse/ClickHouse/pull/64858) ([alesapin](https://github.com/alesapin)). +* Backported in [#65074](https://github.com/ClickHouse/ClickHouse/issues/65074): Fix `ALTER MODIFY COMMENT` query that was broken for parameterized VIEWs in https://github.com/ClickHouse/ClickHouse/pull/54211. [#65031](https://github.com/ClickHouse/ClickHouse/pull/65031) ([Nikolay Degterinsky](https://github.com/evillique)). +* Backported in [#65179](https://github.com/ClickHouse/ClickHouse/issues/65179): Fix the `Unknown expression identifier` error for remote queries with `INTERPOLATE (alias)` (new analyzer). Fixes [#64636](https://github.com/ClickHouse/ClickHouse/issues/64636). [#65090](https://github.com/ClickHouse/ClickHouse/pull/65090) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#65163](https://github.com/ClickHouse/ClickHouse/issues/65163): Fix pushing arithmetic operations out of aggregation. In the new analyzer, optimization was applied only once. Part of [#62245](https://github.com/ClickHouse/ClickHouse/issues/62245). [#65104](https://github.com/ClickHouse/ClickHouse/pull/65104) ([Dmitry Novik](https://github.com/novikd)). + +#### Critical Bug Fix (crash, LOGICAL_ERROR, data loss, RBAC) + +* Backported in [#64882](https://github.com/ClickHouse/ClickHouse/issues/64882): This PR fixes an error when a user in a specific situation can escalate their privileges on the default database without necessary grants. [#64769](https://github.com/ClickHouse/ClickHouse/pull/64769) ([pufit](https://github.com/pufit)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#65002](https://github.com/ClickHouse/ClickHouse/issues/65002): Be more graceful with existing tables with `inverted` indexes. [#64656](https://github.com/ClickHouse/ClickHouse/pull/64656) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#65115](https://github.com/ClickHouse/ClickHouse/issues/65115): Adjust the `version_helper` and script to a new release scheme. [#64759](https://github.com/ClickHouse/ClickHouse/pull/64759) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64796](https://github.com/ClickHouse/ClickHouse/issues/64796): Fix crash with DISTINCT and window functions. [#64767](https://github.com/ClickHouse/ClickHouse/pull/64767) ([Igor Nikonov](https://github.com/devcrafter)). + diff --git a/docs/changelogs/v24.5.3.5-stable.md b/docs/changelogs/v24.5.3.5-stable.md new file mode 100644 index 00000000000..4606e58d0a4 --- /dev/null +++ b/docs/changelogs/v24.5.3.5-stable.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.5.3.5-stable (e0eb66f8e17) FIXME as compared to v24.5.2.34-stable (45589aeee49) + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#65227](https://github.com/ClickHouse/ClickHouse/issues/65227): Capture weak_ptr of ContextAccess for safety. [#65051](https://github.com/ClickHouse/ClickHouse/pull/65051) ([Alexander Gololobov](https://github.com/davenger)). +* Backported in [#65219](https://github.com/ClickHouse/ClickHouse/issues/65219): Fix false positives leaky memory warnings in OpenSSL. [#65125](https://github.com/ClickHouse/ClickHouse/pull/65125) ([Robert Schulze](https://github.com/rschu1ze)). 
+ diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index bbc7dac0a2a..8dff6f0ed1d 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -229,6 +229,10 @@ For production builds, clang is used, but we also test make gcc builds. For deve ## Sanitizers {#sanitizers} +:::note +If the process (ClickHouse server or client) crashes at startup when running it locally, you might need to disable address space layout randomization: `sudo sysctl kernel.randomize_va_space=0` +::: + ### Address sanitizer We run functional, integration, stress and unit tests under ASan on per-commit basis. diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index f930fab1805..0958680dc56 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -75,7 +75,7 @@ Possible values: - unordered — With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKeeper. - ordered — With ordered mode, only the max name of the successfully consumed file, and the names of files that will be retried after unsuccessful loading attempt are being stored in ZooKeeper. -Default value: `unordered`. +Default value: `ordered` in versions before 24.6. Starting with 24.6 there is no default value, the setting becomes required to be specified manually. For tables created on earlier versions the default value will remain `Ordered` for compatibility. ### after_processing {#after_processing} @@ -181,6 +181,10 @@ For 'Ordered' mode. Defines a maximum boundary for reschedule interval for a bac Default value: `30000`. +### s3queue_buckets {#buckets} + +For 'Ordered' mode. Available since `24.6`. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even further, as it defines the actual parallelism of `S3Queue` processing. + ## S3-related Settings {#s3-settings} Engine supports all s3 related settings. For more information about S3 settings see [here](../../../engines/table-engines/integrations/s3.md). diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 3de416ae64d..fdbfb742a10 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -480,7 +480,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. -- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. 
+- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. - [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`. - [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`. - [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields while schema inference. Default value - `false`. diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index ee3ec5126a2..f50dae0f1a2 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2924,6 +2924,8 @@ Define proxy servers for HTTP and HTTPS requests, currently supported by S3 stor There are three ways to define proxy servers: environment variables, proxy lists, and remote proxy resolvers. +Bypassing proxy servers for specific hosts is also supported with the use of `no_proxy`. + ### Environment variables The `http_proxy` and `https_proxy` environment variables allow you to specify a @@ -3033,6 +3035,29 @@ This also allows a mix of resolver types can be used. By default, tunneling (i.e, `HTTP CONNECT`) is used to make `HTTPS` requests over `HTTP` proxy. This setting can be used to disable it. +### no_proxy +By default, all requests will go through the proxy. In order to disable it for specific hosts, the `no_proxy` variable must be set. +It can be set inside the `<proxy>` clause for list and remote resolvers and as an environment variable for the environment resolver. +It supports IP addresses, domains, subdomains and the `'*'` wildcard for full bypass. Leading dots are stripped just like curl does. + +Example: + +The below configuration bypasses proxy requests to `clickhouse.cloud` and all of its subdomains (e.g., `auth.clickhouse.cloud`). +The same applies to GitLab, even though it has a leading dot. Both `gitlab.com` and `about.gitlab.com` would bypass the proxy. + +``` xml +<proxy> + <no_proxy>clickhouse.cloud,.gitlab.com</no_proxy> + <http> + <uri>http://proxy1</uri> + <uri>http://proxy2:3128</uri> + </http> + <https> + <uri>http://proxy1:3128</uri> + </https> +</proxy> +``` + ## max_materialized_views_count_for_table {#max_materialized_views_count_for_table} A limit on the number of materialized views attached to a table. diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index 8ccc5e292b5..e30aa66b3b3 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -106,8 +106,8 @@ To work with these states, use: - [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. - [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) function. 
- [runningAccumulate](../../sql-reference/functions/other-functions.md#runningaccumulate) function. -- [-Merge](#aggregate_functions_combinators-merge) combinator. -- [-MergeState](#aggregate_functions_combinators-mergestate) combinator. +- [-Merge](#-merge) combinator. +- [-MergeState](#-mergestate) combinator. ## -Merge diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 1dc89b8dcf9..093d88f939f 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -82,10 +82,12 @@ FROM In this case, you should remember that you do not know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) +## sequenceMatch Checks whether the sequence contains an event chain that matches the pattern. +**Syntax** + ``` sql sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` @@ -102,7 +104,7 @@ Events that occur at the same second may lay in the sequence in an undefined ord **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). **Returned values** @@ -170,9 +172,9 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM **See Also** -- [sequenceCount](#function-sequencecount) +- [sequenceCount](#sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, ...) +## sequenceCount Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. @@ -180,6 +182,8 @@ Counts the number of event chains that matched the pattern. The function searche Events that occur at the same second may lay in the sequence in an undefined order affecting the result. ::: +**Syntax** + ``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` @@ -192,7 +196,7 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...) **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). **Returned values** @@ -229,7 +233,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **See Also** -- [sequenceMatch](#function-sequencematch) +- [sequenceMatch](#sequencematch) ## windowFunnel diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index ddac82a0977..7ab9e1d3256 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -3,7 +3,7 @@ slug: /en/sql-reference/aggregate-functions/reference/stochasticlinearregression sidebar_position: 221 --- -# stochasticLinearRegression +# stochasticLinearRegression {#agg_functions_stochasticlinearregression_parameters} This function implements stochastic linear regression. 
It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size, and has a few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), and [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). @@ -72,5 +72,5 @@ The query will return a column of predicted values. Note that first argument of **See Also** -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#stochasticlogisticregression) - [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 0a040689681..4bf5529ddcb 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -11,7 +11,7 @@ This function implements stochastic logistic regression. It can be used for bina Parameters are exactly the same as in stochasticLinearRegression: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). +For more information see [parameters](../reference/stochasticlinearregression.md/#parameters). ``` text stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index fcabeb4c6a8..4e010248f6e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -27,7 +27,7 @@ Returns an integer of type `Float64`. **Implementation details** -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable` function](#varPopStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable`](#varpopstable) function. **Example** @@ -76,7 +76,7 @@ Returns an integer of type `Float64`. **Implementation details** -Unlike [`varPop()`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. +Unlike [`varPop`](#varpop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. 
**Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index be669a16ae8..bd1cfa5742a 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -40,7 +40,7 @@ Where: The function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPop()` function](./varpop#varpop) instead. -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable` function](#varSampStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable`](#varsampstable) function. **Example** @@ -82,11 +82,11 @@ varSampStable(expr) **Returned value** -The `varSampStable()` function returns a Float64 value representing the sample variance of the input data set. +The `varSampStable` function returns a Float64 value representing the sample variance of the input data set. **Implementation details** -The `varSampStable()` function calculates the sample variance using the same formula as the [`varSamp()`](#varSamp function): +The `varSampStable` function calculates the sample variance using the same formula as the [`varSamp`](#varsamp) function: ```plaintext ∑(x - mean(x))^2 / (n - 1) @@ -97,9 +97,9 @@ Where: - `mean(x)` is the arithmetic mean of the data set. - `n` is the number of data points in the data set. -The difference between `varSampStable()` and `varSamp()` is that `varSampStable()` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. +The difference between `varSampStable` and `varSamp` is that `varSampStable` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. -Like `varSamp()`, the `varSampStable()` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable()` function](./varpop#varpopstable) instead. +Like `varSamp`, the `varSampStable` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable`](./varpop#varpopstable) function instead. **Example** @@ -125,4 +125,4 @@ Response: 0.865 ``` -This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp()` due to the more precise handling of floating-point arithmetic. 
+This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp` due to the more precise handling of floating-point arithmetic. diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md index 7e3c32b3451..7ffc7447d96 100644 --- a/docs/en/sql-reference/data-types/geo.md +++ b/docs/en/sql-reference/data-types/geo.md @@ -33,7 +33,7 @@ Result: ## Ring -`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)). +`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point)). **Example** @@ -54,7 +54,7 @@ Result: ## Polygon -`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). First element of outer array is the outer shape of polygon and all the following elements are holes. +`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring)). First element of outer array is the outer shape of polygon and all the following elements are holes. **Example** @@ -76,7 +76,7 @@ Result: ## MultiPolygon -`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)). +`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon)). **Example** diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 080de94f8b7..4c7421d57c0 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -16,7 +16,7 @@ ClickHouse supports special functions for working with dictionaries that can be ClickHouse supports: - Dictionaries with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). -- [Embedded dictionaries](#embedded_dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). +- [Embedded dictionaries](#embedded-dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). :::tip Tutorial @@ -82,7 +82,7 @@ You can [configure](#configuring-a-dictionary) any number of dictionaries in the You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. ::: -## Configuring a Dictionary {#configuring-a-dictionary} +## Configuring a Dictionary @@ -123,7 +123,7 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -## Storing Dictionaries in Memory {#storing-dictionaries-in-memory} +## Storing Dictionaries in Memory There are a variety of ways to store dictionaries in memory. @@ -415,7 +415,7 @@ or LAYOUT(COMPLEX_KEY_HASHED_ARRAY([SHARDS 1])) ``` -### range_hashed {#range_hashed} +### range_hashed The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. @@ -679,7 +679,7 @@ When searching for a dictionary, the cache is searched first. For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. 
Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](#dictionary-updates) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. +For cache dictionaries, the expiration [lifetime](#refreshing-dictionary-data-using-lifetime) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. @@ -899,7 +899,7 @@ Other types are not supported yet. The function returns the attribute for the pr Data must completely fit into RAM. -## Refreshing dictionary data using LIFETIME {#lifetime} +## Refreshing dictionary data using LIFETIME ClickHouse periodically updates dictionaries based on the `LIFETIME` tag (defined in seconds). `LIFETIME` is the update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries. @@ -1031,7 +1031,7 @@ SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) ... ``` -## Dictionary Sources {#dictionary-sources} +## Dictionary Sources @@ -1065,7 +1065,7 @@ SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration The source is configured in the `source` section. -For source types [Local file](#local_file), [Executable file](#executable), [HTTP(s)](#https), [ClickHouse](#clickhouse) +For source types [Local file](#local-file), [Executable file](#executable-file), [HTTP(s)](#https), [ClickHouse](#clickhouse) optional settings are available: ``` xml @@ -1089,10 +1089,10 @@ SETTINGS(format_csv_allow_single_quotes = 0) Types of sources (`source_type`): -- [Local file](#local_file) -- [Executable File](#executable) -- [Executable Pool](#executable_pool) -- [HTTP(S)](#http) +- [Local file](#local-file) +- [Executable File](#executable-file) +- [Executable Pool](#executable-pool) +- [HTTP(S)](#https) - DBMS - [ODBC](#odbc) - [MySQL](#mysql) @@ -1102,7 +1102,7 @@ Types of sources (`source_type`): - [Cassandra](#cassandra) - [PostgreSQL](#postgresql) -### Local File {#local_file} +### Local File Example of settings: @@ -1132,9 +1132,9 @@ When a dictionary with source `FILE` is created via DDL command (`CREATE DICTION - [Dictionary function](../../sql-reference/table-functions/dictionary.md#dictionary-function) -### Executable File {#executable} +### Executable File -Working with executable files depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. 
Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. +Working with executable files depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. Example of settings: @@ -1161,7 +1161,7 @@ Setting fields: That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node. -### Executable Pool {#executable_pool} +### Executable Pool Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts. @@ -1196,9 +1196,9 @@ Setting fields: That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. -### HTTP(S) {#https} +### HTTP(S) -Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. +Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. Example of settings: @@ -1285,7 +1285,7 @@ Setting fields: - `db` – Name of the database. Omit it if the database name is set in the `` parameters. - `table` – Name of the table and schema if exists. - `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1575,7 +1575,7 @@ Setting fields: - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. 
Note that retrying leads to increased response times. Default value: `false`. @@ -1672,7 +1672,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `secure` - Use ssl for connection. - `query` – The custom query. Optional parameter. @@ -1849,7 +1849,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1873,7 +1873,7 @@ LAYOUT(FLAT()) LIFETIME(0); ``` -## Dictionary Key and Fields {#dictionary-key-and-fields} +## Dictionary Key and Fields @@ -1963,7 +1963,7 @@ PRIMARY KEY Id ### Composite Key -The key can be a `tuple` from any types of fields. The [layout](#storig-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. +The key can be a `tuple` from any types of fields. The [layout](#storing-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. :::tip A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. @@ -2030,17 +2030,17 @@ CREATE DICTIONARY somename ( Configuration fields: -| Tag | Description | Required | -|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `name` | Column name. 
| Yes | -| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | -| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | -| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | -| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | -| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.

Default value: `false`. | No | -| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. +| Tag | Description | Required | +|------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `name` | Column name. | Yes | +| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.<br/>
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | +| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | +| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | +| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | +| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place requests to dictionaries with injection after the `GROUP BY` clause. Usually this significantly reduces the number of such requests.<br/>

Default value: `false`. | No | +| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. -## Hierarchical Dictionaries {#hierarchical-dictionaries} +## Hierarchical Dictionaries ClickHouse supports hierarchical dictionaries with a [numeric key](#numeric-key). @@ -2165,7 +2165,7 @@ Points can be specified as an array or a tuple of their coordinates. In the curr The user can upload their own data in all formats supported by ClickHouse. -There are 3 types of [in-memory storage](#storig-dictionaries-in-memory) available: +There are 3 types of [in-memory storage](#storing-dictionaries-in-memory) available: - `POLYGON_SIMPLE`. This is a naive implementation, where a linear pass through all polygons is made for each query, and membership is checked for each one without using additional indexes. @@ -2435,7 +2435,7 @@ LIFETIME(0) LAYOUT(regexp_tree); ``` -## Embedded Dictionaries {#embedded-dictionaries} +## Embedded Dictionaries diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 7b52fbff714..d87ca4a0fe7 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1261,7 +1261,7 @@ SELECT arraySort((x) -> -x, [1, 2, 3]) as res; └─────────┘ ``` -For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#reverse-sort) in a sorting. +For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#arrayreversesort) in a sorting. The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: @@ -1307,10 +1307,15 @@ To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia. Same as `arraySort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. -## arrayReverseSort(\[func,\] arr, ...) {#reverse-sort} +## arrayReverseSort Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. +**Syntax** + +```sql +arrayReverseSort([func,] arr, ...) +``` Example of integer values sorting: ``` sql @@ -1907,10 +1912,16 @@ FROM numbers(1,10); - [arrayReduce](#arrayreduce) -## arrayReverse(arr) +## arrayReverse Returns an array of the same size as the original array containing the elements in reverse order. 
+**Syntax** + +```sql +arrayReverse(arr) +``` + Example: ``` sql diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index a5c8a663b71..d30c0f4dde4 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -74,7 +74,7 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `range_end` – End of the range (exclusive). [UInt32](../data-types/int-uint.md). @@ -104,7 +104,7 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../data-types/int-uint.md). @@ -134,7 +134,7 @@ subBitmap(bitmap, offset, cardinality_limit) **Arguments** -- `bitmap` – The bitmap. [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – The bitmap. [Bitmap object](#bitmapbuild). - `offset` – The position of the first element of the subset. [UInt32](../data-types/int-uint.md). - `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../data-types/int-uint.md). @@ -162,7 +162,7 @@ bitmapContains(bitmap, needle) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `needle` – Searched bit value. [UInt32](../data-types/int-uint.md). **Returned values** @@ -188,7 +188,7 @@ Result: Checks whether two bitmaps intersect. -If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmap_functions-bitmapcontains) instead as it works more efficiently. +If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmapcontains) instead as it works more efficiently. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4092c83954a..b532e0de8f0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -83,7 +83,7 @@ Result: ``` ## makeDate32 -Like [makeDate](#makeDate) but produces a [Date32](../data-types/date32.md). +Like [makeDate](#makedate) but produces a [Date32](../data-types/date32.md). ## makeDateTime @@ -214,7 +214,7 @@ Result: **See also** -- [serverTimeZone](#serverTimeZone) +- [serverTimeZone](#servertimezone) ## serverTimeZone @@ -249,7 +249,7 @@ Result: **See also** -- [timeZone](#timeZone) +- [timeZone](#timezone) ## toTimeZone @@ -305,7 +305,7 @@ int32samoa: 1546300800 **See Also** -- [formatDateTime](#formatDateTime) - supports non-constant timezone. +- [formatDateTime](#formatdatetime) - supports non-constant timezone. - [toString](type-conversion-functions.md#tostring) - supports non-constant timezone. 
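To make the "non-constant timezone" note above concrete, a minimal sketch (the time zone names and the sample timestamp are illustrative and not taken from the changed files):

```sql
-- Format one UTC timestamp with a per-row (non-constant) time zone column
SELECT
    tz,
    formatDateTime(toDateTime('2024-01-01 12:00:00', 'UTC'), '%Y-%m-%d %H:%i:%S', tz) AS local_time
FROM
(
    SELECT arrayJoin(['UTC', 'Asia/Istanbul', 'America/New_York']) AS tz
);
```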
## timeZoneOf @@ -1006,7 +1006,7 @@ toStartOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the first day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the first day of the week as described in the [toWeek()](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1049,7 +1049,7 @@ toLastDayOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the last day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the last day of the week as described in the [toWeek](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1719,7 +1719,7 @@ Result: **See Also** -- [fromDaysSinceYearZero](#fromDaysSinceYearZero) +- [fromDaysSinceYearZero](#fromdayssinceyearzero) ## fromDaysSinceYearZero @@ -1759,11 +1759,11 @@ Result: **See Also** -- [toDaysSinceYearZero](#toDaysSinceYearZero) +- [toDaysSinceYearZero](#todayssinceyearzero) ## fromDaysSinceYearZero32 -Like [fromDaysSinceYearZero](#fromDaysSinceYearZero) but returns a [Date32](../data-types/date32.md). +Like [fromDaysSinceYearZero](#fromdayssinceyearzero) but returns a [Date32](../data-types/date32.md). ## age @@ -1982,7 +1982,7 @@ Result: **See Also** -- [toStartOfInterval](#tostartofintervaldate_or_date_with_time-interval-x-unit--time_zone) +- [toStartOfInterval](#tostartofinterval) ## date\_add @@ -2055,7 +2055,7 @@ Result: **See Also** -- [addDate](#addDate) +- [addDate](#adddate) ## date\_sub @@ -2129,7 +2129,7 @@ Result: **See Also** -- [subDate](#subDate) +- [subDate](#subdate) ## timestamp\_add @@ -2310,7 +2310,7 @@ Alias: `SUBDATE` - [date_sub](#date_sub) -## now {#now} +## now Returns the current date and time at the moment of query analysis. The function is a constant expression. @@ -3609,7 +3609,7 @@ SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64 └───────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## formatDateTime {#formatDateTime} +## formatDateTime Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. @@ -3734,10 +3734,9 @@ LIMIT 10 **See Also** -- [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax) +- [formatDateTimeInJodaSyntax](#formatdatetimeinjodasyntax) - -## formatDateTimeInJodaSyntax {#formatDateTimeInJodaSyntax} +## formatDateTimeInJodaSyntax Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. 
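For a quick side-by-side of the two pattern dialects, a sketch (assuming the standard Joda pattern letters `yyyy-MM-dd HH:mm:ss`; both expressions should render the same string for this input):

```sql
-- MySQL-style vs. Joda-style format patterns for the same timestamp
SELECT
    formatDateTime(toDateTime('2024-01-01 12:30:45', 'UTC'), '%Y-%m-%d %H:%i:%S')               AS mysql_style,
    formatDateTimeInJodaSyntax(toDateTime('2024-01-01 12:30:45', 'UTC'), 'yyyy-MM-dd HH:mm:ss') AS joda_style;
```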
@@ -3902,11 +3901,11 @@ Result: **See Also** -- [fromUnixTimestampInJodaSyntax](##fromUnixTimestampInJodaSyntax) +- [fromUnixTimestampInJodaSyntax](#fromunixtimestampinjodasyntax) ## fromUnixTimestampInJodaSyntax -Same as [fromUnixTimestamp](#fromUnixTimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. +Same as [fromUnixTimestamp](#fromunixtimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. **Example:** @@ -4121,7 +4120,7 @@ Result: Returns the current date and time at the moment of query analysis. The function is a constant expression. :::note -This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now-now) is the preferred usage. +This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now) is the preferred usage. ::: **Syntax** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 82c21ce40c8..093ee690d47 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -12,7 +12,7 @@ For dictionaries created with [DDL queries](../../sql-reference/statements/creat For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/index.md). -## dictGet, dictGetOrDefault, dictGetOrNull {#dictGet} +## dictGet, dictGetOrDefault, dictGetOrNull Retrieves values from a dictionary. diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 8abc8006e5d..b6ac7a74092 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -4,6 +4,8 @@ sidebar_label: Geohash title: "Functions for Working with Geohash" --- +## Geohash + [Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer is the geohash string, the more precise is the geographic location. If you need to manually convert geographic coordinates to geohash strings, you can use [geohash.org](http://geohash.org/). diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index bcdd457964a..5fbc2adf2fa 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -4,6 +4,8 @@ sidebar_label: H3 Indexes title: "Functions for Working with H3 Indexes" --- +## H3 Index + [H3](https://eng.uber.com/h3/) is a geographical indexing system where Earth’s surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be split into seven even but smaller ones ("children"), and so on. The level of the hierarchy is called `resolution` and can receive a value from `0` till `15`, where `0` is the `base` level with the largest and coarsest cells. 
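As a minimal sketch of the indexing workflow described above (the coordinate and the resolution are arbitrary illustrations):

```sql
-- Convert a (lon, lat) point to an H3 cell at resolution 10, then inspect it
SELECT
    geoToH3(37.79506683, 55.71290588, 10) AS h3Index,
    h3GetResolution(h3Index)              AS resolution,
    h3ToGeo(h3Index)                      AS centroid;
```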
@@ -16,7 +18,7 @@ The full description of the H3 system is available at [the Uber Engineering site ## h3IsValid -Verifies whether the number is a valid [H3](#h3index) index. +Verifies whether the number is a valid [H3](#h3-index) index. **Syntax** @@ -51,7 +53,7 @@ Result: ## h3GetResolution -Defines the resolution of the given [H3](#h3index) index. +Defines the resolution of the given [H3](#h3-index) index. **Syntax** @@ -86,7 +88,7 @@ Result: ## h3EdgeAngle -Calculates the average length of the [H3](#h3index) hexagon edge in grades. +Calculates the average length of the [H3](#h3-index) hexagon edge in grades. **Syntax** @@ -100,7 +102,7 @@ h3EdgeAngle(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in grades. [Float64](../../data-types/float.md). **Example** @@ -120,7 +122,7 @@ Result: ## h3EdgeLengthM -Calculates the average length of the [H3](#h3index) hexagon edge in meters. +Calculates the average length of the [H3](#h3-index) hexagon edge in meters. **Syntax** @@ -134,7 +136,7 @@ h3EdgeLengthM(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in meters. [Float64](../../data-types/float.md). **Example** @@ -154,7 +156,7 @@ Result: ## h3EdgeLengthKm -Calculates the average length of the [H3](#h3index) hexagon edge in kilometers. +Calculates the average length of the [H3](#h3-index) hexagon edge in kilometers. **Syntax** @@ -168,7 +170,7 @@ h3EdgeLengthKm(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in kilometers. [Float64](../../data-types/float.md). **Example** @@ -188,7 +190,7 @@ Result: ## geoToH3 -Returns [H3](#h3index) point index `(lon, lat)` with specified resolution. +Returns [H3](#h3-index) point index `(lon, lat)` with specified resolution. **Syntax** @@ -225,7 +227,7 @@ Result: ## h3ToGeo -Returns the centroid longitude and latitude corresponding to the provided [H3](#h3index) index. +Returns the centroid longitude and latitude corresponding to the provided [H3](#h3-index) index. **Syntax** @@ -294,7 +296,7 @@ Result: ## h3kRing - Lists all the [H3](#h3index) hexagons in the raduis of `k` from the given hexagon in random order. + Lists all the [H3](#h3-index) hexagons in the raduis of `k` from the given hexagon in random order. **Syntax** @@ -335,7 +337,7 @@ Result: ## h3GetBaseCell -Returns the base cell number of the [H3](#h3index) index. +Returns the base cell number of the [H3](#h3-index) index. **Syntax** @@ -437,7 +439,7 @@ Result: ## h3IndexesAreNeighbors -Returns whether or not the provided [H3](#h3index) indexes are neighbors. +Returns whether or not the provided [H3](#h3-index) indexes are neighbors. **Syntax** @@ -473,7 +475,7 @@ Result: ## h3ToChildren -Returns an array of child indexes for the given [H3](#h3index) index. +Returns an array of child indexes for the given [H3](#h3-index) index. **Syntax** @@ -508,7 +510,7 @@ Result: ## h3ToParent -Returns the parent (coarser) index containing the given [H3](#h3index) index. +Returns the parent (coarser) index containing the given [H3](#h3-index) index. **Syntax** @@ -609,7 +611,7 @@ Result: ## h3GetResolution -Returns the resolution of the [H3](#h3index) index. 
+Returns the resolution of the [H3](#h3-index) index. **Syntax** @@ -643,7 +645,7 @@ Result: ## h3IsResClassIII -Returns whether [H3](#h3index) index has a resolution with Class III orientation. +Returns whether [H3](#h3-index) index has a resolution with Class III orientation. **Syntax** @@ -678,7 +680,7 @@ Result: ## h3IsPentagon -Returns whether this [H3](#h3index) index represents a pentagonal cell. +Returns whether this [H3](#h3-index) index represents a pentagonal cell. **Syntax** @@ -713,7 +715,7 @@ Result: ## h3GetFaces -Returns icosahedron faces intersected by a given [H3](#h3index) index. +Returns icosahedron faces intersected by a given [H3](#h3-index) index. **Syntax** @@ -815,7 +817,7 @@ Result: ## h3ToCenterChild -Returns the center child (finer) [H3](#h3index) index contained by given [H3](#h3index) at the given resolution. +Returns the center child (finer) [H3](#h3-index) index contained by given [H3](#h3-index) at the given resolution. **Syntax** @@ -830,7 +832,7 @@ h3ToCenterChild(index, resolution) **Returned values** -- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../data-types/int-uint.md). +- [H3](#h3-index) index of the center child contained by given [H3](#h3-index) at the given resolution. [UInt64](../../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 3165b21318b..e022ce870b0 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -5,6 +5,8 @@ sidebar_label: S2 Geometry # Functions for Working with S2 Index +## S2Index + [S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe). In the S2 library points are represented as the S2 Index - a specific number which encodes internally a point on the surface of a unit sphere, unlike traditional (latitude, longitude) pairs. To get the S2 point index for a given point specified in the format (latitude, longitude) use the [geoToS2](#geotos2) function. Also, you can use the [s2ToGeo](#s2togeo) function for getting geographical coordinates corresponding to the specified S2 point index. diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 506114038f7..e431ed75465 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -45,13 +45,13 @@ SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00') Calculates the MD4 from a string and returns the resulting set of bytes as FixedString(16). -## MD5 {#md5} +## MD5 Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead. If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))). -## sipHash64 {#siphash64} +## sipHash64 Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value. 
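To illustrate the two hashes just described, a small sketch (the input string is arbitrary; the resulting hash values are not shown):

```sql
-- sipHash64 yields a UInt64; lower(hex(MD5(...))) mimics the md5sum utility output
SELECT
    sipHash64('ClickHouse')       AS sip64,
    lower(hex(MD5('ClickHouse'))) AS md5sum_style;
```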
diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 5b6a3aef2c8..11a7749b33d 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -295,7 +295,7 @@ Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null ## toIPv6 Converts a string form of IPv6 address to [IPv6](../data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. -Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. +Similar to [IPv6StringToNum](#ipv6stringtonum) function, which converts IPv6 address to binary format. If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 5d73c9a83b3..7bff6a6cba5 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -5,10 +5,10 @@ sidebar_label: JSON --- There are two sets of functions to parse JSON: - - [`simpleJSON*` (`visitParam*`)](#simplejson--visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. + - [`simpleJSON*` (`visitParam*`)](#simplejson-visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. - [`JSONExtract*`](#jsonextract-functions) which is made for parsing ordinary JSON. -## simpleJSON / visitParam functions +## simpleJSON (visitParam) functions ClickHouse has special functions for working with simplified JSON. All these JSON functions are based on strong assumptions about what the JSON can be. They try to do as little as possible to get the job done as quickly as possible. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5e63d9824b4..e22dd5d827c 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -762,7 +762,7 @@ LIMIT 10 Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -795,7 +795,7 @@ Result: Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -926,7 +926,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. 
If the function is unable to parse the input value, it throws an exception. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -964,7 +964,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `NULL`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -1002,7 +1002,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `0`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -2711,7 +2711,7 @@ countDigits(x) - Number of digits. [UInt8](../data-types/int-uint.md#uint-ranges). :::note -For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). +For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#isdecimaloverflow). ::: **Example** @@ -2803,7 +2803,7 @@ currentProfiles() ## enabledProfiles -Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#current-profiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). +Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#currentprofiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). 
**Syntax** @@ -2916,11 +2916,11 @@ Result: └───────────────────────────┘ ``` -## queryID {#queryID} +## queryID Returns the ID of the current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `query_id`. -In contrast to [initialQueryID](#initial-query-id) function, `queryID` can return different results on different shards (see the example). +In contrast to [initialQueryID](#initialqueryid) function, `queryID` can return different results on different shards (see the example). **Syntax** @@ -2954,7 +2954,7 @@ Result: Returns the ID of the initial current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `initial_query_id`. -In contrast to [queryID](#query-id) function, `initialQueryID` returns the same results on different shards (see example). +In contrast to [queryID](#queryid) function, `initialQueryID` returns the same results on different shards (see example). **Syntax** @@ -3041,7 +3041,7 @@ shardCount() **See Also** -- [shardNum()](#shard-num) function example also contains `shardCount()` function call. +- [shardNum()](#shardnum) function example also contains `shardCount()` function call. ## getOSKernelVersion diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 6495a43fc85..e2f471d47eb 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -200,7 +200,7 @@ Banker's rounding is a method of rounding fractional numbers When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). -The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. +The [round](#round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. In other cases, the function rounds numbers to the nearest integer. @@ -274,7 +274,7 @@ roundBankers(10.755, 2) = 10.76 **See Also** -- [round](#rounding_functions-round) +- [round](#round) ## roundToExp2 diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 342ca2b9f03..c535b82d710 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1136,16 +1136,136 @@ SELECT tryBase58Decode('3dc8KtHrwM') as res, tryBase58Decode('invalid') as res_i ## base64Encode -Encodes a String or FixedString as base64. +Encodes a String or FixedString as base64, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4). Alias: `TO_BASE64`. +**Syntax** + +```sql +base64Encode(plaintext) +``` + +**Arguments** + +- `plaintext` — [String](../data-types/string.md) column or constant. + +**Returned value** + +- A string containing the encoded value of the argument. 
+ +**Example** + +``` sql +SELECT base64Encode('clickhouse'); +``` + +Result: + +```result +┌─base64Encode('clickhouse')─┐ +│ Y2xpY2tob3VzZQ== │ +└────────────────────────────┘ +``` + +## base64UrlEncode + +Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). + +**Syntax** + +```sql +base64UrlEncode(url) +``` + +**Arguments** + +- `url` — [String](../data-types/string.md) column or constant. + +**Returned value** + +- A string containing the encoded value of the argument. + +**Example** + +``` sql +SELECT base64UrlEncode('https://clickhouse.com'); +``` + +Result: + +```result +┌─base64UrlEncode('https://clickhouse.com')─┐ +│ aHR0cDovL2NsaWNraG91c2UuY29t │ +└───────────────────────────────────────────┘ +``` + ## base64Decode -Decodes a base64-encoded String or FixedString. Throws an exception in case of error. +Accepts a String and decodes it from base64, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4). Throws an exception in case of an error. Alias: `FROM_BASE64`. +**Syntax** + +```sql +base64Decode(encoded) +``` + +**Arguments** + +- `encoded` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value, an exception is thrown. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Example** + +``` sql +SELECT base64Decode('Y2xpY2tob3VzZQ=='); +``` + +Result: + +```result +┌─base64Decode('Y2xpY2tob3VzZQ==')─┐ +│ clickhouse │ +└──────────────────────────────────┘ +``` + +## base64UrlDecode + +Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). Throws an exception in case of an error. + +**Syntax** + +```sql +base64UrlDecode(encodedUrl) +``` + +**Arguments** + +- `encodedUrl` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, an exception is thrown. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Example** + +``` sql +SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t'); +``` + +Result: + +```result +┌─base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')─┐ +│ https://clickhouse.com │ +└─────────────────────────────────────────────────┘ +``` + ## tryBase64Decode Like `base64Decode` but returns an empty string in case of error. @@ -1156,9 +1276,13 @@ Like `base64Decode` but returns an empty string in case of error. tryBase64Decode(encoded) ``` -**Parameters** +**Arguments** -- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. +- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value, returns an empty string. + +**Returned value** + +- A string containing the decoded value of the argument. **Examples** @@ -1169,9 +1293,41 @@ SELECT tryBase64Decode('RW5jb2RlZA==') as res, tryBase64Decode('invalid') as res ``` ```response -┌─res─────┬─res_invalid─┐ -│ Encoded │ │ -└─────────┴─────────────┘ +┌─res────────┬─res_invalid─┐ +│ clickhouse │ │ +└────────────┴─────────────┘ +``` + +## tryBase64UrlDecode + +Like `base64UrlDecode` but returns an empty string in case of error. 
+ +**Syntax** + +```sql +tryBase64UrlDecode(encodedUrl) +``` + +**Parameters** + +- `encodedUrl`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Examples** + +Query: + +```sql +SELECT tryBase64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t') as res, tryBase64Decode('aHR0cHM6Ly9jbGlja') as res_invalid; +``` + +```response +┌─res────────────────────┬─res_invalid─┐ +│ https://clickhouse.com │ │ +└────────────────────────┴─────────────┘ ``` ## endsWith {#endswith} @@ -1994,7 +2150,7 @@ Result: ## stringJaccardIndexUTF8 -Like [stringJaccardIndex](#stringJaccardIndex) but for UTF8-encoded strings. +Like [stringJaccardIndex](#stringjaccardindex) but for UTF8-encoded strings. ## editDistance diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index d261cff3580..b7ba1d4feb7 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -262,7 +262,7 @@ Result: ## multiSearchAllPositionsUTF8 -Like [multiSearchAllPositions](#multiSearchAllPositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. +Like [multiSearchAllPositions](#multisearchallpositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. **Syntax** @@ -336,7 +336,7 @@ Result: Like [`position`](#position) but returns the leftmost offset in a `haystack` string which matches any of multiple `needle` strings. -Functions [`multiSearchFirstPositionCaseInsensitive`](#multiSearchFirstPositionCaseInsensitive), [`multiSearchFirstPositionUTF8`](#multiSearchFirstPositionUTF8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multiSearchFirstPositionCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchFirstPositionCaseInsensitive`](#multisearchfirstpositioncaseinsensitive), [`multiSearchFirstPositionUTF8`](#multisearchfirstpositionutf8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multisearchfirstpositioncaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -370,7 +370,7 @@ Result: ## multiSearchFirstPositionCaseInsensitive -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but ignores case. **Syntax** @@ -404,7 +404,7 @@ Result: ## multiSearchFirstPositionUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings. **Syntax** @@ -440,7 +440,7 @@ Result: ## multiSearchFirstPositionCaseInsensitiveUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. **Syntax** @@ -478,7 +478,7 @@ Result: Returns the index `i` (starting from 1) of the leftmost found needlei in the string `haystack` and 0 otherwise. 
-Functions [`multiSearchFirstIndexCaseInsensitive`](#multiSearchFirstIndexCaseInsensitive), [`multiSearchFirstIndexUTF8`](#multiSearchFirstIndexUTF8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multiSearchFirstIndexCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchFirstIndexCaseInsensitive`](#multisearchfirstindexcaseinsensitive), [`multiSearchFirstIndexUTF8`](#multisearchfirstindexutf8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multisearchfirstindexcaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -615,7 +615,7 @@ Result: Returns 1, if at least one string needlei matches the string `haystack` and 0 otherwise. -Functions [`multiSearchAnyCaseInsensitive`](#multiSearchAnyCaseInsensitive), [`multiSearchAnyUTF8`](#multiSearchAnyUTF8) and []`multiSearchAnyCaseInsensitiveUTF8`](#multiSearchAnyCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchAnyCaseInsensitive`](#multisearchanycaseinsensitive), [`multiSearchAnyUTF8`](#multisearchanyutf8) and [`multiSearchAnyCaseInsensitiveUTF8`](#multisearchanycaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -719,7 +719,7 @@ Result: ## multiSearchAnyCaseInsensitiveUTF8 -Like [multiSearchAnyUTF8](#multiSearchAnyUTF8) but ignores case. +Like [multiSearchAnyUTF8](#multisearchanyutf8) but ignores case. *Syntax** @@ -880,7 +880,7 @@ extractAll(haystack, pattern) Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -This function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). +This function is slower than [extractAllGroupsVertical](#extractallgroupsvertical). **Syntax** @@ -952,7 +952,7 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## like {#like} +## like Returns whether string `haystack` matches the LIKE expression `pattern`. @@ -1215,7 +1215,7 @@ Result: ## ngramSearchCaseInsensitive -Provides a case-insensitive variant of [ngramSearch](#ngramSearch). +Provides a case-insensitive variant of [ngramSearch](#ngramsearch). **Syntax** @@ -1630,7 +1630,7 @@ Result: ## hasSubsequenceCaseInsensitive -Like [hasSubsequence](#hasSubsequence) but searches case-insensitively. +Like [hasSubsequence](#hassubsequence) but searches case-insensitively. **Syntax** @@ -1665,7 +1665,7 @@ Result: ## hasSubsequenceUTF8 -Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. +Like [hasSubsequence](#hassubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. **Syntax** @@ -1700,7 +1700,7 @@ Result: ## hasSubsequenceCaseInsensitiveUTF8 -Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively. +Like [hasSubsequenceUTF8](#hassubsequenceutf8) but searches case-insensitively. 
**Syntax** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2ec51d43c59..61e84ca72d1 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -10,7 +10,7 @@ sidebar_label: Type Conversion ClickHouse generally uses the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). -`to` functions and [cast](#castx-t) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#castx-t) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. +`to` functions and [cast](#cast) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#cast) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. :::note Be aware of potential data loss if values of a datatype are converted to a smaller datatype (for example from `Int64` to `Int32`) or between @@ -70,7 +70,7 @@ Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` dat Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. **Example** @@ -169,7 +169,7 @@ Converts an input value to the [UInt](../data-types/int-uint.md) data type. This Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. **Example** @@ -996,7 +996,7 @@ Result: ## reinterpretAsUInt8 -Performs byte reinterpretation by treating the input value as a value of type UInt8. 
Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1034,7 +1034,7 @@ Result: ## reinterpretAsUInt16 -Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1072,7 +1072,7 @@ Result: ## reinterpretAsUInt32 -Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1110,7 +1110,7 @@ Result: ## reinterpretAsUInt64 -Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1148,7 +1148,7 @@ Result: ## reinterpretAsUInt128 -Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1186,7 +1186,7 @@ Result: ## reinterpretAsUInt256 -Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1224,7 +1224,7 @@ Result: ## reinterpretAsInt8 -Performs byte reinterpretation by treating the input value as a value of type Int8. 
Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1262,7 +1262,7 @@ Result: ## reinterpretAsInt16 -Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1300,7 +1300,7 @@ Result: ## reinterpretAsInt32 -Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1338,7 +1338,7 @@ Result: ## reinterpretAsInt64 -Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1376,7 +1376,7 @@ Result: ## reinterpretAsInt128 -Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1414,7 +1414,7 @@ Result: ## reinterpretAsInt256 -Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1452,7 +1452,7 @@ Result: ## reinterpretAsFloat32 -Performs byte reinterpretation by treating the input value as a value of type Float32. 
Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1486,7 +1486,7 @@ Result: ## reinterpretAsFloat64 -Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1730,7 +1730,7 @@ Result: └─────────────────────┘ ``` -## reinterpret(x, T) +## reinterpret Uses the same source in-memory bytes sequence for `x` value and reinterprets it to destination type. @@ -1766,9 +1766,9 @@ Result: └─────────────┴──────────────┴───────────────┘ ``` -## CAST(x, T) +## CAST -Converts an input value to the specified data type. Unlike the [reinterpret](#type_conversion_function-reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. +Converts an input value to the specified data type. Unlike the [reinterpret](#reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. Several syntax variants are supported. **Syntax** @@ -1875,7 +1875,7 @@ Result: Converts `x` to the `T` data type. -The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. +The difference from [cast](#cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. **Example** @@ -2061,7 +2061,7 @@ Result: └───────────────────────────┴──────────────────────────────┘ ``` -## parseDateTime {#type_conversion_functions-parseDateTime} +## parseDateTime Converts a [String](../data-types/string.md) to [DateTime](../data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). @@ -2102,15 +2102,15 @@ Alias: `TO_TIMESTAMP`. ## parseDateTimeOrZero -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeOrNull -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns `NULL` when it encounters a date format that cannot be processed. Alias: `str_to_date`. 
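
Since several anchors above are retargeted at the `parseDateTime` family, a minimal sketch of the behaviour those sections describe may be useful; the input strings and the MySQL-style format `%Y-%m-%d %H:%i:%s` are illustrative only and not taken from this change:

```sql
-- parseDateTime parses a String using MySQL-style format specifiers;
-- the OrZero / OrNull variants only differ in how they report unparsable input.
SELECT
    parseDateTime('2021-01-04 23:00:00', '%Y-%m-%d %H:%i:%s') AS parsed,
    parseDateTimeOrZero('not-a-date', '%Y-%m-%d') AS on_error_zero,   -- zero date
    parseDateTimeOrNull('not-a-date', '%Y-%m-%d') AS on_error_null;   -- NULL
```
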
-## parseDateTimeInJodaSyntax {#type_conversion_functions-parseDateTimeInJodaSyntax} +## parseDateTimeInJodaSyntax Similar to [parseDateTime](#parsedatetime), except that the format string is in [Joda](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL syntax. @@ -2151,11 +2151,11 @@ SELECT parseDateTimeInJodaSyntax('2023-02-24 14:53:31', 'yyyy-MM-dd HH:mm:ss', ' ## parseDateTimeInJodaSyntaxOrZero -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeInJodaSyntaxOrNull -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffort ## parseDateTime32BestEffort @@ -2313,11 +2313,11 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffortUSOrZero -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. ## parseDateTime64BestEffort @@ -2389,7 +2389,7 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that Converts input parameter to the [LowCardinality](../data-types/lowcardinality.md) version of same data type. -To convert data from the `LowCardinality` data type use the [CAST](#type_conversion_function-cast) function. For example, `CAST(x as String)`. +To convert data from the `LowCardinality` data type use the [CAST](#cast) function. For example, `CAST(x as String)`. **Syntax** diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 0323ae728a9..5f15907d029 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -150,7 +150,7 @@ The function also works for [Arrays](array-functions.md#function-empty) and [Str **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. +To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: @@ -190,7 +190,7 @@ The function also works for [Arrays](array-functions.md#function-notempty) or [S **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. 
+To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 0257d21b30f..ed75b1802d8 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -235,7 +235,7 @@ If `some_predicate` is not selective enough, it will return a large amount of da ### Distributed Subqueries and max_parallel_replicas -When [max_parallel_replicas](#settings-max_parallel_replicas) is greater than 1, distributed queries are further transformed. +When [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) is greater than 1, distributed queries are further transformed. For example, the following: @@ -255,7 +255,7 @@ where `M` is between `1` and `3` depending on which replica the local query is e These settings affect every MergeTree-family table in the query and have the same effect as applying `SAMPLE 1/3 OFFSET (M-1)/3` on each table. -Therefore adding the [max_parallel_replicas](#settings-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. +Therefore adding the [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. One workaround if `local_table_2` does not meet the requirements, is to use `GLOBAL IN` or `GLOBAL JOIN`. diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index a23710b12bd..aa6f132e08e 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -108,7 +108,7 @@ ALTER TABLE visits RENAME COLUMN webBrowser TO browser CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` -Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](partition.md/#how-to-set-partition-expression). +Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](../alter/partition.md/#how-to-set-partition-expression). If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist. @@ -173,7 +173,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Changing the column type is the only complex action – it changes the contents of files with data. For large tables, this may take a long time. -The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#alter_add-column) description, but column type is mandatory in this case. +The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#add-column) description, but column type is mandatory in this case. 
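
The reworded sentence above notes that `FIRST | AFTER` can also reorder a column while the column type stays mandatory; a minimal sketch, assuming a hypothetical `user_id` column in the `visits` table used by the surrounding examples:

```sql
-- The column type must be repeated even when only the position changes.
ALTER TABLE visits MODIFY COLUMN browser Array(String) AFTER user_id;
```
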
Example: diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 0ed1e523669..778816f8934 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -31,7 +31,7 @@ The following operations with [partitions](/docs/en/engines/table-engines/merget ALTER TABLE table_name [ON CLUSTER cluster] DETACH PARTITION|PART partition_expr ``` -Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#alter_attach-partition) query. +Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#attach-partitionpart) query. Example: @@ -252,7 +252,7 @@ Downloads a partition from another server. This query only works for the replica The query does the following: 1. Downloads the partition|part from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. -2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#alter_attach-partition) query to add the data to the table. +2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#attach-partitionpart) query to add the data to the table. For example: @@ -353,7 +353,7 @@ You can specify the partition expression in `ALTER ... PARTITION` queries in dif - Using the keyword `ALL`. It can be used only with DROP/DETACH/ATTACH. For example, `ALTER TABLE visits ATTACH PARTITION ALL`. - As a tuple of expressions or constants that matches (in types) the table partitioning keys tuple. In the case of a single element partitioning key, the expression should be wrapped in the `tuple (...)` function. For example, `ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')))`. - Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. +- In the [ALTER ATTACH PART](#attach-partitionpart) and [DROP DETACHED PART](#drop-detached-partitionpart) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. Usage of quotes when specifying the partition depends on the type of partition expression. For example, for the `String` type, you have to specify its name in quotes (`'`). For the `Date` and `Int*` types no quotes are needed. 
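
The relinked bullet list above enumerates the accepted partition-expression forms; as a compact recap, a sketch that reuses the table name and literal values already shown in the surrounding examples:

```sql
ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')));  -- tuple of expressions
ALTER TABLE visits DETACH PARTITION ID '201901';                            -- partition ID in single quotes
ALTER TABLE visits ATTACH PART '201901_1_1_0';                              -- part name from system.detached_parts
ALTER TABLE visits ATTACH PARTITION ALL;                                    -- keyword ALL (DROP/DETACH/ATTACH only)
```
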
diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 628fe1d2875..0253bc647e6 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -17,8 +17,8 @@ By default, tables are created only on the current server. Distributed DDL queri ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [compression_codec] [TTL expr1] [COMMENT 'comment for column'], - name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [compression_codec] [TTL expr2] [COMMENT 'comment for column'], + name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [COMMENT 'comment for column'] [compression_codec] [TTL expr1], + name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [COMMENT 'comment for column'] [compression_codec] [TTL expr2], ... ) ENGINE = engine COMMENT 'comment for table' diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 1bdf22b35b0..1fabb6d8cc7 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -6,7 +6,7 @@ sidebar_label: VIEW # CREATE VIEW -Creates a new view. Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-experimental), and [window](#window-view-experimental) (live view and window view are experimental features). +Creates a new view. Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-deprecated), and [window](#window-view-experimental) (live view and window view are experimental features). ## Normal View diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 2850ce71781..43fa344a16d 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -33,7 +33,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `role` — ClickHouse user role. - `user` — ClickHouse user account. -The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option-privilege) privilege to `user` or `role`. +The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option) privilege to `user` or `role`. The `WITH REPLACE OPTION` clause replace old roles by new role for the `user` or `role`, if is not specified it appends roles. ## Grant Current Grants Syntax @@ -201,7 +201,7 @@ Hierarchy of privileges: - `HDFS` - `S3` - [dictGet](#dictget) -- [displaySecretsInShowAndSelect](#display-secrets) +- [displaySecretsInShowAndSelect](#displaysecretsinshowandselect) - [NAMED COLLECTION ADMIN](#named-collection-admin) - `CREATE NAMED COLLECTION` - `DROP NAMED COLLECTION` @@ -498,7 +498,7 @@ Privilege level: `DICTIONARY`. 
- `GRANT dictGet ON mydictionary TO john` -### displaySecretsInShowAndSelect {#display-secrets} +### displaySecretsInShowAndSelect Allows a user to view secrets in `SHOW` and `SELECT` queries if both [`display_secrets_in_show_and_select` server setting](../../operations/server-configuration-parameters/settings#display_secrets_in_show_and_select) diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index 137f86cc8b9..78e05b19bd1 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -27,14 +27,14 @@ The features of data sampling are listed below: For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -|----------------------|------------------------------| -| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | -| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | +| SAMPLE Clause Syntax | Description | +|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#sample-k-offset-m) | -## SAMPLE K {#select-sample-k} +## SAMPLE K Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`. @@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10. -## SAMPLE N {#select-sample-n} +## SAMPLE N Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`. @@ -90,7 +90,7 @@ FROM visits SAMPLE 10000000 ``` -## SAMPLE K OFFSET M {#select-sample-offset} +## SAMPLE K OFFSET M Here `k` and `m` are numbers from 0 to 1. Examples are shown below. 
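
For the renamed `SAMPLE K OFFSET M` heading, a minimal sketch of the clause it documents; the `visits` table is the one used throughout the surrounding examples:

```sql
-- A 1/10 sample of the data, offset by 1/2 of the sampling key space.
SELECT count() FROM visits SAMPLE 1/10 OFFSET 1/2;
```
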
diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 7efbff1b42b..e6d3439d2b9 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -174,7 +174,7 @@ Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) ## Managing Distributed Tables -ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUTED](#query_language-system-flush-distributed), and [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. +ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#stop-distributed-sends), [FLUSH DISTRIBUTED](#flush-distributed), and [START DISTRIBUTED SENDS](#start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. ### STOP DISTRIBUTED SENDS diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index fc0286e76ad..6a4afb63db8 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -54,11 +54,11 @@ Identifiers are: - Cluster, database, table, partition, and column names. - Functions. - Data types. -- [Expression aliases](#expression_aliases). +- [Expression aliases](#expression-aliases). Identifiers can be quoted or non-quoted. The latter is preferred. -Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#syntax-keywords). Examples: `x`, `_1`, `X_y__Z123_`. +Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#keywords). Examples: `x`, `_1`, `X_y__Z123_`. If you want to use identifiers the same as keywords or you want to use other symbols in identifiers, quote it using double quotes or backticks, for example, `"id"`, `` `id` ``. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 4fec772c373..3a3162dad9a 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -18,7 +18,7 @@ file([path_to_archive ::] path [,format] [,structure] [,compression]) **Parameters** -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports in read-only mode the following [globs](#globs_in_path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). 
Supports in read-only mode the following [globs](#globs-in-path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). - `path_to_archive` - The relative path to a zip/tar/7z archive. Supports the same globs as `path`. - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. @@ -128,7 +128,7 @@ Reading data from `table.csv`, located in `archive1.zip` or/and `archive2.zip`: SELECT * FROM file('user_files/archives/archive{1..2}.zip :: table.csv'); ``` -## Globs in path {#globs_in_path} +## Globs in path Paths may use globbing. Files must match the whole path pattern, not only the suffix or prefix. diff --git a/docs/en/sql-reference/table-functions/fileCluster.md b/docs/en/sql-reference/table-functions/fileCluster.md index 4677d2883a7..62b00fadd62 100644 --- a/docs/en/sql-reference/table-functions/fileCluster.md +++ b/docs/en/sql-reference/table-functions/fileCluster.md @@ -22,7 +22,7 @@ fileCluster(cluster_name, path[, format, structure, compression_method]) **Arguments** - `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers. -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs_in_path). +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs-in-path). - `format` — [Format](../../interfaces/formats.md#formats) of the files. Type: [String](../../sql-reference/data-types/string.md). - `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). - `compression_method` — Compression method. Supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`. @@ -74,7 +74,7 @@ SELECT * FROM fileCluster('my_cluster', 'file{1,2}.csv', 'CSV', 'i UInt32, s Str ``` -## Globs in Path {#globs_in_path} +## Globs in Path All patterns supported by [File](../../sql-reference/table-functions/file.md#globs-in-path) table function are supported by FileCluster. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index fc258f7b4cf..fa76e84f130 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -538,16 +538,28 @@ SELECT base58Decode('3dc8KtHrwM'); Синоним: `TO_BASE64`. +## base64UrlEncode(s) + +Производит кодирование URL (String или FixedString) в base64-представление в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). + ## base64Decode(s) {#base64decode} Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение Синоним: `FROM_BASE64`. +## base64UrlDecode(s) + +Декодирует base64-представление URL в исходную строку в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). При невозможности декодирования выбрасывает исключение + ## tryBase64Decode(s) {#trybase64decode} Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. 
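
The Russian entries added in this hunk mirror the English `base64UrlEncode` / `base64UrlDecode` / `tryBase64UrlDecode` sections earlier in the change; a minimal round-trip sketch of those functions, with an illustrative input URL:

```sql
SELECT
    base64UrlEncode('https://clickhouse.com') AS encoded,
    base64UrlDecode(base64UrlEncode('https://clickhouse.com')) AS round_trip,  -- 'https://clickhouse.com'
    tryBase64UrlDecode('invalid') AS on_bad_input;  -- empty string instead of an exception
```
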
+## tryBase64UrlDecode(s) + +Функционал аналогичен base64UrlDecode, но при невозможности декодирования возвращает пустую строку. + ## endsWith(s, suffix) {#endswith} Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае. diff --git a/docs/zh/guides/improving-query-performance/skipping-indexes.md b/docs/zh/guides/improving-query-performance/skipping-indexes.md index f9f43e46927..8eb88d859f2 100644 --- a/docs/zh/guides/improving-query-performance/skipping-indexes.md +++ b/docs/zh/guides/improving-query-performance/skipping-indexes.md @@ -123,7 +123,7 @@ Bloom filter是一种数据结构,它允许对集合成员进行高效的是 有三种基于Bloom过滤器的数据跳数索引类型: -* 基本的**bloom_filter**接受一个可选参数,该参数表示在0到1之间允许的“假阳性”率(如果未指定,则使用.025)。 +* 基本的**bloom_filter**接受一个可选参数,该参数表示在0到1之间允许的“假阳性”率(如果未指定,则使用0.025)。 * 更专业的**tokenbf_v1**。需要三个参数,用来优化布隆过滤器:(1)过滤器的大小字节(大过滤器有更少的假阳性,有更高的存储成本),(2)哈希函数的个数(更多的散列函数可以减少假阳性)。(3)布隆过滤器哈希函数的种子。有关这些参数如何影响布隆过滤器功能的更多细节,请参阅 [这里](https://hur.st/bloomfilter/) 。此索引仅适用于String、FixedString和Map类型的数据。输入表达式被分割为由非字母数字字符分隔的字符序列。例如,列值`This is a candidate for a "full text" search`将被分割为`This` `is` `a` `candidate` `for` `full` `text` `search`。它用于LIKE、EQUALS、in、hasToken()和类似的长字符串中单词和其他值的搜索。例如,一种可能的用途是在非结构的应用程序日志行列中搜索少量的类名或行号。 diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 52aa601b1a2..079951be55e 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -154,8 +154,6 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolRemoteFSReader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolReader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Storages/StorageS3Settings.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/BaseDaemon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/SentryWriter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/GraphiteWriter.cpp diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index dba5c2b7d2a..0d3c1f10894 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -361,9 +361,10 @@ try } GlobalThreadPool::initialize( - config().getUInt("max_thread_pool_size", 100), - config().getUInt("max_thread_pool_free_size", 1000), - config().getUInt("thread_pool_queue_size", 10000) + /// We need to have sufficient amount of threads for connections + nuraft workers + keeper workers, 1000 is an estimation + std::min(1000U, config().getUInt("max_thread_pool_size", 1000)), + config().getUInt("max_thread_pool_free_size", 100), + config().getUInt("thread_pool_queue_size", 1000) ); /// Wait for all threads to avoid possible use-after-free (for example logging objects can be already destroyed). SCOPE_EXIT({ diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 00994b39a40..6414f7f6ea5 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -983,6 +983,18 @@ try } } + std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH)); + fs::path path = path_str; + + /// Check that the process user id matches the owner of the data. 
+ assertProcessUserMatchesDataOwner(path_str, [&](const std::string & message){ global_context->addWarningMessage(message); }); + + global_context->setPath(path_str); + + StatusFile status{path / "status", StatusFile::write_full_info}; + + ServerUUID::load(path / "uuid", log); + zkutil::validateZooKeeperConfig(config()); bool has_zookeeper = zkutil::hasZooKeeperConfig(config()); @@ -994,7 +1006,7 @@ try ConfigProcessor config_processor(config_path); loaded_config = config_processor.loadConfigWithZooKeeperIncludes( main_config_zk_node_cache, main_config_zk_changed_event, /* fallback_to_preprocessed = */ true); - config_processor.savePreprocessedConfig(loaded_config, config().getString("path", DBMS_DEFAULT_PATH)); + config_processor.savePreprocessedConfig(loaded_config, path_str); config().removeConfiguration(old_configuration.get()); config().add(loaded_config.configuration.duplicate(), PRIO_DEFAULT, false); global_context->setConfig(loaded_config.configuration); @@ -1128,19 +1140,6 @@ try global_context->setRemoteHostFilter(config()); global_context->setHTTPHeaderFilter(config()); - std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH)); - fs::path path = path_str; - std::string default_database = server_settings.default_database.toString(); - - /// Check that the process user id matches the owner of the data. - assertProcessUserMatchesDataOwner(path_str, [&](const std::string & message){ global_context->addWarningMessage(message); }); - - global_context->setPath(path_str); - - StatusFile status{path / "status", StatusFile::write_full_info}; - - ServerUUID::load(path / "uuid", log); - /// Try to increase limit on number of open files. { rlimit rlim; @@ -1671,6 +1670,10 @@ try if (global_context->isServerCompletelyStarted()) CannotAllocateThreadFaultInjector::setFaultProbability(new_server_settings.cannot_allocate_thread_fault_injection_probability); +#if USE_GWP_ASAN + GWPAsan::setForceSampleProbability(new_server_settings.gwp_asan_force_sample_probability); +#endif + ProfileEvents::increment(ProfileEvents::MainConfigLoads); /// Must be the last. @@ -1928,6 +1931,7 @@ try /// Set current database name before loading tables and databases because /// system logs may copy global context. 
+ std::string default_database = server_settings.default_database.toString(); global_context->setCurrentDatabaseNameInGlobalContext(default_database); LOG_INFO(log, "Loading metadata from {}", path_str); @@ -2120,6 +2124,10 @@ try CannotAllocateThreadFaultInjector::setFaultProbability(server_settings.cannot_allocate_thread_fault_injection_probability); +#if USE_GWP_ASAN + GWPAsan::setForceSampleProbability(server_settings.gwp_asan_force_sample_probability); +#endif + try { global_context->startClusterDiscovery(); diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 2a658d7aaa2..28a825de6cf 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -360,10 +360,13 @@ void ContextAccess::setUser(const UserPtr & user_) const subscription_for_roles_changes.reset(); enabled_roles = access_control->getEnabledRoles(current_roles, current_roles_with_admin_option); - subscription_for_roles_changes = enabled_roles->subscribeForChanges([this](const std::shared_ptr & roles_info_) + subscription_for_roles_changes = enabled_roles->subscribeForChanges([weak_ptr = weak_from_this()](const std::shared_ptr & roles_info_) { - std::lock_guard lock{mutex}; - setRolesInfo(roles_info_); + auto ptr = weak_ptr.lock(); + if (!ptr) + return; + std::lock_guard lock{ptr->mutex}; + ptr->setRolesInfo(roles_info_); }); setRolesInfo(enabled_roles->getRolesInfo()); diff --git a/src/Analyzer/InterpolateNode.cpp b/src/Analyzer/InterpolateNode.cpp index e4f7e22b803..97dc79f565b 100644 --- a/src/Analyzer/InterpolateNode.cpp +++ b/src/Analyzer/InterpolateNode.cpp @@ -10,9 +10,12 @@ namespace DB { -InterpolateNode::InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_) +InterpolateNode::InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_) : IQueryTreeNode(children_size) { + if (expression_) + expression_name = expression_->getIdentifier().getFullName(); + children[expression_child_index] = std::move(expression_); children[interpolate_expression_child_index] = std::move(interpolate_expression_); } @@ -41,13 +44,23 @@ void InterpolateNode::updateTreeHashImpl(HashState &, CompareOptions) const QueryTreeNodePtr InterpolateNode::cloneImpl() const { - return std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + auto cloned = std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + cloned->expression_name = expression_name; + return cloned; } ASTPtr InterpolateNode::toASTImpl(const ConvertToASTOptions & options) const { auto result = std::make_shared(); - result->column = getExpression()->toAST(options)->getColumnName(); + + /// Interpolate parser supports only identifier node. + /// In case of alias, identifier is replaced to expression, which can't be parsed. + /// In this case, keep original alias name. 
+ if (const auto * identifier = getExpression()->as()) + result->column = identifier->toAST(options)->getColumnName(); + else + result->column = expression_name; + result->children.push_back(getInterpolateExpression()->toAST(options)); result->expr = result->children.back(); diff --git a/src/Analyzer/InterpolateNode.h b/src/Analyzer/InterpolateNode.h index 9269d3924f5..ec493ed8bdd 100644 --- a/src/Analyzer/InterpolateNode.h +++ b/src/Analyzer/InterpolateNode.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB @@ -19,7 +19,7 @@ class InterpolateNode final : public IQueryTreeNode { public: /// Initialize interpolate node with expression and interpolate expression - explicit InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_); + explicit InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_); /// Get expression to interpolate const QueryTreeNodePtr & getExpression() const @@ -61,6 +61,9 @@ protected: ASTPtr toASTImpl(const ConvertToASTOptions & options) const override; + /// Initial name from column identifier. + std::string expression_name; + private: static constexpr size_t expression_child_index = 0; static constexpr size_t interpolate_expression_child_index = 1; diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index 9153bc4eca2..e6798a792dd 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -51,7 +51,7 @@ public: using Base = InDepthQueryTreeVisitorWithContext; using Base::Base; - void leaveImpl(QueryTreeNodePtr & node) + void enterImpl(QueryTreeNodePtr & node) { if (!getSettings().optimize_arithmetic_operations_in_aggregate_functions) return; diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp index d087fe1c7b9..8a6276008d8 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp @@ -41,9 +41,9 @@ public: return; bool replaced_argument = false; - auto & uniq_function_arguments_nodes = function_node->getArguments().getNodes(); + auto replaced_uniq_function_arguments_nodes = function_node->getArguments().getNodes(); - for (auto & uniq_function_argument_node : uniq_function_arguments_nodes) + for (auto & uniq_function_argument_node : replaced_uniq_function_arguments_nodes) { auto * uniq_function_argument_node_typed = uniq_function_argument_node->as(); if (!uniq_function_argument_node_typed || !uniq_function_argument_node_typed->isOrdinaryFunction()) @@ -67,12 +67,10 @@ public: if (!replaced_argument) return; - const auto & function_node_argument_nodes = function_node->getArguments().getNodes(); - DataTypes argument_types; - argument_types.reserve(function_node_argument_nodes.size()); + argument_types.reserve(replaced_uniq_function_arguments_nodes.size()); - for (const auto & function_node_argument : function_node_argument_nodes) + for (const auto & function_node_argument : replaced_uniq_function_arguments_nodes) argument_types.emplace_back(function_node_argument->getResultType()); AggregateFunctionProperties properties; @@ -83,6 +81,11 @@ public: function_node->getAggregateFunction()->getParameters(), properties); + /// uniqCombined returns nullable with nullable arguments so the result type might change which breaks the pass + if 
(!aggregate_function->getResultType()->equals(*function_node->getAggregateFunction()->getResultType())) + return; + + function_node->getArguments().getNodes() = replaced_uniq_function_arguments_nodes; function_node->resolveAsAggregateFunction(std::move(aggregate_function)); } }; diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 92f086295a0..56544312c26 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -88,14 +88,10 @@ namespace std::move(headers), S3::CredentialsConfiguration { - settings.auth_settings.use_environment_credentials.value_or( - context->getConfigRef().getBool("s3.use_environment_credentials", true)), - settings.auth_settings.use_insecure_imds_request.value_or( - context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - settings.auth_settings.expiration_window_seconds.value_or( - context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - settings.auth_settings.no_sign_request.value_or( - context->getConfigRef().getBool("s3.no_sign_request", false)), + settings.auth_settings.use_environment_credentials, + settings.auth_settings.use_insecure_imds_request, + settings.auth_settings.expiration_window_seconds, + settings.auth_settings.no_sign_request }); } @@ -131,12 +127,18 @@ BackupReaderS3::BackupReaderS3( : BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { - auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint - request_settings.allow_native_copy = allow_s3_native_copy; + s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); + + if (auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup)) + { + s3_settings.updateIfChanged(*endpoint_settings); + } + + s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); + s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; + client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) @@ -223,13 +225,19 @@ BackupWriterS3::BackupWriterS3( : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { - auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint - request_settings.allow_native_copy = 
allow_s3_native_copy; - request_settings.setStorageClassName(storage_class_name); + s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); + + if (auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup)) + { + s3_settings.updateIfChanged(*endpoint_settings); + } + + s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); + s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; + s3_settings.request_settings.storage_class_name = storage_class_name; + client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) { diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index f81eb975df3..327f06363c5 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 290a7311448..84aaec17a5b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -285,7 +285,7 @@ if (TARGET ch_contrib::llvm) endif () if (TARGET ch_contrib::gwp_asan) - target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::gwp_asan) + target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::gwp_asan) target_link_libraries (clickhouse_new_delete PRIVATE ch_contrib::gwp_asan) endif() diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index f8391c64d5a..854cc3fef8b 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1188,7 +1188,10 @@ void ClientBase::receiveResult(ASTPtr parsed_query, Int32 signals_before_stop, b std::rethrow_exception(local_format_error); if (cancelled && is_interactive) + { std::cout << "Query was cancelled." << std::endl; + cancelled_printed = true; + } } @@ -1302,8 +1305,13 @@ void ClientBase::onEndOfStream() resetOutput(); - if (is_interactive && !written_first_block) - std::cout << "Ok." << std::endl; + if (is_interactive) + { + if (cancelled && !cancelled_printed) + std::cout << "Query was cancelled." << std::endl; + else if (!written_first_block) + std::cout << "Ok." << std::endl; + } } @@ -1866,6 +1874,7 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin resetOutput(); have_error = false; cancelled = false; + cancelled_printed = false; client_exception.reset(); server_exception.reset(); diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 7a0489641c8..220fcddc038 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -329,6 +329,7 @@ protected: bool allow_merge_tree_settings = false; bool cancelled = false; + bool cancelled_printed = false; /// Does log_comment has specified by user? 
bool has_log_comment = false; diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 5578a8dde60..510a4cacf1e 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -51,7 +51,7 @@ public: std::string getName() const override { return "Nullable(" + nested_column->getName() + ")"; } TypeIndex getDataType() const override { return TypeIndex::Nullable; } MutableColumnPtr cloneResized(size_t size) const override; - size_t size() const override { return nested_column->size(); } + size_t size() const override { return assert_cast(*null_map).size(); } bool isNullAt(size_t n) const override { return assert_cast(*null_map).getData()[n] != 0;} Field operator[](size_t n) const override; void get(size_t n, Field & res) const override; diff --git a/src/Columns/FilterDescription.h b/src/Columns/FilterDescription.h index 63457b8b544..b4335a49787 100644 --- a/src/Columns/FilterDescription.h +++ b/src/Columns/FilterDescription.h @@ -23,15 +23,10 @@ struct ConstantFilterDescription struct IFilterDescription { - /// has_one can be pre-compute during creating the filter description in some cases - Int64 has_one = -1; virtual ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const = 0; virtual size_t countBytesInFilter() const = 0; virtual ~IFilterDescription() = default; - bool hasOne() { return has_one >= 0 ? has_one : hasOneImpl();} protected: - /// Calculate if filter has a non-zero from the filter values, may update has_one - virtual bool hasOneImpl() = 0; }; /// Obtain a filter from non constant Column, that may have type: UInt8, Nullable(UInt8). @@ -45,7 +40,6 @@ struct FilterDescription final : public IFilterDescription ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const override { return column.filter(*data, result_size_hint); } size_t countBytesInFilter() const override { return DB::countBytesInFilter(*data); } protected: - bool hasOneImpl() override { return data ? 
(has_one = !memoryIsZero(data->data(), 0, data->size())) : false; } }; struct SparseFilterDescription final : public IFilterDescription @@ -56,7 +50,6 @@ struct SparseFilterDescription final : public IFilterDescription ColumnPtr filter(const IColumn & column, ssize_t) const override { return column.index(*filter_indices, 0); } size_t countBytesInFilter() const override { return filter_indices->size(); } protected: - bool hasOneImpl() override { return filter_indices && !filter_indices->empty(); } }; struct ColumnWithTypeAndName; diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index e80c125c2a0..bfc85559fe8 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -1,8 +1,9 @@ #include -#include -#include -#include #include +#include +#include +#include +#include #include #include @@ -10,6 +11,12 @@ #include #include /// MADV_POPULATE_WRITE +namespace ProfileEvents +{ + extern const Event GWPAsanAllocateSuccess; + extern const Event GWPAsanAllocateFailed; + extern const Event GWPAsanFree; +} namespace DB { @@ -60,6 +67,27 @@ template void * allocNoTrack(size_t size, size_t alignment) { void * buf; +#if USE_GWP_ASAN + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) + { + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignment)) + { + if constexpr (clear_memory) + memset(ptr, 0, size); + + if constexpr (populate) + prefaultPages(ptr, size); + + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); + + return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } + } +#endif if (alignment <= MALLOC_MIN_ALIGNMENT) { if constexpr (clear_memory) @@ -91,6 +119,15 @@ void * allocNoTrack(size_t size, size_t alignment) void freeNoTrack(void * buf) { +#if USE_GWP_ASAN + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(buf))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanFree); + GWPAsan::GuardedAlloc.deallocate(buf); + return; + } +#endif + ::free(buf); } @@ -144,8 +181,54 @@ void * Allocator::realloc(void * buf, size_t old_size, { /// nothing to do. /// BTW, it's not possible to change alignment while doing realloc. + return buf; } - else if (alignment <= MALLOC_MIN_ALIGNMENT) + +#if USE_GWP_ASAN + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) + { + if (void * ptr = GWPAsan::GuardedAlloc.allocate(new_size, alignment)) + { + auto trace_free = CurrentMemoryTracker::free(old_size); + auto trace_alloc = CurrentMemoryTracker::alloc(new_size); + trace_free.onFree(buf, old_size); + + memcpy(ptr, buf, std::min(old_size, new_size)); + free(buf, old_size); + trace_alloc.onAlloc(buf, new_size); + + if constexpr (clear_memory) + if (new_size > old_size) + memset(reinterpret_cast(ptr) + old_size, 0, new_size - old_size); + + if constexpr (populate) + prefaultPages(ptr, new_size); + + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); + return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } + } + + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(buf))) + { + /// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods. + void * new_buf = alloc(new_size, alignment); + memcpy(new_buf, buf, std::min(old_size, new_size)); + free(buf, old_size); + buf = new_buf; + + if constexpr (populate) + prefaultPages(buf, new_size); + + return buf; + } +#endif + + if (alignment <= MALLOC_MIN_ALIGNMENT) { /// Resize malloc'd memory region with no special alignment requirement. 
auto trace_free = CurrentMemoryTracker::free(old_size); diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index f2c60afa1a8..b7b1f1ecfde 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -1,6 +1,7 @@ #include "EnvironmentProxyConfigurationResolver.h" #include +#include #include namespace DB @@ -12,6 +13,7 @@ namespace DB * */ static constexpr auto PROXY_HTTP_ENVIRONMENT_VARIABLE = "http_proxy"; static constexpr auto PROXY_HTTPS_ENVIRONMENT_VARIABLE = "https_proxy"; +static constexpr auto NO_PROXY_ENVIRONMENT_VARIABLE = "no_proxy"; EnvironmentProxyConfigurationResolver::EnvironmentProxyConfigurationResolver( Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_) @@ -34,31 +36,60 @@ namespace return std::getenv(PROXY_HTTPS_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) } } + + const char * getNoProxyHosts() + { + return std::getenv(NO_PROXY_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) + } + + ProxyConfiguration buildProxyConfiguration( + ProxyConfiguration::Protocol request_protocol, + const Poco::URI & uri, + const std::string & no_proxy_hosts_string, + bool disable_tunneling_for_https_requests_over_http_proxy) + { + if (uri.empty()) + { + return {}; + } + + const auto & host = uri.getHost(); + const auto & scheme = uri.getScheme(); + const auto port = uri.getPort(); + + const bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( + request_protocol, + ProxyConfiguration::protocolFromString(scheme), + disable_tunneling_for_https_requests_over_http_proxy); + + LOG_TRACE(getLogger("EnvironmentProxyConfigurationResolver"), "Use proxy from environment: {}://{}:{}", scheme, host, port); + + return ProxyConfiguration { + host, + ProxyConfiguration::protocolFromString(scheme), + port, + use_tunneling_for_https_requests_over_http_proxy, + request_protocol, + no_proxy_hosts_string + }; + } } ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() { - const auto * proxy_host = getProxyHost(request_protocol); + static const auto * http_proxy_host = getProxyHost(Protocol::HTTP); + static const auto * https_proxy_host = getProxyHost(Protocol::HTTPS); + static const auto * no_proxy = getNoProxyHosts(); + static const auto poco_no_proxy_hosts = no_proxy ? buildPocoNonProxyHosts(no_proxy) : ""; - if (!proxy_host) - { - return {}; - } + static const Poco::URI http_proxy_uri(http_proxy_host ? http_proxy_host : ""); + static const Poco::URI https_proxy_uri(https_proxy_host ? https_proxy_host : ""); - auto uri = Poco::URI(proxy_host); - auto host = uri.getHost(); - auto scheme = uri.getScheme(); - auto port = uri.getPort(); - - LOG_TRACE(getLogger("EnvironmentProxyConfigurationResolver"), "Use proxy from environment: {}://{}:{}", scheme, host, port); - - return ProxyConfiguration { - host, - ProxyConfiguration::protocolFromString(scheme), - port, - useTunneling(request_protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), - request_protocol - }; + return buildProxyConfiguration( + request_protocol, + request_protocol == Protocol::HTTP ? 
http_proxy_uri : https_proxy_uri, + poco_no_proxy_hosts, + disable_tunneling_for_https_requests_over_http_proxy); } } diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp new file mode 100644 index 00000000000..488f8e2c5dc --- /dev/null +++ b/src/Common/GWPAsan.cpp @@ -0,0 +1,226 @@ +#include + +#if USE_GWP_ASAN +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include +# include + +namespace GWPAsan +{ + +namespace +{ +size_t getBackTrace(uintptr_t * trace_buffer, size_t buffer_size) +{ + StackTrace stacktrace; + auto trace_size = std::min(buffer_size, stacktrace.getSize()); + const auto & frame_pointers = stacktrace.getFramePointers(); + memcpy(trace_buffer, frame_pointers.data(), trace_size * sizeof(uintptr_t)); + return trace_size; +} + +__attribute__((__format__ (__printf__, 1, 0))) +void printString(const char * format, ...) // NOLINT(cert-dcl50-cpp) +{ + std::array formatted; + va_list args; + va_start(args, format); + + if (vsnprintf(formatted.data(), formatted.size(), format, args) > 0) + std::cerr << formatted.data() << std::endl; + + va_end(args); +} + +} + +gwp_asan::GuardedPoolAllocator GuardedAlloc; + +static bool guarded_alloc_initialized = [] +{ + const char * env_options_raw = std::getenv("GWP_ASAN_OPTIONS"); // NOLINT(concurrency-mt-unsafe) + if (env_options_raw) + gwp_asan::options::initOptions(env_options_raw, printString); + + auto & opts = gwp_asan::options::getOptions(); + if (!env_options_raw || !std::string_view{env_options_raw}.contains("MaxSimultaneousAllocations")) + opts.MaxSimultaneousAllocations = 1024; + + if (!env_options_raw || !std::string_view{env_options_raw}.contains("SampleRate")) + opts.SampleRate = 50000; + + opts.Backtrace = getBackTrace; + GuardedAlloc.init(opts); + + return true; +}(); + +bool isGWPAsanError(uintptr_t fault_address) +{ + const auto * state = GuardedAlloc.getAllocatorState(); + if (state->FailureType != gwp_asan::Error::UNKNOWN && state->FailureAddress != 0) + return true; + + return fault_address < state->GuardedPagePoolEnd && state->GuardedPagePool <= fault_address; +} + +namespace +{ + +struct ScopedEndOfReportDecorator +{ + explicit ScopedEndOfReportDecorator(Poco::LoggerPtr log_) : log(std::move(log_)) { } + ~ScopedEndOfReportDecorator() { LOG_FATAL(log, "*** End GWP-ASan report ***"); } + Poco::LoggerPtr log; +}; + +// Prints the provided error and metadata information. +void printHeader(gwp_asan::Error error, uintptr_t fault_address, const gwp_asan::AllocationMetadata * allocation_meta, Poco::LoggerPtr log) +{ + bool access_was_in_bounds = false; + std::string description; + if (error != gwp_asan::Error::UNKNOWN && allocation_meta != nullptr) + { + uintptr_t address = __gwp_asan_get_allocation_address(allocation_meta); + size_t size = __gwp_asan_get_allocation_size(allocation_meta); + if (fault_address < address) + { + description = fmt::format( + "({} byte{} to the left of a {}-byte allocation at 0x{}) ", + address - fault_address, + (address - fault_address == 1) ? "" : "s", + size, + address); + } + else if (fault_address > address) + { + description = fmt::format( + "({} byte{} to the right of a {}-byte allocation at 0x{}) ", + fault_address - address, + (fault_address - address == 1) ? 
"" : "s", + size, + address); + } + else if (error == gwp_asan::Error::DOUBLE_FREE) + { + description = fmt::format("(a {}-byte allocation) ", size); + } + else + { + access_was_in_bounds = true; + description = fmt::format( + "({} byte{} into a {}-byte allocation at 0x{}) ", + fault_address - address, + (fault_address - address == 1) ? "" : "s", + size, + address); + } + } + + uint64_t thread_id = gwp_asan::getThreadID(); + std::string thread_id_string = thread_id == gwp_asan::kInvalidThreadID ? " 512B in length.\n"; + + if (allocation_meta == nullptr) + { + LOG_FATAL(logger, "*** GWP-ASan detected a memory error ***"); + ScopedEndOfReportDecorator decorator(logger); + LOG_FATAL(logger, fmt::runtime(unknown_crash_text)); + return; + } + + LOG_FATAL(logger, "*** GWP-ASan detected a memory error ***"); + ScopedEndOfReportDecorator decorator(logger); + + gwp_asan::Error error = __gwp_asan_diagnose_error(state, allocation_meta, fault_address); + if (error == gwp_asan::Error::UNKNOWN) + { + LOG_FATAL(logger, fmt::runtime(unknown_crash_text)); + return; + } + + // Print the error header. + printHeader(error, fault_address, allocation_meta, logger); + + static constexpr size_t maximum_stack_frames = 512; + std::array trace; + + // Maybe print the deallocation trace. + if (__gwp_asan_is_deallocated(allocation_meta)) + { + uint64_t thread_id = __gwp_asan_get_deallocation_thread_id(allocation_meta); + if (thread_id == gwp_asan::kInvalidThreadID) + LOG_FATAL(logger, "0x{} was deallocated by thread here:", fault_address); + else + LOG_FATAL(logger, "0x{} was deallocated by thread {} here:", fault_address, thread_id); + const auto trace_length = __gwp_asan_get_deallocation_trace(allocation_meta, trace.data(), maximum_stack_frames); + StackTrace::toStringEveryLine( + reinterpret_cast(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); }); + } + + // Print the allocation trace. 
+ uint64_t thread_id = __gwp_asan_get_allocation_thread_id(allocation_meta); + if (thread_id == gwp_asan::kInvalidThreadID) + LOG_FATAL(logger, "0x{} was allocated by thread here:", fault_address); + else + LOG_FATAL(logger, "0x{} was allocated by thread {} here:", fault_address, thread_id); + const auto trace_length = __gwp_asan_get_allocation_trace(allocation_meta, trace.data(), maximum_stack_frames); + StackTrace::toStringEveryLine( + reinterpret_cast(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); }); +} + +std::atomic force_sample_probability = 0.0; + +void setForceSampleProbability(double value) +{ + force_sample_probability.store(value, std::memory_order_relaxed); +} + +} + +#endif diff --git a/src/Common/GWPAsan.h b/src/Common/GWPAsan.h new file mode 100644 index 00000000000..b3215c6157e --- /dev/null +++ b/src/Common/GWPAsan.h @@ -0,0 +1,34 @@ +#pragma once + +#include "config.h" + +#if USE_GWP_ASAN + +#include +#include + +#include +#include + +namespace GWPAsan +{ + +extern gwp_asan::GuardedPoolAllocator GuardedAlloc; + +bool isGWPAsanError(uintptr_t fault_address); + +void printReport(uintptr_t fault_address); + +extern std::atomic force_sample_probability; + +void setForceSampleProbability(double value); + +inline bool shouldForceSample() +{ + std::bernoulli_distribution dist(force_sample_probability.load(std::memory_order_relaxed)); + return dist(thread_local_rng); +} + +} + +#endif diff --git a/src/Common/HTTPConnectionPool.cpp b/src/Common/HTTPConnectionPool.cpp index 167aeee68f3..f3ff09bc90a 100644 --- a/src/Common/HTTPConnectionPool.cpp +++ b/src/Common/HTTPConnectionPool.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -70,20 +71,6 @@ namespace CurrentMetrics namespace { - Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) - { - Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; - - poco_proxy_config.host = proxy_configuration.host; - poco_proxy_config.port = proxy_configuration.port; - poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol); - poco_proxy_config.tunnel = proxy_configuration.tunneling; - poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol); - - return poco_proxy_config; - } - - constexpr size_t roundUp(size_t x, size_t rounding) { chassert(rounding > 0); @@ -696,7 +683,8 @@ struct EndpointPoolKey proxy_config.port, proxy_config.protocol, proxy_config.tunneling, - proxy_config.original_request_protocol) + proxy_config.original_request_protocol, + proxy_config.no_proxy_hosts) == std::tie( rhs.connection_group, rhs.target_host, @@ -706,7 +694,8 @@ struct EndpointPoolKey rhs.proxy_config.port, rhs.proxy_config.protocol, rhs.proxy_config.tunneling, - rhs.proxy_config.original_request_protocol); + rhs.proxy_config.original_request_protocol, + rhs.proxy_config.no_proxy_hosts); } }; diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index ece5114a998..92ef0597c7e 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -1,17 +1,20 @@ #pragma once -#include -#include -#include -#include +#include "config.h" + #include #include +#include +#include +#include +#include +#include + +#include +#include +#include #include #include -#include -#include -#include -#include #ifndef NDEBUG #include @@ -112,6 +115,11 @@ protected: template void alloc(size_t bytes, 
TAllocatorParams &&... allocator_params) { +#if USE_GWP_ASAN + if (unlikely(GWPAsan::shouldForceSample())) + gwp_asan::getThreadLocals()->NextSampleCounter = 1; +#endif + char * allocated = reinterpret_cast(TAllocator::alloc(bytes, std::forward(allocator_params)...)); c_start = allocated + pad_left; @@ -141,6 +149,11 @@ protected: return; } +#if USE_GWP_ASAN + if (unlikely(GWPAsan::shouldForceSample())) + gwp_asan::getThreadLocals()->NextSampleCounter = 1; +#endif + unprotect(); ptrdiff_t end_diff = c_end - c_start; diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f73e16c517d..fef1c4a2b75 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -643,7 +643,8 @@ The server successfully detected this situation and will download merged part fr \ M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\ M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \ - M(IOUringSQEsResubmits, "Total number of io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsAsync, "Total number of asynchronous io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed") \ M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs") \ M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ \ @@ -753,6 +754,10 @@ The server successfully detected this situation and will download merged part fr \ M(ReadWriteBufferFromHTTPRequestsSent, "Number of HTTP requests sent by ReadWriteBufferFromHTTP") \ M(ReadWriteBufferFromHTTPBytes, "Total size of payload bytes received and sent by ReadWriteBufferFromHTTP. Doesn't include HTTP headers.") \ + \ + M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan") \ + M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. 
filled pool)") \ + M(GWPAsanFree, "Number of free operations done by GWPAsan") \ #ifdef APPLY_FOR_EXTERNAL_EVENTS diff --git a/src/Common/ProfileEvents.h b/src/Common/ProfileEvents.h index e670b8907d2..f196ed5a04c 100644 --- a/src/Common/ProfileEvents.h +++ b/src/Common/ProfileEvents.h @@ -40,6 +40,7 @@ namespace ProfileEvents Timer(Counters & counters_, Event timer_event_, Event counter_event, Resolution resolution_); ~Timer() { end(); } void cancel() { watch.reset(); } + void restart() { watch.restart(); } void end(); UInt64 get(); diff --git a/src/Common/ProxyConfiguration.h b/src/Common/ProxyConfiguration.h index 97577735bce..a9921f1474d 100644 --- a/src/Common/ProxyConfiguration.h +++ b/src/Common/ProxyConfiguration.h @@ -44,11 +44,18 @@ struct ProxyConfiguration } } + static bool useTunneling(Protocol request_protocol, Protocol proxy_protocol, bool disable_tunneling_for_https_requests_over_http_proxy) + { + bool is_https_request_over_http_proxy = request_protocol == Protocol::HTTPS && proxy_protocol == Protocol::HTTP; + return is_https_request_over_http_proxy && !disable_tunneling_for_https_requests_over_http_proxy; + } + std::string host = std::string{}; Protocol protocol = Protocol::HTTP; uint16_t port = 0; bool tunneling = false; Protocol original_request_protocol = Protocol::HTTP; + std::string no_proxy_hosts = std::string{}; bool isEmpty() const { return host.empty(); } }; diff --git a/src/Common/ProxyConfigurationResolver.h b/src/Common/ProxyConfigurationResolver.h index b82936502bb..1e9f4ad77f7 100644 --- a/src/Common/ProxyConfigurationResolver.h +++ b/src/Common/ProxyConfigurationResolver.h @@ -19,13 +19,6 @@ struct ProxyConfigurationResolver virtual void errorReport(const ProxyConfiguration & config) = 0; protected: - - static bool useTunneling(Protocol request_protocol, Protocol proxy_protocol, bool disable_tunneling_for_https_requests_over_http_proxy) - { - bool is_https_request_over_http_proxy = request_protocol == Protocol::HTTPS && proxy_protocol == Protocol::HTTP; - return is_https_request_over_http_proxy && !disable_tunneling_for_https_requests_over_http_proxy; - } - Protocol request_protocol; bool disable_tunneling_for_https_requests_over_http_proxy = false; }; diff --git a/src/Common/ProxyConfigurationResolverProvider.cpp b/src/Common/ProxyConfigurationResolverProvider.cpp index 4008ac2d8a5..b06073121e7 100644 --- a/src/Common/ProxyConfigurationResolverProvider.cpp +++ b/src/Common/ProxyConfigurationResolverProvider.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -17,6 +18,11 @@ namespace ErrorCodes namespace { + std::string getNoProxyHosts(const Poco::Util::AbstractConfiguration & configuration) + { + return configuration.getString("proxy.no_proxy", ""); + } + bool isTunnelingDisabledForHTTPSRequestsOverHTTPProxy( const Poco::Util::AbstractConfiguration & configuration) { @@ -49,6 +55,7 @@ namespace return std::make_shared( server_configuration, request_protocol, + buildPocoNonProxyHosts(getNoProxyHosts(configuration)), std::make_shared(), isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } @@ -88,7 +95,11 @@ namespace return uris.empty() ? 
nullptr - : std::make_shared(uris, request_protocol, isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); + : std::make_shared( + uris, + request_protocol, + buildPocoNonProxyHosts(getNoProxyHosts(configuration)), + isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } bool hasRemoteResolver(const String & config_prefix, const Poco::Util::AbstractConfiguration & configuration) diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index c527c89ea6b..2d5b5e97364 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -1,7 +1,6 @@ #include #include -#include #include namespace DB @@ -9,8 +8,11 @@ namespace DB ProxyListConfigurationResolver::ProxyListConfigurationResolver( std::vector proxies_, - Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_) - : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), proxies(std::move(proxies_)) + Protocol request_protocol_, + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_) + : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), + proxies(std::move(proxies_)), no_proxy_hosts(no_proxy_hosts_) { } @@ -26,12 +28,18 @@ ProxyConfiguration ProxyListConfigurationResolver::resolve() auto & proxy = proxies[index]; + bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( + request_protocol, + ProxyConfiguration::protocolFromString(proxy.getScheme()), + disable_tunneling_for_https_requests_over_http_proxy); + return ProxyConfiguration { proxy.getHost(), ProxyConfiguration::protocolFromString(proxy.getScheme()), proxy.getPort(), - useTunneling(request_protocol, ProxyConfiguration::protocolFromString(proxy.getScheme()), disable_tunneling_for_https_requests_over_http_proxy), - request_protocol + use_tunneling_for_https_requests_over_http_proxy, + request_protocol, + no_proxy_hosts }; } diff --git a/src/Common/ProxyListConfigurationResolver.h b/src/Common/ProxyListConfigurationResolver.h index 95e0073d779..a87826792d4 100644 --- a/src/Common/ProxyListConfigurationResolver.h +++ b/src/Common/ProxyListConfigurationResolver.h @@ -15,7 +15,11 @@ namespace DB class ProxyListConfigurationResolver : public ProxyConfigurationResolver { public: - ProxyListConfigurationResolver(std::vector proxies_, Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_ = false); + ProxyListConfigurationResolver( + std::vector proxies_, + Protocol request_protocol_, + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_ = false); ProxyConfiguration resolve() override; @@ -23,6 +27,7 @@ public: private: std::vector proxies; + std::string no_proxy_hosts; /// Access counter to get proxy using round-robin strategy. 
std::atomic access_counter; diff --git a/src/Common/RemoteProxyConfigurationResolver.cpp b/src/Common/RemoteProxyConfigurationResolver.cpp index 176e7af4f0f..8fd9d381ece 100644 --- a/src/Common/RemoteProxyConfigurationResolver.cpp +++ b/src/Common/RemoteProxyConfigurationResolver.cpp @@ -42,11 +42,12 @@ std::string RemoteProxyHostFetcherImpl::fetch(const Poco::URI & endpoint, const RemoteProxyConfigurationResolver::RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, + const std::string & no_proxy_hosts_, std::shared_ptr fetcher_, bool disable_tunneling_for_https_requests_over_http_proxy_ ) : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), - remote_server_configuration(remote_server_configuration_), fetcher(fetcher_) + remote_server_configuration(remote_server_configuration_), no_proxy_hosts(no_proxy_hosts_), fetcher(fetcher_) { } @@ -84,7 +85,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve() auto proxy_protocol = ProxyConfiguration::protocolFromString(proxy_protocol_string); - bool use_tunneling_for_https_requests_over_http_proxy = useTunneling( + bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( request_protocol, proxy_protocol, disable_tunneling_for_https_requests_over_http_proxy); @@ -94,6 +95,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve() cached_config.port = proxy_port; cached_config.tunneling = use_tunneling_for_https_requests_over_http_proxy; cached_config.original_request_protocol = request_protocol; + cached_config.no_proxy_hosts = no_proxy_hosts; cache_timestamp = std::chrono::system_clock::now(); cache_valid = true; diff --git a/src/Common/RemoteProxyConfigurationResolver.h b/src/Common/RemoteProxyConfigurationResolver.h index 4e61a185bb3..d41f6267b89 100644 --- a/src/Common/RemoteProxyConfigurationResolver.h +++ b/src/Common/RemoteProxyConfigurationResolver.h @@ -41,6 +41,7 @@ public: RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, + const std::string & no_proxy_hosts_, std::shared_ptr fetcher_, bool disable_tunneling_for_https_requests_over_http_proxy_ = false); @@ -50,6 +51,7 @@ public: private: RemoteServerConfiguration remote_server_configuration; + std::string no_proxy_hosts; std::shared_ptr fetcher; std::mutex cache_mutex; diff --git a/src/Common/memory.h b/src/Common/memory.h index a828ba7a38e..caa0418fa56 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -5,6 +5,8 @@ #include #include +#include +#include #include "config.h" #if USE_JEMALLOC @@ -15,11 +17,12 @@ # include #endif -#if USE_GWP_ASAN -# include - -static gwp_asan::GuardedPoolAllocator GuardedAlloc; -#endif +namespace ProfileEvents +{ + extern const Event GWPAsanAllocateSuccess; + extern const Event GWPAsanAllocateFailed; + extern const Event GWPAsanFree; +} namespace Memory { @@ -34,17 +37,31 @@ requires DB::OptionalArgument inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... 
align) { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { if constexpr (sizeof...(TAlign) == 1) { - if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align...))) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignToSizeT(align...))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } else { - if (void * ptr = GuardedAlloc.allocate(size)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size)) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } } @@ -66,10 +83,17 @@ inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... align) inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = GuardedAlloc.allocate(size)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size)) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } #endif return malloc(size); @@ -78,10 +102,17 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept inline ALWAYS_INLINE void * newNoExept(std::size_t size, std::align_val_t align) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align))) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignToSizeT(align))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } #endif return aligned_alloc(static_cast(align), size); @@ -90,9 +121,10 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size, std::align_val_t align) inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { - GuardedAlloc.deallocate(ptr); + ProfileEvents::increment(ProfileEvents::GWPAsanFree); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif @@ -109,9 +141,10 @@ inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size, TAlign... al return; #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { - GuardedAlloc.deallocate(ptr); + ProfileEvents::increment(ProfileEvents::GWPAsanFree); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif @@ -129,9 +162,10 @@ requires DB::OptionalArgument inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]], TAlign... 
/* align */) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { - GuardedAlloc.deallocate(ptr); + ProfileEvents::increment(ProfileEvents::GWPAsanFree); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif @@ -183,10 +217,10 @@ inline ALWAYS_INLINE size_t untrackMemory(void * ptr [[maybe_unused]], Allocatio std::size_t actual_size = 0; #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { if (!size) - size = GuardedAlloc.getSize(ptr); + size = GWPAsan::GuardedAlloc.getSize(ptr); trace = CurrentMemoryTracker::free(size); return size; } diff --git a/src/Common/new_delete.cpp b/src/Common/new_delete.cpp index 9e93dca9787..e8151fbe201 100644 --- a/src/Common/new_delete.cpp +++ b/src/Common/new_delete.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "config.h" #include @@ -42,27 +41,6 @@ static struct InitializeJemallocZoneAllocatorForOSX } initializeJemallocZoneAllocatorForOSX; #endif -#if USE_GWP_ASAN - -#include - -/// Both clickhouse_new_delete and clickhouse_common_io links gwp_asan, but It should only init once, otherwise it -/// will cause unexpected deadlock. -static struct InitGwpAsan -{ - InitGwpAsan() - { - gwp_asan::options::initOptions(); - gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); - GuardedAlloc.init(opts); - - ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled - /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations - /// << ", SampleRate: " << opts.SampleRate << " }\n"; - } -} init_gwp_asan; -#endif - /// Replace default new/delete with memory tracking versions. /// @sa https://en.cppreference.com/w/cpp/memory/new/operator_new /// https://en.cppreference.com/w/cpp/memory/new/operator_delete diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp new file mode 100644 index 00000000000..c06014ac2dc --- /dev/null +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -0,0 +1,117 @@ +#include + + +#include +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wshadow-field-in-constructor" +#pragma clang diagnostic ignored "-Wdtor-name" +#include +#pragma clang diagnostic pop + +namespace DB +{ + +namespace +{ + +/* + * Copy `curl` behavior instead of `wget` as it seems to be more flexible. + * `curl` strips leading dot and accepts url gitlab.com as a match for no_proxy .gitlab.com, + * while `wget` does an exact match. + * */ +std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) +{ + std::string_view view_without_leading_dot = host; + if (host[0] == '.') + { + view_without_leading_dot = std::string_view {host.begin() + 1u, host.end()}; + } + + return RE2::QuoteMeta(view_without_leading_dot); +} + +} + +/* + * Even though there is not an RFC that defines NO_PROXY, it is usually a comma-separated list of domains. + * Different tools implement their own versions of `NO_PROXY` support. Some support CIDR blocks, some support wildcard etc. 
+ * Opting for a simple implementation that covers most use cases:
+ * * Support only single wildcard * (match anything)
+ * * Match subdomains
+ * * Strip leading dots
+ * * No regex
+ * * No CIDR blocks
+ * * No fancy stuff about loopback IPs
+ * https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/
+ * Open for discussions
+ * */
+std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string)
+{
+    if (no_proxy_hosts_string.empty())
+    {
+        return "";
+    }
+
+    static constexpr auto OR_SEPARATOR = "|";
+    static constexpr auto MATCH_ANYTHING = R"(.*)";
+    static constexpr auto MATCH_SUBDOMAINS_REGEX = R"((?:.*\.)?)";
+
+    bool match_any_host = no_proxy_hosts_string.size() == 1 && no_proxy_hosts_string[0] == '*';
+
+    if (match_any_host)
+    {
+        return MATCH_ANYTHING;
+    }
+
+    std::vector no_proxy_hosts;
+    splitInto<','>(no_proxy_hosts, no_proxy_hosts_string);
+
+    bool first = true;
+    std::string result;
+
+    for (auto & host : no_proxy_hosts)
+    {
+        trim(host);
+
+        if (host.empty())
+        {
+            continue;
+        }
+
+        if (!first)
+        {
+            result.append(OR_SEPARATOR);
+        }
+
+        auto escaped_host_without_leading_dot = buildPocoRegexpEntryWithoutLeadingDot(host);
+
+        result.append(MATCH_SUBDOMAINS_REGEX);
+        result.append(escaped_host_without_leading_dot);
+
+        first = false;
+    }
+
+    return result;
+}
+
+Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration)
+{
+    Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config;
+
+    poco_proxy_config.host = proxy_configuration.host;
+    poco_proxy_config.port = proxy_configuration.port;
+    poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol);
+    poco_proxy_config.tunnel = proxy_configuration.tunneling;
+    poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol);
+    poco_proxy_config.nonProxyHosts = proxy_configuration.no_proxy_hosts;
+
+    return poco_proxy_config;
+}
+
+}
diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.h b/src/Common/proxyConfigurationToPocoProxyConfig.h
new file mode 100644
index 00000000000..c118bd059f9
--- /dev/null
+++ b/src/Common/proxyConfigurationToPocoProxyConfig.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include
+#include
+
+namespace DB
+{
+
+Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration);
+
+std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string);
+
+}
diff --git a/src/Common/tests/gtest_helper_functions.h b/src/Common/tests/gtest_helper_functions.h
index 54c9ae9170d..90c5d4d2088 100644
--- a/src/Common/tests/gtest_helper_functions.h
+++ b/src/Common/tests/gtest_helper_functions.h
@@ -76,22 +76,28 @@ inline std::string xmlNodeAsString(Poco::XML::Node *pNode)

 struct EnvironmentProxySetter
 {
-    EnvironmentProxySetter(const Poco::URI & http_proxy, const Poco::URI & https_proxy)
-    {
-        if (!http_proxy.empty())
-        {
-            setenv("http_proxy", http_proxy.toString().c_str(), 1); // NOLINT(concurrency-mt-unsafe)
-        }
+    static constexpr auto * NO_PROXY = "*";
+    static constexpr auto * HTTP_PROXY = "http://proxy_server:3128";
+    static constexpr auto * HTTPS_PROXY = "https://proxy_server:3128";

-        if (!https_proxy.empty())
-        {
-            setenv("https_proxy", https_proxy.toString().c_str(), 1); // NOLINT(concurrency-mt-unsafe)
-        }
+    EnvironmentProxySetter()
+    {
+        setenv("http_proxy", HTTP_PROXY, 1); // NOLINT(concurrency-mt-unsafe)
+
+
setenv("https_proxy", HTTPS_PROXY, 1); // NOLINT(concurrency-mt-unsafe) + + // Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration + // since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once + // from the environment, the proxy configuration will always be there. + // The problem is that the proxy server does not exist, causing the test to fail. + // To work around this issue, `no_proxy` is set to bypass all domains. + setenv("no_proxy", NO_PROXY, 1); // NOLINT(concurrency-mt-unsafe) } ~EnvironmentProxySetter() { unsetenv("http_proxy"); // NOLINT(concurrency-mt-unsafe) unsetenv("https_proxy"); // NOLINT(concurrency-mt-unsafe) + unsetenv("no_proxy"); // NOLINT(concurrency-mt-unsafe) } }; diff --git a/src/Common/tests/gtest_poco_no_proxy_regex.cpp b/src/Common/tests/gtest_poco_no_proxy_regex.cpp new file mode 100644 index 00000000000..c3c1b512c08 --- /dev/null +++ b/src/Common/tests/gtest_poco_no_proxy_regex.cpp @@ -0,0 +1,24 @@ +#include + +#include + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuild) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts("localhost,127.0.0.1,some_other_domain:8080,sub-domain.domain.com"), + R"((?:.*\.)?localhost|(?:.*\.)?127\.0\.0\.1|(?:.*\.)?some_other_domain\:8080|(?:.*\.)?sub\-domain\.domain\.com)"); +} + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchAnything) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts("*"), + ".*"); +} + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildEmpty) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts(""), + ""); +} diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index d5d6f86f661..7bc48203998 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -1,6 +1,9 @@ #include #include +#include +#include +#include #include #include @@ -25,27 +28,19 @@ protected: DB::ContextMutablePtr ProxyConfigurationResolverProviderTests::context; -Poco::URI http_env_proxy_server = Poco::URI("http://http_environment_proxy:3128"); -Poco::URI https_env_proxy_server = Poco::URI("http://https_environment_proxy:3128"); - Poco::URI http_list_proxy_server = Poco::URI("http://http_list_proxy:3128"); Poco::URI https_list_proxy_server = Poco::URI("http://https_list_proxy:3128"); TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedIfNoSettings) { - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); + EnvironmentProxySetter setter; const auto & config = getContext().context->getConfigRef(); - auto http_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config)->resolve(); - auto https_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, config); - ASSERT_EQ(http_configuration.host, http_env_proxy_server.getHost()); - ASSERT_EQ(http_configuration.port, http_env_proxy_server.getPort()); - ASSERT_EQ(http_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); - - 
ASSERT_EQ(https_configuration.host, https_env_proxy_server.getHost()); - ASSERT_EQ(https_configuration.port, https_env_proxy_server.getPort()); - ASSERT_EQ(https_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) @@ -57,17 +52,11 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) config->setString("proxy.http.uri", http_list_proxy_server.toString()); context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - ASSERT_EQ(http_proxy_configuration.host, http_list_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - // No https configuration since it's not set - ASSERT_EQ(https_proxy_configuration.host, ""); - ASSERT_EQ(https_proxy_configuration.port, 0); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) @@ -79,18 +68,11 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) config->setString("proxy.https.uri", https_list_proxy_server.toString()); context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - ASSERT_EQ(http_proxy_configuration.host, ""); - ASSERT_EQ(http_proxy_configuration.port, 0); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - ASSERT_EQ(https_proxy_configuration.host, https_list_proxy_server.getHost()); - - // still HTTP because the proxy host is not HTTPS - ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); - ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) @@ -107,70 +89,15 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, 
*config); - ASSERT_EQ(http_proxy_configuration.host, http_list_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - ASSERT_EQ(https_proxy_configuration.host, https_list_proxy_server.getHost()); - - // still HTTP because the proxy host is not HTTPS - ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); - ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); -} - -TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTP) -{ - /* - * Since there is no way to call `ProxyConfigurationResolver::resolve` on remote resolver, - * it is hard to verify the remote resolver was actually picked. One hackish way to assert - * the remote resolver was OR was not picked based on the configuration, is to use the - * environment resolver. Since the environment resolver is always returned as a fallback, - * we can assert the remote resolver was not picked if `ProxyConfigurationResolver::resolve` - * succeeds and returns an environment proxy configuration. - * */ - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); - - ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); - - config->setString("proxy", ""); - config->setString("proxy.https", ""); - config->setString("proxy.https.resolver", ""); - config->setString("proxy.https.resolver.endpoint", "http://resolver:8080/hostname"); - - // even tho proxy protocol / scheme is http, it should not be picked (prior to this PR, it would be picked) - config->setString("proxy.https.resolver.proxy_scheme", "http"); - config->setString("proxy.https.resolver.proxy_port", "80"); - config->setString("proxy.https.resolver.proxy_cache_time", "10"); - - context->setConfig(config); - - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); - - /* - * Asserts env proxy is used and not the remote resolver. If the remote resolver is picked, it is an error because - * there is no `http` specification for remote resolver - * */ - ASSERT_EQ(http_proxy_configuration.host, http_env_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, http_env_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTPS) { - /* - * Since there is no way to call `ProxyConfigurationResolver::resolve` on remote resolver, - * it is hard to verify the remote resolver was actually picked. One hackish way to assert - * the remote resolver was OR was not picked based on the configuration, is to use the - * environment resolver. Since the environment resolver is always returned as a fallback, - * we can assert the remote resolver was not picked if `ProxyConfigurationResolver::resolve` - * succeeds and returns an environment proxy configuration. 
- * */ - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); - ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); @@ -185,27 +112,44 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - /* - * Asserts env proxy is used and not the remote resolver. If the remote resolver is picked, it is an error because - * there is no `http` specification for remote resolver - * */ - ASSERT_EQ(http_proxy_configuration.host, https_env_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, https_env_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } -// remote resolver is tricky to be tested in unit tests +TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverHTTPSOnly) +{ + ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); + + config->setString("proxy", ""); + config->setString("proxy.https", ""); + config->setString("proxy.https.resolver", ""); + config->setString("proxy.https.resolver.endpoint", "http://resolver:8080/hostname"); + + // even tho proxy protocol / scheme is http, it should not be picked (prior to this PR, it would be picked) + config->setString("proxy.https.resolver.proxy_scheme", "http"); + config->setString("proxy.https.resolver.proxy_port", "80"); + config->setString("proxy.https.resolver.proxy_cache_time", "10"); + + context->setConfig(config); + + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); + + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); +} template void test_tunneling(DB::ContextMutablePtr context) { - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); - ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); + config->setString("proxy.https", ""); + config->setString("proxy.https.uri", http_list_proxy_server.toString()); if constexpr (STRING) { @@ -230,4 +174,3 @@ TEST_F(ProxyConfigurationResolverProviderTests, TunnelingForHTTPSRequestsOverHTT test_tunneling(context); test_tunneling(context); } - diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index 377bef385f6..708c7194785 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -2,81 +2,38 @@ #include #include +#include #include namespace DB { -namespace +TEST(EnvironmentProxyConfigurationResolver, TestHTTPandHTTPS) { - auto http_proxy_server = Poco::URI("http://proxy_server:3128"); - auto https_proxy_server = Poco::URI("https://proxy_server:3128"); -} + const 
auto http_proxy_server = Poco::URI(EnvironmentProxySetter::HTTP_PROXY); + const auto https_proxy_server = Poco::URI(EnvironmentProxySetter::HTTPS_PROXY); -TEST(EnvironmentProxyConfigurationResolver, TestHTTP) -{ - EnvironmentProxySetter setter(http_proxy_server, {}); + std::string poco_no_proxy_regex = buildPocoNonProxyHosts(EnvironmentProxySetter::NO_PROXY); - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); + EnvironmentProxySetter setter; - auto configuration = resolver.resolve(); + EnvironmentProxyConfigurationResolver http_resolver(ProxyConfiguration::Protocol::HTTP); - ASSERT_EQ(configuration.host, http_proxy_server.getHost()); - ASSERT_EQ(configuration.port, http_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); -} + auto http_configuration = http_resolver.resolve(); -TEST(EnvironmentProxyConfigurationResolver, TestHTTPNoEnv) -{ - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); + ASSERT_EQ(http_configuration.host, http_proxy_server.getHost()); + ASSERT_EQ(http_configuration.port, http_proxy_server.getPort()); + ASSERT_EQ(http_configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); + ASSERT_EQ(http_configuration.no_proxy_hosts, poco_no_proxy_regex); - auto configuration = resolver.resolve(); + EnvironmentProxyConfigurationResolver https_resolver(ProxyConfiguration::Protocol::HTTPS); - ASSERT_EQ(configuration.host, ""); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP); - ASSERT_EQ(configuration.port, 0u); -} + auto https_configuration = https_resolver.resolve(); -TEST(EnvironmentProxyConfigurationResolver, TestHTTPs) -{ - EnvironmentProxySetter setter({}, https_proxy_server); - - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTPS); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, https_proxy_server.getHost()); - ASSERT_EQ(configuration.port, https_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(https_proxy_server.getScheme())); -} - -TEST(EnvironmentProxyConfigurationResolver, TestHTTPsNoEnv) -{ - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTPS); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, ""); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP); - ASSERT_EQ(configuration.port, 0u); -} - -TEST(EnvironmentProxyConfigurationResolver, TestHTTPsOverHTTPTunnelingDisabled) -{ - // use http proxy for https, this would use connect protocol by default - EnvironmentProxySetter setter({}, http_proxy_server); - - bool disable_tunneling_for_https_requests_over_http_proxy = true; - - EnvironmentProxyConfigurationResolver resolver( - ProxyConfiguration::Protocol::HTTPS, disable_tunneling_for_https_requests_over_http_proxy); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, http_proxy_server.getHost()); - ASSERT_EQ(configuration.port, http_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); - ASSERT_EQ(configuration.tunneling, false); + ASSERT_EQ(https_configuration.host, https_proxy_server.getHost()); + ASSERT_EQ(https_configuration.port, https_proxy_server.getPort()); + ASSERT_EQ(https_configuration.protocol, 
ProxyConfiguration::protocolFromString(https_proxy_server.getScheme())); + ASSERT_EQ(https_configuration.no_proxy_hosts, poco_no_proxy_regex); } } diff --git a/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp b/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp index 3234fe0ccd1..5d8268eb206 100644 --- a/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp +++ b/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp @@ -10,6 +10,8 @@ namespace { auto proxy_server1 = Poco::URI("http://proxy_server1:3128"); auto proxy_server2 = Poco::URI("http://proxy_server2:3128"); + + std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, sub-domain.domain.com,"; } TEST(ProxyListConfigurationResolver, SimpleTest) @@ -17,7 +19,8 @@ TEST(ProxyListConfigurationResolver, SimpleTest) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, - ProxyConfiguration::Protocol::HTTP); + ProxyConfiguration::Protocol::HTTP, + no_proxy_hosts); auto configuration1 = resolver.resolve(); auto configuration2 = resolver.resolve(); @@ -25,10 +28,12 @@ TEST(ProxyListConfigurationResolver, SimpleTest) ASSERT_EQ(configuration1.host, proxy_server1.getHost()); ASSERT_EQ(configuration1.port, proxy_server1.getPort()); ASSERT_EQ(configuration1.protocol, ProxyConfiguration::protocolFromString(proxy_server1.getScheme())); + ASSERT_EQ(configuration1.no_proxy_hosts, no_proxy_hosts); ASSERT_EQ(configuration2.host, proxy_server2.getHost()); ASSERT_EQ(configuration2.port, proxy_server2.getPort()); ASSERT_EQ(configuration2.protocol, ProxyConfiguration::protocolFromString(proxy_server2.getScheme())); + ASSERT_EQ(configuration2.no_proxy_hosts, no_proxy_hosts); } TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) @@ -36,7 +41,8 @@ TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, - ProxyConfiguration::Protocol::HTTPS); + ProxyConfiguration::Protocol::HTTPS, + ""); auto configuration1 = resolver.resolve(); auto configuration2 = resolver.resolve(); @@ -45,11 +51,12 @@ TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) ASSERT_EQ(configuration1.port, proxy_server1.getPort()); ASSERT_EQ(configuration1.protocol, ProxyConfiguration::protocolFromString(proxy_server1.getScheme())); ASSERT_EQ(configuration1.tunneling, true); + ASSERT_EQ(configuration1.no_proxy_hosts, ""); ASSERT_EQ(configuration2.host, proxy_server2.getHost()); ASSERT_EQ(configuration2.port, proxy_server2.getPort()); ASSERT_EQ(configuration2.protocol, ProxyConfiguration::protocolFromString(proxy_server2.getScheme())); - ASSERT_EQ(configuration1.tunneling, true); + ASSERT_EQ(configuration2.no_proxy_hosts, ""); } TEST(ProxyListConfigurationResolver, SimpleTestTunnelingDisabled) @@ -58,6 +65,7 @@ TEST(ProxyListConfigurationResolver, SimpleTestTunnelingDisabled) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, ProxyConfiguration::Protocol::HTTPS, + "", disable_tunneling_for_https_requests_over_http_proxy); auto configuration1 = resolver.resolve(); diff --git a/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp b/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp index 7068e0f2061..5489a931f24 100644 --- a/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp +++ b/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp @@ -42,6 +42,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPOverHTTP) RemoteProxyConfigurationResolver 
resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTP, + "", std::make_shared(proxy_server_mock) ); @@ -68,6 +69,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPS) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock) ); @@ -95,6 +97,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTP) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock) ); @@ -122,6 +125,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPNoTunneling) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock), true /* disable_tunneling_for_https_requests_over_http_proxy_ */ ); @@ -153,6 +157,7 @@ TEST(RemoteProxyConfigurationResolver, SimpleCacheTest) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTP, + "", fetcher_mock ); diff --git a/src/Common/tests/gtest_resolve_pool.cpp b/src/Common/tests/gtest_resolve_pool.cpp index 2391fc8bacf..b760b9b1524 100644 --- a/src/Common/tests/gtest_resolve_pool.cpp +++ b/src/Common/tests/gtest_resolve_pool.cpp @@ -1,12 +1,39 @@ #include -#include #include #include -#include "base/defines.h" +#include + +#include #include +#include #include -#include + + +using namespace std::literals::chrono_literals; + + +auto now() +{ + return std::chrono::steady_clock::now(); +} + +void sleep_until(auto time_point) +{ + std::this_thread::sleep_until(time_point); +} + +void sleep_for(auto duration) +{ + std::this_thread::sleep_for(duration); +} + +size_t toMilliseconds(auto duration) +{ + return std::chrono::duration_cast(duration).count(); +} + +const auto epsilon = 500us; class ResolvePoolMock : public DB::HostResolver { @@ -267,13 +294,14 @@ TEST_F(ResolvePoolTest, CanFailAndHeal) TEST_F(ResolvePoolTest, CanExpire) { - auto resolver = make_resolver(); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto expired_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*expired_addr)); - addresses.erase(*expired_addr); - sleepForSeconds(1); + + sleep_for(history + epsilon); for (size_t i = 0; i < 1000; ++i) { @@ -310,12 +338,19 @@ TEST_F(ResolvePoolTest, DuplicatesInAddresses) ASSERT_EQ(3, DB::CurrentThread::getProfileEvents()[metrics.discovered]); } -void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics) +void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics, auto deadline) { ASSERT_EQ(iteration, DB::CurrentThread::getProfileEvents()[metrics.failed]); for (size_t i = 0; i < 100; ++i) { auto next_addr = resolver->resolve(); + + if (now() > deadline) + { + ASSERT_NE(i, 0); + break; + } + ASSERT_TRUE(addresses.contains(*next_addr)); ASSERT_NE(*next_addr, *failed_addr); } @@ -323,52 +358,60 @@ void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses TEST_F(ResolvePoolTest, BannedForConsiquenceFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); - + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); 
ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(0, CurrentMetrics::get(metrics.banned_count)); failed_addr.setFail(); - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); // ip still banned adter history_ms + update, because it was his second consiquent fail - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); } TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); failed_addr.setFail(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(3, resolver, addresses, failed_addr, metrics); + check_no_failed_address(3, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); - sleepForMilliseconds(history_ms + 1); resolver->update(); // ip is cleared after just 1 history_ms interval. ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); @@ -377,8 +420,8 @@ TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) TEST_F(ResolvePoolTest, StillBannedAfterSuccess) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); @@ -395,11 +438,12 @@ TEST_F(ResolvePoolTest, StillBannedAfterSuccess) } chassert(again_addr); + auto start_at = now(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); again_addr = std::nullopt; // success; diff --git a/src/Common/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h index 5beec660801..afbdcf2df19 100644 --- a/src/Common/threadPoolCallbackRunner.h +++ b/src/Common/threadPoolCallbackRunner.h @@ -54,7 +54,6 @@ ThreadPoolCallbackRunnerUnsafe threadPoolCallbackRunnerUnsafe( auto future = task->get_future(); - /// ThreadPool is using "bigger is higher priority" instead of "smaller is more priority". /// Note: calling method scheduleOrThrowOnError in intentional, because we don't want to throw exceptions /// in critical places where this callback runner is used (e.g. 
loading or deletion of parts) my_pool->scheduleOrThrowOnError([my_task = std::move(task)]{ (*my_task)(); }, priority); @@ -163,7 +162,6 @@ public: task->future = task_func->get_future(); - /// ThreadPool is using "bigger is higher priority" instead of "smaller is more priority". /// Note: calling method scheduleOrThrowOnError in intentional, because we don't want to throw exceptions /// in critical places where this callback runner is used (e.g. loading or deletion of parts) pool.scheduleOrThrowOnError([my_task = std::move(task_func)]{ (*my_task)(); }, priority); diff --git a/src/Coordination/KeeperConstants.cpp b/src/Coordination/KeeperConstants.cpp index 8251dca3d1e..51bf037c1c9 100644 --- a/src/Coordination/KeeperConstants.cpp +++ b/src/Coordination/KeeperConstants.cpp @@ -258,7 +258,8 @@ M(KeeperExistsRequest) \ \ M(IOUringSQEsSubmitted) \ - M(IOUringSQEsResubmits) \ + M(IOUringSQEsResubmitsAsync) \ + M(IOUringSQEsResubmitsSync) \ M(IOUringCQEsCompleted) \ M(IOUringCQEsFailed) \ \ diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 9991bef7be5..66ac2be810e 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -64,7 +65,8 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo return; } - auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); + const auto & settings = Context::getGlobalContextInstance()->getSettingsRef(); + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); String endpoint = macros->expand(config.getString(config_prefix + ".endpoint")); auto new_uri = S3::URI{endpoint}; @@ -118,10 +120,10 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo std::move(headers), S3::CredentialsConfiguration { - auth_settings.use_environment_credentials.value_or(true), - auth_settings.use_insecure_imds_request.value_or(false), - auth_settings.expiration_window_seconds.value_or(S3::DEFAULT_EXPIRATION_WINDOW_SECONDS), - auth_settings.no_sign_request.value_or(false), + auth_settings.use_environment_credentials, + auth_settings.use_insecure_imds_request, + auth_settings.expiration_window_seconds, + auth_settings.no_sign_request, }, credentials.GetSessionToken()); @@ -154,7 +156,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh if (s3_client == nullptr) return; - S3Settings::RequestSettings request_settings_1; + S3::RequestSettings request_settings_1; const auto create_writer = [&](const auto & key) { @@ -197,7 +199,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh lock_writer.finalize(); // We read back the written UUID, if it's the same we can upload the file - S3Settings::RequestSettings request_settings_2; + S3::RequestSettings request_settings_2; request_settings_2.max_single_read_retries = 1; ReadBufferFromS3 lock_reader { diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index 2af8a015c2d..2017adcc58d 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include @@ -146,7 +146,7 @@ struct ContextSharedPart : boost::noncopyable mutable ThrottlerPtr local_read_throttler; /// A server-wide throttler for local IO reads mutable ThrottlerPtr local_write_throttler; /// A 
server-wide throttler for local IO writes - std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage + std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage mutable std::mutex keeper_dispatcher_mutex; mutable std::shared_ptr keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex); @@ -455,14 +455,14 @@ std::shared_ptr Context::getZooKeeper() const throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper"); } -const StorageS3Settings & Context::getStorageS3Settings() const +const S3SettingsByEndpoint & Context::getStorageS3Settings() const { std::lock_guard lock(shared->mutex); if (!shared->storage_s3_settings) { const auto & config = shared->config ? *shared->config : Poco::Util::Application::instance().config(); - shared->storage_s3_settings.emplace().loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings.emplace().loadFromConfig(config, "s3", getSettingsRef()); } return *shared->storage_s3_settings; diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 79a3e32a72d..d3bbfececed 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -37,7 +37,7 @@ class FilesystemCacheLog; class FilesystemReadPrefetchesLog; class BlobStorageLog; class IOUringReader; -class StorageS3Settings; +class S3SettingsByEndpoint; /// A small class which owns ContextShared. /// We don't use something like unique_ptr directly to allow ContextShared type to be incomplete. @@ -130,7 +130,8 @@ public: enum class ApplicationType : uint8_t { - KEEPER + KEEPER, + SERVER, }; void setApplicationType(ApplicationType) {} @@ -163,7 +164,7 @@ public: zkutil::ZooKeeperPtr getZooKeeper() const; - const StorageS3Settings & getStorageS3Settings() const; + const S3SettingsByEndpoint & getStorageS3Settings() const; const String & getUserName() const { static std::string user; return user; } diff --git a/src/Core/BaseSettings.h b/src/Core/BaseSettings.h index adf7a41193c..6242d78aee7 100644 --- a/src/Core/BaseSettings.h +++ b/src/Core/BaseSettings.h @@ -108,6 +108,7 @@ public: public: const String & getName() const; Field getValue() const; + void setValue(const Field & value); Field getDefaultValue() const; String getValueString() const; String getDefaultValueString() const; @@ -122,10 +123,10 @@ public: private: friend class BaseSettings; - const BaseSettings * settings; + BaseSettings * settings; const typename Traits::Accessor * accessor; size_t index; - std::conditional_t custom_setting; + std::conditional_t custom_setting; }; enum SkipFlags @@ -144,35 +145,50 @@ public: Iterator & operator++(); Iterator operator++(int); /// NOLINT const SettingFieldRef & operator *() const { return field_ref; } + SettingFieldRef & operator *() { return field_ref; } bool operator ==(const Iterator & other) const; bool operator !=(const Iterator & other) const { return !(*this == other); } private: friend class BaseSettings; - Iterator(const BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_); + Iterator(BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_); void doSkip(); void setPointerToCustomSetting(); SettingFieldRef field_ref; - std::conditional_t custom_settings_iterator; + std::conditional_t custom_settings_iterator; SkipFlags skip_flags; }; class Range { public: - Range(const BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), 
accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} + Range(BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} Iterator begin() const { return Iterator(settings, accessor, skip_flags); } Iterator end() const { return Iterator(settings, accessor, SKIP_ALL); } private: - const BaseSettings & settings; + BaseSettings & settings; const typename Traits::Accessor & accessor; SkipFlags skip_flags; }; - Range all(SkipFlags skip_flags = SKIP_NONE) const { return Range{*this, skip_flags}; } + class MutableRange + { + public: + MutableRange(BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} + Iterator begin() { return Iterator(settings, accessor, skip_flags); } + Iterator end() { return Iterator(settings, accessor, SKIP_ALL); } + + private: + BaseSettings & settings; + const typename Traits::Accessor & accessor; + SkipFlags skip_flags; + }; + + Range all(SkipFlags skip_flags = SKIP_NONE) const { return Range{const_cast &>(*this), skip_flags}; } + MutableRange allMutable(SkipFlags skip_flags = SKIP_NONE) { return MutableRange{*this, skip_flags}; } Range allChanged() const { return all(SKIP_UNCHANGED); } Range allUnchanged() const { return all(SKIP_CHANGED); } Range allBuiltin() const { return all(SKIP_CUSTOM); } @@ -608,7 +624,7 @@ const SettingFieldCustom * BaseSettings::tryGetCustomSetting(std::strin } template -BaseSettings::Iterator::Iterator(const BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_) +BaseSettings::Iterator::Iterator(BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_) : skip_flags(skip_flags_) { field_ref.settings = &settings_; @@ -741,6 +757,18 @@ Field BaseSettings::SettingFieldRef::getValue() const return accessor->getValue(*settings, index); } +template +void BaseSettings::SettingFieldRef::setValue(const Field & value) +{ + if constexpr (Traits::allow_custom_settings) + { + if (custom_setting) + custom_setting->second = value; + } + else + accessor->setValue(*settings, index, value); +} + template Field BaseSettings::SettingFieldRef::getDefaultValue() const { diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 45f235116ab..309becdd78f 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -97,11 +97,11 @@ namespace DB \ M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ - M(UInt64, max_table_num_to_warn, 5000lu, "If number of tables is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_view_num_to_warn, 10000lu, "If number of views is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_dictionary_num_to_warn, 1000lu, "If number of dictionaries is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_database_num_to_warn, 1000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_part_num_to_warn, 100000lu, "If number of databases is greater than this value, server will create a warning that will 
displayed to user.", 0) \ + M(UInt64, max_table_num_to_warn, 5000lu, "If the number of tables is greater than this value, the server will create a warning that will be displayed to user.", 0) \ + M(UInt64, max_view_num_to_warn, 10000lu, "If the number of views is greater than this value, the server will create a warning that will be displayed to user.", 0) \ + M(UInt64, max_dictionary_num_to_warn, 1000lu, "If the number of dictionaries is greater than this value, the server will create a warning that will be displayed to user.", 0) \ + M(UInt64, max_database_num_to_warn, 1000lu, "If the number of databases is greater than this value, the server will create a warning that will be displayed to user.", 0) \ + M(UInt64, max_part_num_to_warn, 100000lu, "If the number of parts is greater than this value, the server will create a warning that will be displayed to user.", 0) \ M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \ M(UInt64, concurrent_threads_soft_limit_ratio_to_cores, 0, "Same as concurrent_threads_soft_limit_num, but with ratio to cores.", 0) \ \ @@ -146,6 +146,7 @@ namespace DB M(UInt64, global_profiler_real_time_period_ns, 0, "Period for real clock timer of global profiler (in nanoseconds). Set 0 value to turn off the real clock global profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, global_profiler_cpu_time_period_ns, 0, "Period for CPU clock timer of global profiler (in nanoseconds). Set 0 value to turn off the CPU clock global profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, enable_azure_sdk_logging, false, "Enables logging from Azure sdk", 0) \ + M(Double, gwp_asan_force_sample_probability, 0, "Probability that an allocation from specific places will be sampled by GWP Asan (i.e. PODArray allocations)", 0) \ /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp diff --git a/src/Core/ServerUUID.cpp b/src/Core/ServerUUID.cpp index c2de6be7794..251b407e673 100644 --- a/src/Core/ServerUUID.cpp +++ b/src/Core/ServerUUID.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -11,6 +12,16 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_CREATE_FILE; + extern const int LOGICAL_ERROR; +} + +UUID ServerUUID::get() +{ + if (server_uuid == UUIDHelpers::Nil && + (Context::getGlobalContextInstance()->getApplicationType() == Context::ApplicationType::SERVER || + Context::getGlobalContextInstance()->getApplicationType() == Context::ApplicationType::KEEPER)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ServerUUID is not initialized yet"); + return server_uuid; } void ServerUUID::load(const fs::path & server_uuid_file, Poco::Logger * log) @@ -57,4 +68,9 @@ UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log) } } +void ServerUUID::setRandomForUnitTests() +{ + server_uuid = UUIDHelpers::generateV4(); +} + }
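As a side note on the ServerUUID change above: a minimal sketch of how a unit test might use the new setRandomForUnitTests() hook so that code under test calling ServerUUID::get() does not hit the new "not initialized yet" LOGICAL_ERROR path. The test name and include set are assumptions for illustration, not part of this patch.

#include <gtest/gtest.h>

#include <Core/ServerUUID.h>
#include <Core/UUID.h>

TEST(ServerUUIDSketch, GetAfterSeedingForUnitTests)
{
    // Seed a random, non-Nil UUID first; ServerUUID::get() now throws LOGICAL_ERROR
    // when the UUID is still Nil and the application type is SERVER or KEEPER.
    DB::ServerUUID::setRandomForUnitTests();

    ASSERT_NE(DB::ServerUUID::get(), DB::UUIDHelpers::Nil);
}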
diff --git a/src/Core/ServerUUID.h b/src/Core/ServerUUID.h index 71ae9edc00e..9c7f7d32acc 100644 --- a/src/Core/ServerUUID.h +++ b/src/Core/ServerUUID.h @@ -15,10 +15,12 @@ class ServerUUID public: /// Returns persistent UUID of current clickhouse-server or clickhouse-keeper instance. - static UUID get() { return server_uuid; } + static UUID get(); /// Loads server UUID from file or creates new one. Should be called on daemon startup. static void load(const fs::path & server_uuid_file, Poco::Logger * log); + + static void setRandomForUnitTests(); }; UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b3e83092a77..bda403b1b40 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -78,34 +79,36 @@ class IColumn; M(UInt64, idle_connection_timeout, 3600, "Close idle TCP connections after specified number of seconds.", 0) \ M(UInt64, distributed_connections_pool_size, 1024, "Maximum number of connections with one remote server in the pool.", 0) \ M(UInt64, connections_with_failover_max_tries, 3, "The maximum number of attempts to connect to replicas.", 0) \ - M(UInt64, s3_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ + M(UInt64, s3_strict_upload_part_size, S3::DEFAULT_STRICT_UPLOAD_PART_SIZE, "The exact size of part to upload during multipart upload to S3 (some implementations do not support variable size parts).", 0) \ M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_blocks_in_multipart_upload, 50000, "Maximum number of blocks in multipart upload for Azure.", 0) \ - M(UInt64, s3_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ - M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ + M(UInt64, s3_min_upload_part_size, S3::DEFAULT_MIN_UPLOAD_PART_SIZE, "The minimum size of part to upload during multipart upload to S3.", 0) \ + M(UInt64, s3_max_upload_part_size, S3::DEFAULT_MAX_UPLOAD_PART_SIZE, "The maximum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, azure_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage.", 0) \ - M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ - M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ + M(UInt64, s3_upload_part_size_multiply_factor, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ + M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ + M(UInt64, s3_max_part_number, S3::DEFAULT_MAX_PART_NUMBER, "Maximum part number for s3 upload part.", 0) \ + M(UInt64, s3_max_single_operation_copy_size, S3::DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "Maximum size for a single 
copy operation in s3", 0) \ M(UInt64, azure_upload_part_size_multiply_factor, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage.", 0) \ M(UInt64, azure_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor.", 0) \ - M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ + M(UInt64, s3_max_inflight_parts_for_one_file, S3::DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ M(UInt64, azure_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ - M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, s3_max_single_part_upload_size, S3::DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ - M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ + M(UInt64, s3_max_single_read_retries, S3::DEFAULT_MAX_SINGLE_READ_TRIES, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ - M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ - M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ - M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ + M(UInt64, s3_max_unexpected_write_error_retries, S3::DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ + M(UInt64, s3_max_redirects, S3::DEFAULT_MAX_REDIRECTS, "Max number of S3 redirects hops allowed.", 0) \ + M(UInt64, s3_max_connections, S3::DEFAULT_MAX_CONNECTIONS, "The maximum number of connections per server.", 0) \ M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. 
By default (0) equals to `s3_max_put_rps`", 0) \ - M(UInt64, s3_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ - M(Bool, s3_use_adaptive_timeouts, true, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ + M(UInt64, s3_list_object_keys_size, S3::DEFAULT_LIST_OBJECT_KEYS_SIZE, "Maximum number of files that could be returned in batch by ListObject request", 0) \ + M(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ M(UInt64, azure_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, azure_truncate_on_insert, false, "Enables or disables truncate before insert in azure engine tables.", 0) \ @@ -122,10 +125,10 @@ class IColumn; M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ - M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ - M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ - M(UInt64, s3_request_timeout_ms, 30000, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ - M(UInt64, s3_connect_timeout_ms, 1000, "Connection timeout for host from s3 disks.", 0) \ + M(Bool, s3_disable_checksum, S3::DEFAULT_DISABLE_CHECKSUM, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ + M(UInt64, s3_retry_attempts, S3::DEFAULT_RETRY_ATTEMPTS, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ + M(UInt64, s3_request_timeout_ms, S3::DEFAULT_REQUEST_TIMEOUT_MS, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ + M(UInt64, s3_connect_timeout_ms, S3::DEFAULT_CONNECT_TIMEOUT_MS, "Connection timeout for host from s3 disks.", 0) \ M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. 
Makes sense for debug only.", 0) \ M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \ M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \ @@ -398,7 +401,7 @@ class IColumn; M(Bool, allow_experimental_analyzer, true, "Allow experimental analyzer.", 0) \ M(Bool, analyzer_compatibility_join_using_top_level_identifier, false, "Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`).", 0) \ M(Bool, prefer_global_in_and_join, false, "If enabled, all IN/JOIN operators will be rewritten as GLOBAL IN/JOIN. It's useful when the to-be-joined tables are only available on the initiator and we need to always scatter their data on-the-fly during distributed processing with the GLOBAL keyword. It's also useful to reduce the need to access the external sources joining external tables.", 0) \ - M(Bool, enable_vertical_final, false, "Not recommended. If enable, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows", 0) \ + M(Bool, enable_vertical_final, true, "If enabled, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows", 0) \ \ \ /** Limits during query execution are part of the settings. \ @@ -1009,6 +1012,8 @@ class IColumn; M(Char, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. If setting with a string, a string has to have a length of 1.", 0) \ M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ + M(Bool, output_format_csv_serialize_tuple_into_separate_columns, true, "If it is set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ + M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, "If it is set to true, then separate columns written in CSV format can be deserialized into a Tuple column.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ @@ -1047,6 +1052,7 @@ class IColumn; M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \ M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \ + M(Bool, input_format_csv_try_infer_strings_from_quoted_tuples, true, "Interpret quoted tuples in the input data as a value of type String.", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header 
with names and types in CSV format", 0) \ M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 69bc8c5d207..c04f7a498af 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -96,6 +96,8 @@ static const std::map #include #include +#include #include #include @@ -156,6 +157,12 @@ static void signalHandler(int sig, siginfo_t * info, void * context) const ucontext_t * signal_context = reinterpret_cast(context); const StackTrace stack_trace(*signal_context); +#if USE_GWP_ASAN + if (const auto fault_address = reinterpret_cast(info->si_addr); + GWPAsan::isGWPAsanError(fault_address)) + GWPAsan::printReport(fault_address); +#endif + writeBinary(sig, out); writePODBinary(*info, out); writePODBinary(signal_context, out); diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp index 101a408a039..7a5227ca752 100644 --- a/src/DataTypes/Serializations/SerializationTuple.cpp +++ b/src/DataTypes/Serializations/SerializationTuple.cpp @@ -531,26 +531,98 @@ void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const { - WriteBufferFromOwnString wb; - serializeText(column, row_num, wb, settings); - writeCSV(wb.str(), ostr); + if (settings.csv.serialize_tuple_into_separate_columns) + { + for (size_t i = 0; i < elems.size(); ++i) + { + if (i != 0) + writeChar(settings.csv.tuple_delimiter, ostr); + elems[i]->serializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings); + } + } + else + { + WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); + } } void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - deserializeText(column, rb, settings, true); + if (settings.csv.deserialize_separate_columns_into_tuple) + { + addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + assertChar(settings.csv.tuple_delimiter, istr); + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]); + else + elems[i]->deserializeTextCSV(element_column, istr, settings); + } + return true; + }); + } + else + { + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + deserializeText(column, rb, settings, true); + } } bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - String s; - if (!tryReadCSV(s, istr, settings.csv)) - return false; - ReadBufferFromString rb(s); - return tryDeserializeText(column, rb, settings, true); + if (settings.csv.deserialize_separate_columns_into_tuple) + { + return addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + if 
(!checkChar(settings.csv.tuple_delimiter, istr)) + return false; + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + { + if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i])) + return false; + } + else + { + if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings)) + return false; + } + } + + return true; + }); + } + else + { + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + return tryDeserializeText(column, rb, settings, true); + } } struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 2842e2b8799..1816324a93b 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -511,7 +511,10 @@ MutableColumns CacheDictionary::aggregateColumns( if (default_mask) { if (key_state_from_storage.isDefault()) + { (*default_mask)[key_index] = 1; + aggregated_column->insertDefault(); + } else { (*default_mask)[key_index] = 0; diff --git a/src/Disks/IO/IOUringReader.cpp b/src/Disks/IO/IOUringReader.cpp index 6b0e3f8cc89..b0e783e11d9 100644 --- a/src/Disks/IO/IOUringReader.cpp +++ b/src/Disks/IO/IOUringReader.cpp @@ -22,7 +22,8 @@ namespace ProfileEvents extern const Event AsynchronousReaderIgnoredBytes; extern const Event IOUringSQEsSubmitted; - extern const Event IOUringSQEsResubmits; + extern const Event IOUringSQEsResubmitsAsync; + extern const Event IOUringSQEsResubmitsSync; extern const Event IOUringCQEsCompleted; extern const Event IOUringCQEsFailed; } @@ -149,10 +150,12 @@ int IOUringReader::submitToRing(EnqueuedRequest & enqueued) io_uring_prep_read(sqe, fd, request.buf, static_cast(request.size - enqueued.bytes_read), request.offset + enqueued.bytes_read); int ret = 0; - do + ret = io_uring_submit(&ring); + while (ret == -EINTR || ret == -EAGAIN) { + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsSync); ret = io_uring_submit(&ring); - } while (ret == -EINTR || ret == -EAGAIN); + } if (ret > 0 && !enqueued.resubmitting) { @@ -266,7 +269,7 @@ void IOUringReader::monitorRing() if (cqe->res == -EAGAIN || cqe->res == -EINTR) { enqueued.resubmitting = true; - ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmits); + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) @@ -310,6 +313,7 @@ void IOUringReader::monitorRing() // potential short read, re-submit enqueued.resubmitting = true; enqueued.bytes_read += bytes_read; + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 8210255decb..1bf8250adff 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -172,6 +172,14 @@ void checkS3Capabilities( } } +static std::string getEndpoint( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const ContextPtr & context) +{ + return context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); +} + void registerS3ObjectStorage(ObjectStorageFactory & factory) { static constexpr auto disk_type = "s3"; @@ -185,8 +193,9 @@ void 
registerS3ObjectStorage(ObjectStorageFactory & factory) { auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = createObjectStorage( @@ -221,8 +230,9 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = std::make_shared>( @@ -255,8 +265,9 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto metadata_storage_metrics = DB::MetadataStorageMetrics::create(); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index afc13251f5b..63e11dcd8c8 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -168,7 +168,7 @@ private: bool S3ObjectStorage::exists(const StoredObject & object) const { auto settings_ptr = s3_settings.get(); - return S3::objectExists(*client.get(), uri.bucket, object.remote_path, {}, settings_ptr->request_settings); + return S3::objectExists(*client.get(), uri.bucket, object.remote_path, {}); } std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT @@ -258,13 +258,15 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN if (mode != WriteMode::Rewrite) throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3 doesn't support append to files"); - S3Settings::RequestSettings request_settings = s3_settings.get()->request_settings; + S3::RequestSettings request_settings = s3_settings.get()->request_settings; /// NOTE: For background operations settings are not propagated from session or query. They are taken from /// default user's .xml config. It's obscure and unclear behavior. For them it's always better /// to rely on settings from disk. 
- if (auto query_context = CurrentThread::getQueryContext(); query_context && !query_context->isBackgroundOperationContext()) + if (auto query_context = CurrentThread::getQueryContext(); + query_context && !query_context->isBackgroundOperationContext()) { - request_settings.updateFromSettingsIfChanged(query_context->getSettingsRef()); + const auto & settings = query_context->getSettingsRef(); + request_settings.updateFromSettings(settings, /* if_changed */true, settings.s3_validate_request_settings); } ThreadPoolCallbackRunnerUnsafe scheduler; @@ -444,8 +446,7 @@ std::optional S3ObjectStorage::tryGetObjectMetadata(const std::s { auto settings_ptr = s3_settings.get(); auto object_info = S3::getObjectInfo( - *client.get(), uri.bucket, path, {}, settings_ptr->request_settings, - /* with_metadata= */ true, /* throw_on_error= */ false); + *client.get(), uri.bucket, path, {}, /* with_metadata= */ true, /* throw_on_error= */ false); if (object_info.size == 0 && object_info.last_modification_time == 0 && object_info.metadata.empty()) return {}; @@ -464,7 +465,7 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons S3::ObjectInfo object_info; try { - object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true); + object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, /* with_metadata= */ true); } catch (DB::Exception & e) { @@ -493,7 +494,7 @@ void S3ObjectStorage::copyObjectToAnotherObjectStorage( // NOLINT { auto current_client = dest_s3->client.get(); auto settings_ptr = s3_settings.get(); - auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}, settings_ptr->request_settings); + auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}); auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "S3ObjStor_copy"); try @@ -537,7 +538,7 @@ void S3ObjectStorage::copyObject( // NOLINT { auto current_client = client.get(); auto settings_ptr = s3_settings.get(); - auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}, settings_ptr->request_settings); + auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}); auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "S3ObjStor_copy"); copyS3File( @@ -582,19 +583,22 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto settings_from_config = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); + auto settings_from_config = getSettings(config, config_prefix, context, uri.uri_str, context->getSettingsRef().s3_validate_request_settings); auto modified_settings = std::make_unique(*s3_settings.get()); - modified_settings->auth_settings.updateFrom(settings_from_config->auth_settings); - modified_settings->request_settings = settings_from_config->request_settings; + modified_settings->auth_settings.updateIfChanged(settings_from_config->auth_settings); + modified_settings->request_settings.updateIfChanged(settings_from_config->request_settings); if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) - modified_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + { + modified_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); + 
modified_settings->request_settings.updateIfChanged(endpoint_settings->request_settings); + } auto current_settings = s3_settings.get(); if (options.allow_client_change && (current_settings->auth_settings.hasUpdates(modified_settings->auth_settings) || for_disk_s3)) { - auto new_client = getClient(config, config_prefix, context, *modified_settings, for_disk_s3, &uri); + auto new_client = getClient(uri, *modified_settings, context, for_disk_s3); client.set(std::move(new_client)); } s3_settings.set(std::move(modified_settings)); @@ -606,8 +610,9 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings, true); + const auto & settings = context->getSettingsRef(); + auto new_s3_settings = getSettings(config, config_prefix, context, uri.uri_str, settings.s3_validate_request_settings); + auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); auto new_uri{uri}; new_uri.bucket = new_namespace; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 6eacf3a1eee..7446a1f6fc8 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -20,7 +20,7 @@ struct S3ObjectStorageSettings S3ObjectStorageSettings() = default; S3ObjectStorageSettings( - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const S3::AuthSettings & auth_settings_, uint64_t min_bytes_for_seek_, int32_t list_object_keys_size_, @@ -34,7 +34,7 @@ struct S3ObjectStorageSettings , read_only(read_only_) {} - S3Settings::RequestSettings request_settings; + S3::RequestSettings request_settings; S3::AuthSettings auth_settings; uint64_t min_bytes_for_seek; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 139472a8b01..62df98f51e6 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -18,18 +19,12 @@ #include #include -#include +#include #include #include -#include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace ErrorCodes { extern const int NO_ELEMENTS_IN_CONFIG; @@ -39,11 +34,16 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, + const std::string & endpoint, bool validate_settings) { - const Settings & settings = context->getSettingsRef(); - auto request_settings = S3Settings::RequestSettings(config, config_prefix, settings, "s3_", validate_settings); - auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); + const auto & settings = context->getSettingsRef(); + + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); + auto request_settings = S3::RequestSettings(config, settings, config_prefix, "s3_", validate_settings); + + request_settings.proxy_resolver = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( + ProxyConfiguration::protocolFromString(S3::URI(endpoint).uri.getScheme()), config_prefix, config); return std::make_unique( request_settings, @@ -55,38 +55,33 @@ std::unique_ptr getSettings( } std::unique_ptr 
getClient( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - ContextPtr context, + const std::string & endpoint, const S3ObjectStorageSettings & settings, - bool for_disk_s3, - const S3::URI * url_) + ContextPtr context, + bool for_disk_s3) +{ + auto url = S3::URI(endpoint); + if (!url.key.ends_with('/')) + url.key.push_back('/'); + return getClient(url, settings, context, for_disk_s3); +} + +std::unique_ptr getClient( + const S3::URI & url, + const S3ObjectStorageSettings & settings, + ContextPtr context, + bool for_disk_s3) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); - const Settings & local_settings = context->getSettingsRef(); - const auto & auth_settings = settings.auth_settings; const auto & request_settings = settings.request_settings; - S3::URI url; - if (for_disk_s3) - { - String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - url = S3::URI(endpoint); - if (!url.key.ends_with('/')) - url.key.push_back('/'); - } - else - { - if (!url_) - throw Exception(ErrorCodes::LOGICAL_ERROR, "URL not passed"); - url = *url_; - } const bool is_s3_express_bucket = S3::isS3ExpressEndpoint(url.endpoint); - if (is_s3_express_bucket && !config.has(config_prefix + ".region")) + if (is_s3_express_bucket && auth_settings.region.value.empty()) { throw Exception( - ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets ({})", config_prefix); + ErrorCodes::NO_ELEMENTS_IN_CONFIG, + "Region should be explicitly specified for directory buckets"); } S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( @@ -96,49 +91,40 @@ std::unique_ptr getClient( static_cast(global_settings.s3_retry_attempts), global_settings.enable_s3_requests_logging, for_disk_s3, - settings.request_settings.get_request_throttler, - settings.request_settings.put_request_throttler, + request_settings.get_request_throttler, + request_settings.put_request_throttler, url.uri.getScheme()); - client_configuration.connectTimeoutMs = config.getUInt64(config_prefix + ".connect_timeout_ms", local_settings.s3_connect_timeout_ms.value); - client_configuration.requestTimeoutMs = config.getUInt64(config_prefix + ".request_timeout_ms", local_settings.s3_request_timeout_ms.value); - client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", static_cast(request_settings.max_connections)); - client_configuration.http_keep_alive_timeout = config.getUInt(config_prefix + ".http_keep_alive_timeout", S3::DEFAULT_KEEP_ALIVE_TIMEOUT); - client_configuration.http_keep_alive_max_requests = config.getUInt(config_prefix + ".http_keep_alive_max_requests", S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); + client_configuration.connectTimeoutMs = auth_settings.connect_timeout_ms; + client_configuration.requestTimeoutMs = auth_settings.request_timeout_ms; + client_configuration.maxConnections = static_cast(auth_settings.max_connections); + client_configuration.http_keep_alive_timeout = auth_settings.http_keep_alive_timeout; + client_configuration.http_keep_alive_max_requests = auth_settings.http_keep_alive_max_requests; client_configuration.endpointOverride = url.endpoint; - client_configuration.s3_use_adaptive_timeouts = config.getBool( - config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); + client_configuration.s3_use_adaptive_timeouts = auth_settings.use_adaptive_timeouts; - if (for_disk_s3) + if 
(request_settings.proxy_resolver) { /* * Override proxy configuration for backwards compatibility with old configuration format. * */ - if (auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( - ProxyConfiguration::protocolFromString(url.uri.getScheme()), config_prefix, config)) - { - client_configuration.per_request_configuration - = [proxy_config]() { return proxy_config->resolve(); }; - client_configuration.error_report - = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; - } + client_configuration.per_request_configuration = [=]() { return request_settings.proxy_resolver->resolve(); }; + client_configuration.error_report = [=](const auto & request_config) { request_settings.proxy_resolver->errorReport(request_config); }; } - S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config); S3::ClientSettings client_settings{ .use_virtual_addressing = url.is_virtual_hosted_style, - .disable_checksum = local_settings.s3_disable_checksum, - .gcs_issue_compose_request = config.getBool("s3.gcs_issue_compose_request", false), - .is_s3express_bucket = is_s3_express_bucket, + .disable_checksum = auth_settings.disable_checksum, + .gcs_issue_compose_request = auth_settings.gcs_issue_compose_request, }; auto credentials_configuration = S3::CredentialsConfiguration { - auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), - auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - auth_settings.expiration_window_seconds.value_or(context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), + auth_settings.use_environment_credentials, + auth_settings.use_insecure_imds_request, + auth_settings.expiration_window_seconds, + auth_settings.no_sign_request, }; return S3::ClientFactory::instance().create( @@ -147,7 +133,7 @@ std::unique_ptr getClient( auth_settings.access_key_id, auth_settings.secret_access_key, auth_settings.server_side_encryption_customer_key_base64, - std::move(sse_kms_config), + auth_settings.server_side_encryption_kms_config, auth_settings.headers, credentials_configuration, auth_settings.session_token); diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 11ac64ce913..aa427bee41a 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -18,15 +18,20 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, - bool validate_settings = true); + const std::string & endpoint, + bool validate_settings); std::unique_ptr getClient( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - ContextPtr context, + const std::string & endpoint, const S3ObjectStorageSettings & settings, - bool for_disk_s3, - const S3::URI * url_ = nullptr); + ContextPtr context, + bool for_disk_s3); + +std::unique_ptr getClient( + const S3::URI & url_, + const S3ObjectStorageSettings & settings, + ContextPtr context, + bool for_disk_s3); } diff --git a/src/Disks/getOrCreateDiskFromAST.cpp b/src/Disks/getOrCreateDiskFromAST.cpp index 7b2762613b6..fd43f31a009 100644 --- a/src/Disks/getOrCreateDiskFromAST.cpp +++ 
b/src/Disks/getOrCreateDiskFromAST.cpp @@ -47,7 +47,7 @@ namespace auto result_disk = context->getOrCreateDisk(disk_name, [&](const DisksMap & disks_map) -> DiskPtr { auto disk = DiskFactory::instance().create( - disk_name, *config, "", context, disks_map, /* attach */attach, /* custom_disk */true); + disk_name, *config, /* config_path */"", context, disks_map, /* attach */attach, /* custom_disk */true); /// Mark that disk can be used without storage policy. disk->markDiskAsCustom(); return disk; diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 9577ca2a8df..36d16d8d154 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -303,7 +303,7 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet auto type = tryInferDataTypeForSingleField(data, format_settings); /// If we couldn't infer any type or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string. - if (!type || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings)) + if (!type || (format_settings.csv.try_infer_strings_from_quoted_tuples && isTuple(type)) || (!format_settings.csv.try_infer_numbers_from_strings && isNumber(type))) return std::make_shared(); return type; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index e90986f2236..e9a405aa796 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -77,6 +77,8 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file; format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; + format_settings.csv.serialize_tuple_into_separate_columns = settings.output_format_csv_serialize_tuple_into_separate_columns; + format_settings.csv.deserialize_separate_columns_into_tuple = settings.input_format_csv_deserialize_separate_columns_into_tuple; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; @@ -94,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns; format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values; format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings; + format_settings.csv.try_infer_strings_from_quoted_tuples = settings.input_format_csv_try_infer_strings_from_quoted_tuples; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 337aafbbe9c..421ed4d112d 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -153,6 +153,8 @@ struct FormatSettings char delimiter = ','; bool allow_single_quotes = true; bool allow_double_quotes = true; + bool serialize_tuple_into_separate_columns = true; + bool 
deserialize_separate_columns_into_tuple = true; bool empty_as_default = false; bool crlf_end_of_line = false; bool allow_cr_end_of_line = false; @@ -170,6 +172,7 @@ struct FormatSettings bool allow_variable_number_of_columns = false; bool use_default_on_bad_values = false; bool try_infer_numbers_from_strings = true; + bool try_infer_strings_from_quoted_tuples = true; } csv{}; struct HiveText diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 6cbcae2bebe..31faea2e13e 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -879,11 +879,11 @@ namespace } template - bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) + bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional) { if (is_json || settings.try_infer_exponent_floats) - return tryReadFloatText(value, buf); - return tryReadFloatTextNoExponent(value, buf); + return tryReadFloatTextExt(value, buf, has_fractional); + return tryReadFloatTextExtNoExponent(value, buf, has_fractional); } template @@ -893,46 +893,31 @@ namespace return nullptr; Float64 tmp_float; + bool has_fractional; if (settings.try_infer_integers) { /// If we read from String, we can do it in a more efficient way. if (auto * string_buf = dynamic_cast(&buf)) { /// Remember the pointer to the start of the number to rollback to it. - char * number_start = buf.position(); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_int ? std::make_shared() : nullptr; - - char * int_end = buf.position(); /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. - buf.position() = number_start; + char * number_start = buf.position(); - bool read_uint = false; - char * uint_end = nullptr; - /// In case of Int64 overflow we can try to infer UInt64. - if (!read_int) - { - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_uint ? std::make_shared() : nullptr; - - uint_end = buf.position(); - buf.position() = number_start; - } - - if (tryReadFloat(tmp_float, buf, settings)) - { - if (read_int && buf.position() == int_end) - return std::make_shared(); - if (read_uint && buf.position() == uint_end) - return std::make_shared(); + /// NOTE: it may break parsing of tryReadFloat() != tryReadIntText() + parsing of '.'/'e' + /// But, for now it is true + if (tryReadFloat(tmp_float, buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + + Int64 tmp_int; + buf.position() = number_start; + if (tryReadIntText(tmp_int, buf)) + return std::make_shared(); + + /// In case of Int64 overflow we can try to infer UInt64. + UInt64 tmp_uint; + buf.position() = number_start; + if (tryReadIntText(tmp_uint, buf)) + return std::make_shared(); return nullptr; } @@ -942,36 +927,22 @@ namespace /// and then as float. PeekableReadBuffer peekable_buf(buf); PeekableReadBufferCheckpoint checkpoint(peekable_buf); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, peekable_buf); - auto * int_end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(true); - bool read_uint = false; - char * uint_end = nullptr; - /// In case of Int64 overflow we can try to infer UInt64. 
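An illustrative, hedged sketch of the inference order used in the rewritten number-inference branch above: Float64 is inferred only when a fractional part or exponent was actually consumed, otherwise the code rolls back and tries Int64, then UInt64 on overflow. The helper name and the string-based parsing are assumptions for illustration; the real code operates on ReadBuffer/PeekableReadBuffer.

#include <cerrno>
#include <cstdlib>
#include <optional>
#include <string>

enum class InferredNumberType { Int64Type, UInt64Type, Float64Type };

/// Sketch only: mirrors the order above. Float64 is chosen only when a fractional
/// part or exponent was seen; otherwise fall back to Int64, then UInt64 on overflow.
static std::optional<InferredNumberType> inferNumberType(const std::string & s)
{
    if (s.empty())
        return std::nullopt;

    const char * begin = s.c_str();
    char * end = nullptr;

    errno = 0;
    std::strtod(begin, &end);
    const bool parsed_float = (errno == 0 && end == begin + s.size());
    const bool has_fractional = s.find_first_of(".eE") != std::string::npos;
    if (parsed_float && has_fractional)
        return InferredNumberType::Float64Type;

    errno = 0;
    std::strtoll(begin, &end, 10);
    if (errno == 0 && end == begin + s.size())
        return InferredNumberType::Int64Type;

    errno = 0;
    std::strtoull(begin, &end, 10);
    if (errno == 0 && end == begin + s.size())
        return InferredNumberType::UInt64Type;

    return std::nullopt;
}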
- if (!read_int) - { - PeekableReadBufferCheckpoint new_checkpoint(peekable_buf); - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, peekable_buf); - uint_end = peekable_buf.position(); - peekable_buf.rollbackToCheckpoint(true); - } - - if (tryReadFloat(tmp_float, peekable_buf, settings)) - { - /// Float parsing reads no fewer bytes than integer parsing, - /// so position of the buffer is either the same, or further. - /// If it's the same, then it's integer. - if (read_int && peekable_buf.position() == int_end) - return std::make_shared(); - if (read_uint && peekable_buf.position() == uint_end) - return std::make_shared(); + if (tryReadFloat(tmp_float, peekable_buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + peekable_buf.rollbackToCheckpoint(/* drop= */ false); + + Int64 tmp_int; + if (tryReadIntText(tmp_int, peekable_buf)) + return std::make_shared(); + peekable_buf.rollbackToCheckpoint(/* drop= */ true); + + /// In case of Int64 overflow we can try to infer UInt64. + UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, peekable_buf)) + return std::make_shared(); } - else if (tryReadFloat(tmp_float, buf, settings)) + else if (tryReadFloat(tmp_float, buf, settings, has_fractional)) { return std::make_shared(); } @@ -1004,7 +975,8 @@ namespace buf.position() = buf.buffer().begin(); Float64 tmp; - if (tryReadFloat(tmp, buf, settings) && buf.eof()) + bool has_fractional; + if (tryReadFloat(tmp, buf, settings, has_fractional) && buf.eof()) return std::make_shared(); return nullptr; diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 3906563a254..05914be3837 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -12,7 +12,7 @@ # include # include -# include +# include namespace DB { @@ -22,36 +22,125 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } +enum class Base64Variant : uint8_t +{ + Normal, + Url +}; + +inline std::string preprocessBase64Url(std::string_view src) +{ + std::string padded_src; + padded_src.reserve(src.size() + 3); + + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#section-5 + for (auto s : src) + { + switch (s) + { + case '_': + padded_src += '/'; + break; + case '-': + padded_src += '+'; + break; + default: + padded_src += s; + break; + } + } + + /// Insert padding to please aklomp library + size_t remainder = src.size() % 4; + switch (remainder) + { + case 0: + break; // no padding needed + case 1: + padded_src.append("==="); // this case is impossible to occur with valid base64-URL encoded input, however, we'll insert padding anyway + break; + case 2: + padded_src.append("=="); // two bytes padding + break; + default: // remainder == 3 + padded_src.append("="); // one byte padding + break; + } + + return padded_src; +} + +inline size_t postprocessBase64Url(UInt8 * dst, size_t out_len) +{ + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#section-5 + for (size_t i = 0; i < out_len; ++i) + { + switch (dst[i]) + { + case '/': + dst[i] = '_'; + break; + case '+': + dst[i] = '-'; + break; + case '=': // stop when padding is detected + return i; + default: + break; + } + } + return out_len; +} + +template struct Base64Encode { - static constexpr auto name = "base64Encode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? 
"base64Encode" : "base64UrlEncode"; static size_t getBufferSize(size_t string_length, size_t string_count) { return ((string_length - string_count) / 3 + string_count) * 4 + string_count; } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { size_t outlen = 0; - base64_encode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + base64_encode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); + + /// Base64 library is using AVX-512 with some shuffle operations. + /// Memory sanitizer doesn't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. + __msan_unpoison(dst, outlen); + + if constexpr (variant == Base64Variant::Url) + outlen = postprocessBase64Url(dst, outlen); + return outlen; } }; +template struct Base64Decode { - static constexpr auto name = "base64Decode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Decode" : "base64UrlDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { return ((string_length - string_count) / 4 + string_count) * 3 + string_count; } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { + int rc; size_t outlen = 0; - int rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + if constexpr (variant == Base64Variant::Url) + { + std::string src_padded = preprocessBase64Url(src); + rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + } + else + { + rc = base64_decode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); + } if (rc != 1) throw Exception( @@ -64,19 +153,29 @@ struct Base64Decode } }; +template struct TryBase64Decode { - static constexpr auto name = "tryBase64Decode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "tryBase64Decode" : "tryBase64UrlDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { - return Base64Decode::getBufferSize(string_length, string_count); + return Base64Decode::getBufferSize(string_length, string_count); } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { + int rc; size_t outlen = 0; - int rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + if constexpr (variant == Base64Variant::Url) + { + std::string src_padded = preprocessBase64Url(src); + rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + } + else + { + rc = base64_decode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); + } if (rc != 1) outlen = 0; @@ -139,7 +238,7 @@ private: auto * dst = dst_chars.data(); auto * dst_pos = dst; - const auto * src = src_chars.data(); + const auto * src = reinterpret_cast(src_chars.data()); size_t src_offset_prev = 0; for (size_t row = 0; row < src_row_count; ++row) @@ -147,10 +246,6 @@ private: const size_t src_length = src_offsets[row] - src_offset_prev - 1; const size_t outlen = Func::perform({src, src_length}, dst_pos); - /// Base64 library is using AVX-512 with some shuffle operations. - /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. 
- __msan_unpoison(dst_pos, outlen); - src += src_length + 1; dst_pos += outlen; *dst_pos = '\0'; @@ -179,16 +274,12 @@ private: auto * dst = dst_chars.data(); auto * dst_pos = dst; - const auto * src = src_chars.data(); + const auto * src = reinterpret_cast(src_chars.data()); for (size_t row = 0; row < src_row_count; ++row) { const auto outlen = Func::perform({src, src_n}, dst_pos); - /// Base64 library is using AVX-512 with some shuffle operations. - /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. - __msan_unpoison(dst_pos, outlen); - src += src_n; dst_pos += outlen; *dst_pos = '\0'; diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index 5f7a3406c62..50278c4b0b2 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -7,7 +7,14 @@ namespace DB { REGISTER_FUNCTION(Base64Decode) { - factory.registerFunction>(); + FunctionDocumentation::Description description = R"(Accepts a String and decodes it from base64, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-4). Throws an exception in case of an error. Alias: FROM_BASE64.)"; + FunctionDocumentation::Syntax syntax = "base64Decode(encoded)"; + FunctionDocumentation::Arguments arguments = {{"encoded", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64Decode('Y2xpY2tob3VzZQ==')", "clickhouse"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); /// MySQL compatibility alias. factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index 69268f5a25d..d6e63c38a4c 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -7,7 +7,14 @@ namespace DB { REGISTER_FUNCTION(Base64Encode) { - factory.registerFunction>(); + FunctionDocumentation::Description description = R"(Encodes a String as base64, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-4). Alias: TO_BASE64.)"; + FunctionDocumentation::Syntax syntax = "base64Encode(plaintext)"; + FunctionDocumentation::Arguments arguments = {{"plaintext", "String column or constant."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the encoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64Encode('clickhouse')", "Y2xpY2tob3VzZQ=="}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); /// MySQL compatibility alias. 
factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64UrlDecode.cpp new file mode 100644 index 00000000000..59975d8f9d1 --- /dev/null +++ b/src/Functions/base64UrlDecode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlDecode) +{ + FunctionDocumentation::Description description = R"(Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; + FunctionDocumentation::Syntax syntax = "base64UrlDecode(encodedUrl)"; + FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')", "https://clickhouse.com"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64UrlEncode.cpp new file mode 100644 index 00000000000..05d50170c14 --- /dev/null +++ b/src/Functions/base64UrlEncode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlEncode) +{ + FunctionDocumentation::Description description = R"(Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; + FunctionDocumentation::Syntax syntax = "base64UrlEncode(url)"; + FunctionDocumentation::Arguments arguments = {{"url", "String column or constant."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the encoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlEncode('https://clickhouse.com')", "aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp index bd452b8357b..08eabe93200 100644 --- a/src/Functions/tryBase64Decode.cpp +++ b/src/Functions/tryBase64Decode.cpp @@ -7,7 +7,14 @@ namespace DB { REGISTER_FUNCTION(TryBase64Decode) { - factory.registerFunction>(); + FunctionDocumentation::Description description = R"(Decodes a String or FixedString from base64, like base64Decode but returns an empty string in case of an error.)"; + FunctionDocumentation::Syntax syntax = "tryBase64Decode(encoded)"; + FunctionDocumentation::Arguments arguments = {{"encoded", "String column or constant. 
If the string is not a valid Base64-encoded value, returns an empty string."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64Decode('Y2xpY2tob3VzZQ==')", "clickhouse"}, {"invalid", "SELECT tryBase64Decode('invalid')", ""}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); } } diff --git a/src/Functions/tryBase64UrlDecode.cpp b/src/Functions/tryBase64UrlDecode.cpp new file mode 100644 index 00000000000..b9aaf4f9273 --- /dev/null +++ b/src/Functions/tryBase64UrlDecode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(TryBase64UrlDecode) +{ + FunctionDocumentation::Description description = R"(Decodes an URL from base64, like base64UrlDecode but returns an empty string in case of an error.)"; + FunctionDocumentation::Syntax syntax = "tryBase64UrlDecode(encodedUrl)"; + FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ')", "https://clickhouse.com"}, {"invalid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja')", ""}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 8823af55936..9e001232e65 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -51,7 +51,7 @@ ReadBufferFromS3::ReadBufferFromS3( const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer_, size_t offset_, @@ -318,7 +318,7 @@ size_t ReadBufferFromS3::getFileSize() if (file_size) return *file_size; - auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, request_settings); + auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id); file_size = object_size; return *file_size; diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 003c88df7d2..c6625c2d632 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include "config.h" #if USE_AWS_S3 @@ -28,7 +28,7 @@ private: String bucket; String key; String version_id; - const S3Settings::RequestSettings request_settings; + const S3::RequestSettings request_settings; /// These variables are atomic because they can be used for `logging only` /// (where it is not important to get consistent result) @@ -47,7 +47,7 @@ public: const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer = false, size_t offset_ = 0, diff --git a/src/IO/S3/Credentials.h b/src/IO/S3/Credentials.h index 8d586223035..b8698d9b302 100644 
--- a/src/IO/S3/Credentials.h +++ b/src/IO/S3/Credentials.h @@ -13,18 +13,12 @@ # include # include +# include namespace DB::S3 { -inline static constexpr uint64_t DEFAULT_EXPIRATION_WINDOW_SECONDS = 120; -inline static constexpr uint64_t DEFAULT_CONNECT_TIMEOUT_MS = 1000; -inline static constexpr uint64_t DEFAULT_REQUEST_TIMEOUT_MS = 30000; -inline static constexpr uint64_t DEFAULT_MAX_CONNECTIONS = 100; -inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_TIMEOUT = 5; -inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_MAX_REQUESTS = 100; - /// In GCP metadata service can be accessed via DNS regardless of IPv4 or IPv6. static inline constexpr char GCP_METADATA_SERVICE_ENDPOINT[] = "http://metadata.google.internal"; diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index d3968d883e8..bb654c3f5c9 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -56,7 +56,7 @@ namespace const std::shared_ptr & client_ptr_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, bool for_disk_s3_, @@ -66,7 +66,6 @@ namespace , dest_bucket(dest_bucket_) , dest_key(dest_key_) , request_settings(request_settings_) - , upload_settings(request_settings.getUploadSettings()) , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_s3(for_disk_s3_) @@ -81,8 +80,7 @@ namespace std::shared_ptr client_ptr; const String & dest_bucket; const String & dest_key; - const S3Settings::RequestSettings & request_settings; - const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const S3::RequestSettings & request_settings; const std::optional> & object_metadata; ThreadPoolCallbackRunnerUnsafe schedule; bool for_disk_s3; @@ -127,8 +125,8 @@ namespace if (object_metadata.has_value()) request.SetMetadata(object_metadata.value()); - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); client_ptr->setKMSHeaders(request); @@ -187,7 +185,7 @@ namespace request.SetMultipartUpload(multipart_upload); - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -241,7 +239,7 @@ namespace void checkObjectAfterUpload() { LOG_TRACE(log, "Checking object {} exists after upload", dest_key); - S3::checkObjectExists(*client_ptr, dest_bucket, dest_key, {}, request_settings, "Immediately after upload"); + S3::checkObjectExists(*client_ptr, dest_bucket, dest_key, {}, "Immediately after upload"); LOG_TRACE(log, "Object {} exists after upload", dest_key); } @@ -292,9 +290,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); - auto max_part_number = upload_settings.max_part_number; - auto min_upload_part_size = upload_settings.min_upload_part_size; - auto max_upload_part_size = upload_settings.max_upload_part_size; + auto max_part_number = request_settings.max_part_number; + auto min_upload_part_size = request_settings.min_upload_part_size; + auto max_upload_part_size = request_settings.max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); @@ -467,7 +465,7 @@ namespace const std::shared_ptr & client_ptr_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, bool for_disk_s3_, @@ -481,7 +479,7 @@ namespace void performCopy() { - if (size <= upload_settings.max_single_part_upload_size) + if (size <= request_settings.max_single_part_upload_size) performSinglepartUpload(); else performMultipartUpload(); @@ -514,8 +512,8 @@ namespace if (object_metadata.has_value()) request.SetMetadata(object_metadata.value()); - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 @@ -526,7 +524,7 @@ namespace void processPutRequest(S3::PutObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3PutObject); @@ -649,7 +647,7 @@ namespace size_t src_size_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & read_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, @@ -679,7 +677,7 @@ namespace void performCopy() { LOG_TEST(log, "Copy object {} to {} using native copy", src_key, dest_key); - if (!supports_multipart_copy || size <= upload_settings.max_single_operation_copy_size) + if (!supports_multipart_copy || size <= request_settings.max_single_operation_copy_size) performSingleOperationCopy(); else performMultipartUploadCopy(); @@ -716,8 +714,8 @@ namespace request.SetMetadataDirective(Aws::S3::Model::MetadataDirective::REPLACE); } - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 @@ -728,7 +726,7 @@ namespace void processCopyRequest(S3::CopyObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for 
(size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CopyObject); @@ -852,7 +850,7 @@ void copyDataToS3File( const std::shared_ptr & dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, ThreadPoolCallbackRunnerUnsafe schedule, @@ -883,7 +881,7 @@ void copyS3File( std::shared_ptr dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, const ReadSettings & read_settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index 85b3870ddbf..c33f55cb21b 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -4,7 +4,7 @@ #if USE_AWS_S3 -#include +#include #include #include #include @@ -39,7 +39,7 @@ void copyS3File( std::shared_ptr dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, const ReadSettings & read_settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, @@ -58,7 +58,7 @@ void copyDataToS3File( const std::shared_ptr & dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index 78efda4ae57..9271ad820e4 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -44,7 +44,7 @@ namespace /// Performs a request to get the size and last modification time of an object. 
std::pair, Aws::S3::S3Error> tryGetObjectInfo( const S3::Client & client, const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & /*request_settings*/, bool with_metadata) + bool with_metadata) { auto outcome = headObject(client, bucket, key, version_id); if (!outcome.IsSuccess()) @@ -73,11 +73,10 @@ ObjectInfo getObjectInfo( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, bool with_metadata, bool throw_on_error) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, with_metadata); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, with_metadata); if (object_info) { return *object_info; @@ -96,20 +95,18 @@ size_t getObjectSize( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, bool throw_on_error) { - return getObjectInfo(client, bucket, key, version_id, request_settings, {}, throw_on_error).size; + return getObjectInfo(client, bucket, key, version_id, {}, throw_on_error).size; } bool objectExists( const S3::Client & client, const String & bucket, const String & key, - const String & version_id, - const S3Settings::RequestSettings & request_settings) + const String & version_id) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, {}); if (object_info) return true; @@ -126,10 +123,9 @@ void checkObjectExists( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, std::string_view description) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, {}); if (object_info) return; throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}", diff --git a/src/IO/S3/getObjectInfo.h b/src/IO/S3/getObjectInfo.h index ac8072a4338..32f34f74069 100644 --- a/src/IO/S3/getObjectInfo.h +++ b/src/IO/S3/getObjectInfo.h @@ -3,7 +3,7 @@ #include "config.h" #if USE_AWS_S3 -#include +#include #include #include @@ -24,7 +24,6 @@ ObjectInfo getObjectInfo( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, bool with_metadata = false, bool throw_on_error = true); @@ -33,15 +32,13 @@ size_t getObjectSize( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, bool throw_on_error = true); bool objectExists( const S3::Client & client, const String & bucket, const String & key, - const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}); + const String & version_id = {}); /// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message. 
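A hedged usage sketch of the simplified helpers declared above, after request_settings was dropped from their signatures; the probing function itself is hypothetical, only the S3 helpers come from the declarations in this header.

#include <IO/S3/getObjectInfo.h>

namespace DB
{

/// Sketch only: object lookups no longer need a RequestSettings argument.
size_t probeObjectSize(const S3::Client & client, const String & bucket, const String & key)
{
    if (!S3::objectExists(client, bucket, key))      /// version_id defaults to {}
        return 0;

    /// Optional sanity check; `description` is prepended to the error message on failure.
    S3::checkObjectExists(client, bucket, key, /* version_id */ {}, "Probe: ");

    return S3::getObjectSize(client, bucket, key);   /// throws on error by default
}

}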
void checkObjectExists( @@ -49,7 +46,6 @@ void checkObjectExists( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, std::string_view description = {}); bool isNotFoundError(Aws::S3::S3Errors error); diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 0a28c578f69..5ee9648a44e 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "TestPocoHTTPServer.h" @@ -69,7 +69,7 @@ void doReadRequest(std::shared_ptr client, const DB::S3::U UInt64 max_single_read_retries = 1; DB::ReadSettings read_settings; - DB::S3Settings::RequestSettings request_settings; + DB::S3::RequestSettings request_settings; request_settings.max_single_read_retries = max_single_read_retries; DB::ReadBufferFromS3 read_buffer( client, @@ -88,7 +88,7 @@ void doWriteRequest(std::shared_ptr client, const DB::S3:: { UInt64 max_unexpected_write_error_retries = 1; - DB::S3Settings::RequestSettings request_settings; + DB::S3::RequestSettings request_settings; request_settings.max_unexpected_write_error_retries = max_unexpected_write_error_retries; DB::WriteBufferFromS3 write_buffer( client, diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 78c51fcb29c..490bf8c2d0c 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -2,17 +2,19 @@ #include #include +#include +#include +#include +#include #include #include "config.h" #if USE_AWS_S3 -# include -# include -# include -# include -# include +#include +#include +#include namespace ProfileEvents @@ -58,6 +60,8 @@ namespace DB namespace ErrorCodes { extern const int INVALID_CONFIG_PARAMETER; + extern const int BAD_ARGUMENTS; + extern const int INVALID_SETTING_VALUE; } namespace S3 @@ -98,104 +102,320 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c return sse_kms_config; } -AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config) +template +static bool setValueFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & path, + typename Settings::SettingFieldRef & field) { - auto access_key_id = config.getString(config_elem + ".access_key_id", ""); - auto secret_access_key = config.getString(config_elem + ".secret_access_key", ""); - auto session_token = config.getString(config_elem + ".session_token", ""); + if (!config.has(path)) + return false; - auto region = config.getString(config_elem + ".region", ""); - auto server_side_encryption_customer_key_base64 = config.getString(config_elem + ".server_side_encryption_customer_key_base64", ""); + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); - std::optional use_environment_credentials; - if (config.has(config_elem + ".use_environment_credentials")) - use_environment_credentials = config.getBool(config_elem + ".use_environment_credentials"); + return true; +} - std::optional use_insecure_imds_request; - if (config.has(config_elem + ".use_insecure_imds_request")) - use_insecure_imds_request = 
config.getBool(config_elem + ".use_insecure_imds_request"); +AuthSettings::AuthSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix) +{ + for (auto & field : allMutable()) + { + auto path = fmt::format("{}.{}", config_prefix, field.getName()); - std::optional expiration_window_seconds; - if (config.has(config_elem + ".expiration_window_seconds")) - expiration_window_seconds = config.getUInt64(config_elem + ".expiration_window_seconds"); + bool updated = setValueFromConfig(config, path, field); + if (!updated) + { + auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && settings.isChanged(setting_name)) + field.setValue(settings.get(setting_name)); + } + } - std::optional no_sign_request; - if (config.has(config_elem + ".no_sign_request")) - no_sign_request = config.getBool(config_elem + ".no_sign_request"); + headers = getHTTPHeaders(config_prefix, config); + server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); - HTTPHeaderEntries headers = getHTTPHeaders(config_elem, config); - ServerSideEncryptionKMSConfig sse_kms_config = getSSEKMSConfig(config_elem, config); - - std::unordered_set users; Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_elem, keys); + config.keys(config_prefix, keys); for (const auto & key : keys) { if (startsWith(key, "user")) - users.insert(config.getString(config_elem + "." + key)); + users.insert(config.getString(config_prefix + "." + key)); } - - return AuthSettings - { - std::move(access_key_id), std::move(secret_access_key), std::move(session_token), - std::move(region), - std::move(server_side_encryption_customer_key_base64), - std::move(sse_kms_config), - std::move(headers), - use_environment_credentials, - use_insecure_imds_request, - expiration_window_seconds, - no_sign_request, - std::move(users) - }; } -bool AuthSettings::canBeUsedByUser(const String & user) const +AuthSettings::AuthSettings(const DB::Settings & settings) { - return users.empty() || users.contains(user); + updateFromSettings(settings, /* if_changed */false); +} + +void AuthSettings::updateFromSettings(const DB::Settings & settings, bool if_changed) +{ + for (auto & field : allMutable()) + { + const auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && (!if_changed || settings.isChanged(setting_name))) + { + field.setValue(settings.get(setting_name)); + } + } } bool AuthSettings::hasUpdates(const AuthSettings & other) const { AuthSettings copy = *this; - copy.updateFrom(other); + copy.updateIfChanged(other); return *this != copy; } -void AuthSettings::updateFrom(const AuthSettings & from) +void AuthSettings::updateIfChanged(const AuthSettings & settings) { - /// Update with check for emptyness only parameters which - /// can be passed not only from config, but via ast. 
+ for (auto & setting : settings.all()) + { + if (setting.isValueChanged()) + set(setting.getName(), setting.getValue()); + } - if (!from.access_key_id.empty()) - access_key_id = from.access_key_id; - if (!from.secret_access_key.empty()) - secret_access_key = from.secret_access_key; - if (!from.session_token.empty()) - session_token = from.session_token; + if (!settings.headers.empty()) + headers = settings.headers; - if (!from.headers.empty()) - headers = from.headers; - if (!from.region.empty()) - region = from.region; + if (!settings.users.empty()) + users.insert(settings.users.begin(), settings.users.end()); - server_side_encryption_customer_key_base64 = from.server_side_encryption_customer_key_base64; - server_side_encryption_kms_config = from.server_side_encryption_kms_config; - - if (from.use_environment_credentials.has_value()) - use_environment_credentials = from.use_environment_credentials; - - if (from.use_insecure_imds_request.has_value()) - use_insecure_imds_request = from.use_insecure_imds_request; - - if (from.expiration_window_seconds.has_value()) - expiration_window_seconds = from.expiration_window_seconds; - - if (from.no_sign_request.has_value()) - no_sign_request = from.no_sign_request; - - users.insert(from.users.begin(), from.users.end()); + if (settings.server_side_encryption_kms_config.key_id.has_value() + || settings.server_side_encryption_kms_config.encryption_context.has_value() + || settings.server_side_encryption_kms_config.key_id.has_value()) + server_side_encryption_kms_config = settings.server_side_encryption_kms_config; } +RequestSettings::RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix, + const std::string & setting_name_prefix, + bool validate_settings) +{ + for (auto & field : allMutable()) + { + auto path = fmt::format("{}.{}{}", config_prefix, setting_name_prefix, field.getName()); + + bool updated = setValueFromConfig(config, path, field); + if (!updated) + { + auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && settings.isChanged(setting_name)) + field.setValue(settings.get(setting_name)); + } + } + finishInit(settings, validate_settings); +} + +RequestSettings::RequestSettings( + const NamedCollection & collection, + const DB::Settings & settings, + bool validate_settings) +{ + auto values = allMutable(); + for (auto & field : values) + { + const auto path = field.getName(); + if (collection.has(path)) + { + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(collection.get(path)); + else if (which == Field::Types::String) + field.setValue(collection.get(path)); + else if (which == Field::Types::Bool) + field.setValue(collection.get(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + } + } + finishInit(settings, validate_settings); +} + +RequestSettings::RequestSettings(const DB::Settings & settings, bool validate_settings) +{ + updateFromSettings(settings, /* if_changed */false, validate_settings); + finishInit(settings, validate_settings); +} + +void RequestSettings::updateFromSettings( + const DB::Settings & settings, bool if_changed, bool validate_settings) +{ + for (auto & field : allMutable()) + { + const auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && (!if_changed || settings.isChanged(setting_name))) + { + set(field.getName(), settings.get(setting_name)); + } + } + + normalizeSettings(); + if 
(validate_settings) + validateUploadSettings(); +} + +void RequestSettings::updateIfChanged(const RequestSettings & settings) +{ + for (auto & setting : settings.all()) + { + if (setting.isValueChanged()) + set(setting.getName(), setting.getValue()); + } +} + +void RequestSettings::normalizeSettings() +{ + if (!storage_class_name.value.empty() && storage_class_name.changed) + storage_class_name = Poco::toUpperInPlace(storage_class_name.value); +} + +void RequestSettings::finishInit(const DB::Settings & settings, bool validate_settings) +{ + normalizeSettings(); + if (validate_settings) + validateUploadSettings(); + + /// NOTE: it would be better to reuse old throttlers + /// to avoid losing token bucket state on every config reload, + /// which could lead to exceeding limit for short time. + /// But it is good enough unless very high `burst` values are used. + if (UInt64 max_get_rps = isChanged("max_get_rps") ? get("max_get_rps").get() : settings.s3_max_get_rps) + { + size_t default_max_get_burst = settings.s3_max_get_burst + ? settings.s3_max_get_burst + : (Throttler::default_burst_seconds * max_get_rps); + + size_t max_get_burst = isChanged("max_get_burst") ? get("max_get_burst").get() : default_max_get_burst; + get_request_throttler = std::make_shared(max_get_rps, max_get_burst); + } + if (UInt64 max_put_rps = isChanged("max_put_rps") ? get("max_put_rps").get() : settings.s3_max_put_rps) + { + size_t default_max_put_burst = settings.s3_max_put_burst + ? settings.s3_max_put_burst + : (Throttler::default_burst_seconds * max_put_rps); + size_t max_put_burst = isChanged("max_put_burst") ? get("max_put_burst").get() : default_max_put_burst; + put_request_throttler = std::make_shared(max_put_rps, max_put_burst); + } +} + +void RequestSettings::validateUploadSettings() +{ + static constexpr size_t min_upload_part_size_limit = 5 * 1024 * 1024; + if (strict_upload_part_size && strict_upload_part_size < min_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting strict_upload_part_size has invalid value {} which is less than the s3 API limit {}", + ReadableSize(strict_upload_part_size), ReadableSize(min_upload_part_size_limit)); + + if (min_upload_part_size < min_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting min_upload_part_size has invalid value {} which is less than the s3 API limit {}", + ReadableSize(min_upload_part_size), ReadableSize(min_upload_part_size_limit)); + + static constexpr size_t max_upload_part_size_limit = 5ull * 1024 * 1024 * 1024; + if (max_upload_part_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_upload_part_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_upload_part_size), ReadableSize(max_upload_part_size_limit)); + + if (max_single_part_upload_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_single_part_upload_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_single_part_upload_size), ReadableSize(max_upload_part_size_limit)); + + if (max_single_operation_copy_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_single_operation_copy_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_single_operation_copy_size), ReadableSize(max_upload_part_size_limit)); + + if (max_upload_part_size < min_upload_part_size) + 
throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_upload_part_size ({}) can't be less than setting min_upload_part_size {}", + ReadableSize(max_upload_part_size), ReadableSize(min_upload_part_size)); + + if (!upload_part_size_multiply_factor) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_factor cannot be zero"); + + if (!upload_part_size_multiply_parts_count_threshold) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_parts_count_threshold cannot be zero"); + + if (!max_part_number) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_part_number cannot be zero"); + + static constexpr size_t max_part_number_limit = 10000; + if (max_part_number > max_part_number_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_part_number has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); + + size_t maybe_overflow; + if (common::mulOverflow(max_upload_part_size.value, upload_part_size_multiply_factor.value, maybe_overflow)) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_factor is too big ({}). " + "Multiplication to max_upload_part_size ({}) will cause integer overflow", + ReadableSize(upload_part_size_multiply_factor), ReadableSize(max_upload_part_size)); + + std::unordered_set storage_class_names {"STANDARD", "INTELLIGENT_TIERING"}; + if (!storage_class_name.value.empty() && !storage_class_names.contains(storage_class_name)) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting storage_class has invalid value {} which only supports STANDARD and INTELLIGENT_TIERING", + storage_class_name.value); + + /// TODO: it's possible to set too small limits. + /// We can check that max possible object size is not too small. +} + +bool operator==(const AuthSettings & left, const AuthSettings & right) +{ + if (left.headers != right.headers) + return false; + + if (left.users != right.users) + return false; + + if (left.server_side_encryption_kms_config != right.server_side_encryption_kms_config) + return false; + + auto l = left.begin(); + for (const auto & r : right) + { + if ((l == left.end()) || (*l != r)) + return false; + ++l; + } + return l == left.end(); } } + +IMPLEMENT_SETTINGS_TRAITS(S3::AuthSettingsTraits, CLIENT_SETTINGS_LIST) +IMPLEMENT_SETTINGS_TRAITS(S3::RequestSettingsTraits, REQUEST_SETTINGS_LIST) + +} diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index b3e01bd6132..2dca08871d3 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -3,22 +3,22 @@ #include #include #include - -#include -#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include "config.h" #if USE_AWS_S3 -#include -#include -#include - #include #include - #include #include @@ -30,8 +30,6 @@ namespace ErrorCodes extern const int S3_ERROR; } -class RemoteHostFilter; - class S3Exception : public Exception { public: @@ -68,40 +66,140 @@ namespace Poco::Util class AbstractConfiguration; }; -namespace DB::S3 +namespace DB { +class NamedCollection; +struct ProxyConfigurationResolver; + +namespace S3 +{ +/// We use s3 settings for DiskS3, StorageS3 (StorageS3Cluster, S3Queue, etc), BackupIO_S3, etc. +/// 1. For DiskS3 we usually have configuration in disk section in configuration file. 
+/// REQUEST_SETTINGS, PART_UPLOAD_SETTINGS start with "s3_" prefix there, while AUTH_SETTINGS and CLIENT_SETTINGS do not +/// (does not make sense, but it happened this way). +/// If some setting is absent from disk configuration, we look up for it in the "s3." server config section, +/// where s3 settings no longer have "s3_" prefix like in disk configuration section. +/// If the settings is absent there as well, we look up for it in Users config (where query/session settings are also updated). +/// 2. For StorageS3 and similar - we look up to "s3." config section (again - settings there do not have "s3_" prefix). +/// If some setting is absent from there, we lool up for it in Users config. + +#define AUTH_SETTINGS(M, ALIAS) \ + M(String, access_key_id, "", "", 0) \ + M(String, secret_access_key, "", "", 0) \ + M(String, session_token, "", "", 0) \ + M(String, region, "", "", 0) \ + M(String, server_side_encryption_customer_key_base64, "", "", 0) \ + +#define CLIENT_SETTINGS(M, ALIAS) \ + M(UInt64, connect_timeout_ms, DEFAULT_CONNECT_TIMEOUT_MS, "", 0) \ + M(UInt64, request_timeout_ms, DEFAULT_REQUEST_TIMEOUT_MS, "", 0) \ + M(UInt64, max_connections, DEFAULT_MAX_CONNECTIONS, "", 0) \ + M(UInt64, http_keep_alive_timeout, DEFAULT_KEEP_ALIVE_TIMEOUT, "", 0) \ + M(UInt64, http_keep_alive_max_requests, DEFAULT_KEEP_ALIVE_MAX_REQUESTS, "", 0) \ + M(UInt64, expiration_window_seconds, DEFAULT_EXPIRATION_WINDOW_SECONDS, "", 0) \ + M(Bool, use_environment_credentials, DEFAULT_USE_ENVIRONMENT_CREDENTIALS, "", 0) \ + M(Bool, no_sign_request, DEFAULT_NO_SIGN_REQUEST, "", 0) \ + M(Bool, use_insecure_imds_request, false, "", 0) \ + M(Bool, use_adaptive_timeouts, DEFAULT_USE_ADAPTIVE_TIMEOUTS, "", 0) \ + M(Bool, is_virtual_hosted_style, false, "", 0) \ + M(Bool, disable_checksum, DEFAULT_DISABLE_CHECKSUM, "", 0) \ + M(Bool, gcs_issue_compose_request, false, "", 0) \ + +#define REQUEST_SETTINGS(M, ALIAS) \ + M(UInt64, max_single_read_retries, 4, "", 0) \ + M(UInt64, request_timeout_ms, DEFAULT_REQUEST_TIMEOUT_MS, "", 0) \ + M(UInt64, list_object_keys_size, DEFAULT_LIST_OBJECT_KEYS_SIZE, "", 0) \ + M(Bool, allow_native_copy, DEFAULT_ALLOW_NATIVE_COPY, "", 0) \ + M(Bool, check_objects_after_upload, DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD, "", 0) \ + M(Bool, throw_on_zero_files_match, false, "", 0) \ + M(UInt64, max_single_operation_copy_size, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "", 0) \ + M(String, storage_class_name, "", "", 0) \ + +#define PART_UPLOAD_SETTINGS(M, ALIAS) \ + M(UInt64, strict_upload_part_size, 0, "", 0) \ + M(UInt64, min_upload_part_size, DEFAULT_MIN_UPLOAD_PART_SIZE, "", 0) \ + M(UInt64, max_upload_part_size, DEFAULT_MAX_UPLOAD_PART_SIZE, "", 0) \ + M(UInt64, upload_part_size_multiply_factor, DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, "", 0) \ + M(UInt64, upload_part_size_multiply_parts_count_threshold, DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, "", 0) \ + M(UInt64, max_inflight_parts_for_one_file, DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, "", 0) \ + M(UInt64, max_part_number, DEFAULT_MAX_PART_NUMBER, "", 0) \ + M(UInt64, max_single_part_upload_size, DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "", 0) \ + M(UInt64, max_unexpected_write_error_retries, 4, "", 0) \ + +#define CLIENT_SETTINGS_LIST(M, ALIAS) \ + CLIENT_SETTINGS(M, ALIAS) \ + AUTH_SETTINGS(M, ALIAS) + +#define REQUEST_SETTINGS_LIST(M, ALIAS) \ + REQUEST_SETTINGS(M, ALIAS) \ + PART_UPLOAD_SETTINGS(M, ALIAS) + +DECLARE_SETTINGS_TRAITS(AuthSettingsTraits, CLIENT_SETTINGS_LIST) +DECLARE_SETTINGS_TRAITS(RequestSettingsTraits, 
REQUEST_SETTINGS_LIST) + +struct AuthSettings : public BaseSettings +{ + AuthSettings() = default; + + AuthSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix); + + explicit AuthSettings(const DB::Settings & settings); + + explicit AuthSettings(const DB::NamedCollection & collection); + + void updateFromSettings(const DB::Settings & settings, bool if_changed); + bool hasUpdates(const AuthSettings & other) const; + void updateIfChanged(const AuthSettings & settings); + bool canBeUsedByUser(const String & user) const { return users.empty() || users.contains(user); } + + HTTPHeaderEntries headers; + std::unordered_set users; + ServerSideEncryptionKMSConfig server_side_encryption_kms_config; + /// Note: if you add any field, do not forget to update operator ==. +}; + +bool operator==(const AuthSettings & left, const AuthSettings & right); + +struct RequestSettings : public BaseSettings +{ + RequestSettings() = default; + + /// Create request settings from Config. + RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix, + const std::string & setting_name_prefix = "", + bool validate_settings = true); + + /// Create request settings from DB::Settings. + explicit RequestSettings(const DB::Settings & settings, bool validate_settings = true); + + /// Create request settings from NamedCollection. + RequestSettings( + const NamedCollection & collection, + const DB::Settings & settings, + bool validate_settings = true); + + void updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings = true); + void updateIfChanged(const RequestSettings & settings); + void validateUploadSettings(); + + ThrottlerPtr get_request_throttler; + ThrottlerPtr put_request_throttler; + std::shared_ptr proxy_resolver; + +private: + void finishInit(const DB::Settings & settings, bool validate_settings); + void normalizeSettings(); +}; HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); -struct AuthSettings -{ - static AuthSettings loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); - - std::string access_key_id; - std::string secret_access_key; - std::string session_token; - std::string region; - std::string server_side_encryption_customer_key_base64; - ServerSideEncryptionKMSConfig server_side_encryption_kms_config; - - HTTPHeaderEntries headers; - - std::optional use_environment_credentials; - std::optional use_insecure_imds_request; - std::optional expiration_window_seconds; - std::optional no_sign_request; - - std::unordered_set users; - - bool hasUpdates(const AuthSettings & other) const; - void updateFrom(const AuthSettings & from); - - bool canBeUsedByUser(const String & user) const; - -private: - bool operator==(const AuthSettings & other) const = default; -}; - +} } diff --git a/src/IO/S3Defines.h b/src/IO/S3Defines.h new file mode 100644 index 00000000000..332ebcfea92 --- /dev/null +++ b/src/IO/S3Defines.h @@ -0,0 +1,41 @@ +#pragma once +#include + +namespace DB::S3 +{ + +/// Client settings. 
+inline static constexpr uint64_t DEFAULT_EXPIRATION_WINDOW_SECONDS = 120; +inline static constexpr uint64_t DEFAULT_CONNECT_TIMEOUT_MS = 1000; +inline static constexpr uint64_t DEFAULT_REQUEST_TIMEOUT_MS = 30000; +inline static constexpr uint64_t DEFAULT_MAX_CONNECTIONS = 1024; +inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_TIMEOUT = 5; +inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_MAX_REQUESTS = 100; + +inline static constexpr bool DEFAULT_USE_ENVIRONMENT_CREDENTIALS = true; +inline static constexpr bool DEFAULT_NO_SIGN_REQUEST = false; +inline static constexpr bool DEFAULT_DISABLE_CHECKSUM = false; +inline static constexpr bool DEFAULT_USE_ADAPTIVE_TIMEOUTS = true; + +/// Upload settings. +inline static constexpr uint64_t DEFAULT_MIN_UPLOAD_PART_SIZE = 16 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_UPLOAD_PART_SIZE = 5ull * 1024 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE = 32 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_STRICT_UPLOAD_PART_SIZE = 0; +inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR = 2; +inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD = 500; +inline static constexpr uint64_t DEFAULT_MAX_PART_NUMBER = 10000; + +/// Other settings. +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 32 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE = 20; +inline static constexpr uint64_t DEFAULT_LIST_OBJECT_KEYS_SIZE = 1000; +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_READ_TRIES = 4; +inline static constexpr uint64_t DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES = 4; +inline static constexpr uint64_t DEFAULT_MAX_REDIRECTS = 10; +inline static constexpr uint64_t DEFAULT_RETRY_ATTEMPTS = 100; + +inline static constexpr bool DEFAULT_ALLOW_NATIVE_COPY = true; +inline static constexpr bool DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD = false; + +} diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp new file mode 100644 index 00000000000..a5a50c873cb --- /dev/null +++ b/src/IO/S3Settings.cpp @@ -0,0 +1,80 @@ +#include + +#include +#include +#include + + +namespace DB +{ + +void S3Settings::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings) +{ + auth_settings = S3::AuthSettings(config, settings, config_prefix); + request_settings = S3::RequestSettings(config, settings, config_prefix); +} + +void S3Settings::updateIfChanged(const S3Settings & settings) +{ + auth_settings.updateIfChanged(settings.auth_settings); + request_settings.updateIfChanged(settings.request_settings); +} + +void S3SettingsByEndpoint::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings) +{ + std::lock_guard lock(mutex); + s3_settings.clear(); + if (!config.has(config_prefix)) + return; + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_prefix, config_keys); + auto default_auth_settings = S3::AuthSettings(config, settings, config_prefix); + auto default_request_settings = S3::RequestSettings(config, settings, config_prefix); + + for (const String & key : config_keys) + { + const auto key_path = config_prefix + "." 
+ key; + const auto endpoint_path = key_path + ".endpoint"; + if (config.has(endpoint_path)) + { + auto auth_settings{default_auth_settings}; + auth_settings.updateIfChanged(S3::AuthSettings(config, settings, key_path)); + + auto request_settings{default_request_settings}; + request_settings.updateIfChanged(S3::RequestSettings(config, settings, key_path, "", settings.s3_validate_request_settings)); + + s3_settings.emplace( + config.getString(endpoint_path), + S3Settings{std::move(auth_settings), std::move(request_settings)}); + } + } +} + +std::optional S3SettingsByEndpoint::getSettings( + const String & endpoint, + const String & user, + bool ignore_user) const +{ + std::lock_guard lock(mutex); + auto next_prefix_setting = s3_settings.upper_bound(endpoint); + + /// Linear time algorithm may be replaced with logarithmic with prefix tree map. + for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) + { + std::advance(possible_prefix_setting, -1); + const auto & [endpoint_prefix, settings] = *possible_prefix_setting; + if (endpoint.starts_with(endpoint_prefix) && (ignore_user || settings.auth_settings.canBeUsedByUser(user))) + return possible_prefix_setting->second; + } + + return {}; +} + +} diff --git a/src/IO/S3Settings.h b/src/IO/S3Settings.h new file mode 100644 index 00000000000..9eed0a5652f --- /dev/null +++ b/src/IO/S3Settings.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace Poco::Util { class AbstractConfiguration; } + +namespace DB +{ + +struct Settings; + +struct S3Settings +{ + S3::AuthSettings auth_settings; + S3::RequestSettings request_settings; + + void loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings); + + void updateIfChanged(const S3Settings & settings); +}; + +class S3SettingsByEndpoint +{ +public: + void loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings); + + std::optional getSettings( + const std::string & endpoint, + const std::string & user, + bool ignore_user = false) const; + +private: + mutable std::mutex mutex; + std::map s3_settings; +}; + + +} diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index b796c029051..cd9949862ca 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -72,7 +72,7 @@ struct WriteBufferFromS3::PartData } }; -BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings) +BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3::RequestSettings & settings) { BufferAllocationPolicy::Settings allocation_settings; allocation_settings.strict_size = settings.strict_upload_part_size; @@ -91,7 +91,7 @@ WriteBufferFromS3::WriteBufferFromS3( const String & bucket_, const String & key_, size_t buf_size_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, BlobStorageLogWriterPtr blob_log_, std::optional> object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, @@ -100,15 +100,14 @@ WriteBufferFromS3::WriteBufferFromS3( , bucket(bucket_) , key(key_) , request_settings(request_settings_) - , upload_settings(request_settings.getUploadSettings()) , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , 
buffer_allocation_policy(createBufferAllocationPolicy(upload_settings)) + , buffer_allocation_policy(createBufferAllocationPolicy(request_settings)) , task_tracker( std::make_unique( std::move(schedule_), - upload_settings.max_inflight_parts_for_one_file, + request_settings.max_inflight_parts_for_one_file, limitedLog)) , blob_log(std::move(blob_log_)) { @@ -165,7 +164,7 @@ void WriteBufferFromS3::preFinalize() if (multipart_upload_id.empty() && detached_part_data.size() <= 1) { - if (detached_part_data.empty() || detached_part_data.front().data_size <= upload_settings.max_single_part_upload_size) + if (detached_part_data.empty() || detached_part_data.front().data_size <= request_settings.max_single_part_upload_size) do_single_part_upload = true; } @@ -214,9 +213,9 @@ void WriteBufferFromS3::finalizeImpl() if (request_settings.check_objects_after_upload) { - S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, "Immediately after upload"); + S3::checkObjectExists(*client_ptr, bucket, key, {}, "Immediately after upload"); - size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings); + size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}); if (actual_size != total_size) throw Exception( ErrorCodes::S3_ERROR, @@ -505,18 +504,18 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) "Unable to write a part without multipart_upload_id, details: WriteBufferFromS3 created for bucket {}, key {}", bucket, key); - if (part_number > upload_settings.max_part_number) + if (part_number > request_settings.max_part_number) { throw Exception( ErrorCodes::INVALID_CONFIG_PARAMETER, "Part number exceeded {} while writing {} bytes to S3. Check min_upload_part_size = {}, max_upload_part_size = {}, " "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, max_single_part_upload_size = {}", - upload_settings.max_part_number, count(), upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, - upload_settings.upload_part_size_multiply_factor, upload_settings.upload_part_size_multiply_parts_count_threshold, - upload_settings.max_single_part_upload_size); + request_settings.max_part_number, count(), request_settings.min_upload_part_size, request_settings.max_upload_part_size, + request_settings.upload_part_size_multiply_factor, request_settings.upload_part_size_multiply_parts_count_threshold, + request_settings.max_single_part_upload_size); } - if (data.data_size > upload_settings.max_upload_part_size) + if (data.data_size > request_settings.max_upload_part_size) { throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -524,7 +523,7 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) getShortLogDetails(), part_number, data.data_size, - upload_settings.max_upload_part_size + request_settings.max_upload_part_size ); } @@ -611,7 +610,7 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetMultipartUpload(multipart_upload); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -669,8 +668,8 @@ S3::PutObjectRequest WriteBufferFromS3::getPutRequest(PartData & data) req.SetBody(data.createAwsBuffer()); if (object_metadata.has_value()) req.SetMetadata(object_metadata.value()); - if (!upload_settings.storage_class_name.empty()) - 
req.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(upload_settings.storage_class_name)); + if (!request_settings.storage_class_name.value.empty()) + req.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(request_settings.storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 req.SetContentType("binary/octet-stream"); @@ -694,7 +693,7 @@ void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data auto & request = std::get<0>(*worker_data); size_t content_length = request.GetContentLength(); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3PutObject); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index fbfec3588fa..973ca4c7526 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,7 +38,7 @@ public: const String & bucket_, const String & key_, size_t buf_size_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, BlobStorageLogWriterPtr blob_log_, std::optional> object_metadata_ = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, @@ -78,8 +78,7 @@ private: const String bucket; const String key; - const S3Settings::RequestSettings request_settings; - const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const S3::RequestSettings request_settings; const WriteSettings write_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 3a21d7201a9..215bb1a3270 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) template -ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) +ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' 
&& '"' < '.', "Layout of char is not like ASCII"); + has_fractional = false; + static constexpr bool throw_exception = std::is_same_v; bool negative = false; @@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) if (checkChar('.', in)) { + has_fractional = true; auto after_point_count = in.count(); while (!in.eof() && *in.position() == '0') @@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { if (checkChar('e', in) || checkChar('E', in)) { + has_fractional = true; if (in.eof()) { if constexpr (throw_exception) @@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) } if (after_point) + { x += static_cast(shift10(after_point, after_point_exponent)); + } if (exponent) + { x = static_cast(shift10(x, exponent)); + } if (negative) x = -x; @@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) template void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl(x, in); } template bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl(x, in); } -template void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl(x, in); } -template bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template void readFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} template void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl(x, in); } template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl(x, in); } @@ -603,6 +619,21 @@ template void readFloatText(T & x, ReadBuffer & in) { readFloatText template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } /// Don't read exponent part of the number. 
-template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} + +/// With a @has_fractional flag +/// Used for input_format_try_infer_integers +template bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} } diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index 4a4d7cc0fc2..3c1af6538ad 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -546,8 +546,8 @@ public: std::unique_ptr getWriteBuffer(String file_name = "file") { - S3Settings::RequestSettings request_settings; - request_settings.updateFromSettingsIfChanged(settings); + S3::RequestSettings request_settings; + request_settings.updateFromSettings(settings, /* if_changed */true, /* validate_settings */false); client->resetCounters(); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 67ea069d46d..451671b062c 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -371,7 +371,7 @@ struct ContextSharedPart : boost::noncopyable ActionLocksManagerPtr action_locks_manager; /// Set of storages' action lockers OnceFlag system_logs_initialized; std::unique_ptr system_logs TSA_GUARDED_BY(mutex); /// Used to log queries and operations on parts - std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage + std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage std::vector warnings TSA_GUARDED_BY(mutex); /// Store warning messages about server configuration. 
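The storage_s3_settings member above now holds the per-endpoint collection (S3SettingsByEndpoint from src/IO/S3Settings.h) instead of the old StorageS3Settings. Its getSettings(endpoint, user) walks the endpoint-keyed map backwards from upper_bound(endpoint), skips entries the given user may not use (canBeUsedByUser), and returns the first entry whose key is a prefix of the requested endpoint, which is also the longest matching prefix among the stored keys. A minimal self-contained sketch of that lookup idea (illustrative types only, not the ClickHouse classes):

#include <map>
#include <optional>
#include <string>

/// Stand-in for S3Settings; the real map stores auth + request settings per endpoint prefix.
using ExampleSettings = int;

std::optional<ExampleSettings> lookupByEndpointPrefix(
    const std::map<std::string, ExampleSettings> & by_prefix, const std::string & endpoint)
{
    /// Among keys that are prefixes of `endpoint`, longer ones sort later,
    /// so walking backwards from upper_bound() hits the longest match first.
    auto it = by_prefix.upper_bound(endpoint);
    while (it != by_prefix.begin())
    {
        --it;
        if (endpoint.starts_with(it->first))
            return it->second;
    }
    return std::nullopt;
}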
/// Background executors for *MergeTree tables @@ -4296,7 +4296,7 @@ void Context::updateStorageConfiguration(const Poco::Util::AbstractConfiguration { std::lock_guard lock(shared->mutex); if (shared->storage_s3_settings) - shared->storage_s3_settings->loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings->loadFromConfig(config, /* config_prefix */"s3", getSettingsRef()); } } @@ -4348,14 +4348,14 @@ const DistributedSettings & Context::getDistributedSettings() const return *shared->distributed_settings; } -const StorageS3Settings & Context::getStorageS3Settings() const +const S3SettingsByEndpoint & Context::getStorageS3Settings() const { std::lock_guard lock(shared->mutex); if (!shared->storage_s3_settings) { const auto & config = shared->getConfigRefWithLock(lock); - shared->storage_s3_settings.emplace().loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings.emplace().loadFromConfig(config, "s3", getSettingsRef()); } return *shared->storage_s3_settings; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 68f37377926..7c7b2e4ea64 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -117,7 +117,7 @@ struct DistributedSettings; struct InitialAllRangesAnnouncement; struct ParallelReadRequest; struct ParallelReadResponse; -class StorageS3Settings; +class S3SettingsByEndpoint; class IDatabase; class DDLWorker; class ITableFunction; @@ -1115,7 +1115,7 @@ public: const MergeTreeSettings & getMergeTreeSettings() const; const MergeTreeSettings & getReplicatedMergeTreeSettings() const; const DistributedSettings & getDistributedSettings() const; - const StorageS3Settings & getStorageS3Settings() const; + const S3SettingsByEndpoint & getStorageS3Settings() const; /// Prevents DROP TABLE if its size is greater than max_size (50GB by default, max_size=0 turn off this check) void setMaxTableSizeToDrop(size_t max_size); diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 75da8bbc3e7..1c2a054b2a5 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -869,6 +869,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits) || (min_rows_to_compress && getTotalRowCount() >= min_rows_to_compress))) { block_to_save = block_to_save.compress(); + have_compressed = true; } data->blocks_allocated_size += block_to_save.allocatedBytes(); @@ -2317,14 +2318,19 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) } }; - for (const Block & compressed_block_right : data->blocks) + for (const Block & block_right : data->blocks) { ++block_number; if (block_number < start_right_block) continue; - auto block_right = compressed_block_right.decompress(); - process_right_block(block_right); + /// The following statement cannot be substituted with `process_right_block(!have_compressed ? block_right : block_right.decompress())` + /// because it will lead to copying of `block_right` even if its branch is taken (because common type of `block_right` and `block_right.decompress()` is `Block`). 
+ if (!have_compressed) + process_right_block(block_right); + else + process_right_block(block_right.decompress()); + if (rows_added > max_joined_block_rows) { break; diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index a0996556f9a..56a1768a7ff 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -434,7 +434,10 @@ private: /// Changes in hash table broke correspondence, /// so we must guarantee constantness of hash table during HashJoin lifetime (using method setLock) mutable JoinStuff::JoinUsedFlags used_flags; + RightTableDataPtr data; + bool have_compressed = false; + std::vector key_sizes; /// Needed to do external cross join diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index b72399df2c1..8e072779b53 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2374,49 +2374,6 @@ UInt64 InterpreterSelectQuery::maxBlockSizeByLimit() const return 0; } -/** Storages can rely that filters that for storage will be available for analysis before - * plan is fully constructed and optimized. - * - * StorageMerge common header calculation and prewhere push-down relies on this. - * - * This is similar to Planner::collectFiltersForAnalysis - */ -void collectFiltersForAnalysis( - const ASTPtr & query_ptr, - const ContextPtr & query_context, - const StorageSnapshotPtr & storage_snapshot, - const SelectQueryOptions & options, - SelectQueryInfo & query_info) -{ - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); - - auto dummy = std::make_shared( - storage_snapshot->storage.getStorageID(), ColumnsDescription(storage_snapshot->getColumns(get_column_options)), storage_snapshot); - - QueryPlan query_plan; - InterpreterSelectQuery(query_ptr, query_context, dummy, dummy->getInMemoryMetadataPtr(), options).buildQueryPlan(query_plan); - - auto optimization_settings = QueryPlanOptimizationSettings::fromContext(query_context); - query_plan.optimize(optimization_settings); - - std::vector nodes_to_process; - nodes_to_process.push_back(query_plan.getRootNode()); - - while (!nodes_to_process.empty()) - { - const auto * node_to_process = nodes_to_process.back(); - nodes_to_process.pop_back(); - nodes_to_process.insert(nodes_to_process.end(), node_to_process->children.begin(), node_to_process->children.end()); - - auto * read_from_dummy = typeid_cast(node_to_process->step.get()); - if (!read_from_dummy) - continue; - - query_info.filter_actions_dag = read_from_dummy->getFilterActionsDAG(); - query_info.optimized_prewhere_info = read_from_dummy->getPrewhereInfo(); - } -} - void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) { auto & query = getSelectQuery(); @@ -2546,10 +2503,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc } else if (storage) { - if (shouldMoveToPrewhere() && settings.query_plan_optimize_prewhere && settings.query_plan_enable_optimizations - && typeid_cast(storage.get())) - collectFiltersForAnalysis(query_ptr, context, storage_snapshot, options, query_info); - /// Table. 
if (max_streams == 0) max_streams = 1; @@ -2601,10 +2554,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.storage_limits = std::make_shared(storage_limits); query_info.settings_limit_offset_done = options.settings_limit_offset_done; - /// Possible filters: row-security, additional filter, replica filter (before array join), where (after array join) - query_info.has_filters_and_no_array_join_before_filter = row_policy_filter || additional_filter_info - || parallel_replicas_custom_filter_info - || (analysis_result.hasWhere() && !analysis_result.before_where->hasArrayJoin() && !analysis_result.array_join); storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) diff --git a/src/Interpreters/tests/gtest_filecache.cpp b/src/Interpreters/tests/gtest_filecache.cpp index 41191ba1605..36acc319f4e 100644 --- a/src/Interpreters/tests/gtest_filecache.cpp +++ b/src/Interpreters/tests/gtest_filecache.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -333,6 +334,7 @@ public: TEST_F(FileCacheTest, LRUPolicy) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; /// To work with cache need query_id and query context. @@ -807,6 +809,7 @@ TEST_F(FileCacheTest, LRUPolicy) TEST_F(FileCacheTest, writeBuffer) { + ServerUUID::setRandomForUnitTests(); FileCacheSettings settings; settings.max_size = 100; settings.max_elements = 5; @@ -938,6 +941,7 @@ static size_t readAllTemporaryData(TemporaryFileStream & stream) TEST_F(FileCacheTest, temporaryData) { + ServerUUID::setRandomForUnitTests(); DB::FileCacheSettings settings; settings.max_size = 10_KiB; settings.max_file_segment_size = 1_KiB; @@ -1044,6 +1048,7 @@ TEST_F(FileCacheTest, temporaryData) TEST_F(FileCacheTest, CachedReadBuffer) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; /// To work with cache need query_id and query context. @@ -1120,6 +1125,7 @@ TEST_F(FileCacheTest, CachedReadBuffer) TEST_F(FileCacheTest, TemporaryDataReadBufferSize) { + ServerUUID::setRandomForUnitTests(); /// Temporary data stored in cache { DB::FileCacheSettings settings; @@ -1167,6 +1173,7 @@ TEST_F(FileCacheTest, TemporaryDataReadBufferSize) TEST_F(FileCacheTest, SLRUPolicy) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; std::string query_id = "query_id"; /// To work with cache need query_id and query context. diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index b40e23a9553..15b92ed12da 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -166,7 +166,7 @@ FiltersForTableExpressionMap collectFiltersForAnalysis(const QueryTreeNodePtr & continue; const auto & storage = table_node ? 
table_node->getStorage() : table_function_node->getStorage(); - if (typeid_cast(storage.get()) || typeid_cast(storage.get()) + if (typeid_cast(storage.get()) || (parallel_replicas_estimation_enabled && std::dynamic_pointer_cast(storage))) { collect_filters = true; diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 7a8ad50b9ab..8275d1604c7 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -647,7 +647,6 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres auto table_expression_query_info = select_query_info; table_expression_query_info.table_expression = table_expression; table_expression_query_info.filter_actions_dag = table_expression_data.getFilterActions(); - table_expression_query_info.optimized_prewhere_info = table_expression_data.getPrewhereInfo(); table_expression_query_info.analyzer_can_use_parallel_replicas_on_follower = table_node == planner_context->getGlobalPlannerContext()->parallel_replicas_table; size_t max_streams = settings.max_threads; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 8ca240b3e8b..263598bdca7 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -608,6 +609,14 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return 3; } + if (auto * read_from_merge = typeid_cast(child.get())) + { + FilterDAGInfo info{filter->getExpression(), filter->getFilterColumnName(), filter->removesFilterColumn()}; + read_from_merge->addFilter(std::move(info)); + std::swap(*parent_node, *child_node); + return 1; + } + return 0; } diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index 72a2027763c..1badd315200 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include #include - namespace DB { @@ -30,7 +30,7 @@ static void removeFromOutput(ActionsDAG & dag, const std::string name) void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) { - if (stack.size() < 3) + if (stack.size() < 2) return; auto & frame = stack.back(); @@ -45,6 +45,9 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) if (!source_step_with_filter) return; + if (typeid_cast(frame.node->step.get())) + return; + const auto & storage_snapshot = source_step_with_filter->getStorageSnapshot(); const auto & storage = storage_snapshot->storage; if (!storage.canMoveConditionsToPrewhere()) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 5cf6e23854b..3157773d674 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1117,8 +1117,7 @@ static void addMergingFinal( MergeTreeData::MergingParams merging_params, Names partition_key_columns, size_t max_block_size_rows, - bool enable_vertical_final, - bool can_merge_final_indices_to_next_step_filter) + bool enable_vertical_final) { const auto & header = pipe.getHeader(); size_t num_outputs = pipe.numOutputPorts(); @@ -1160,7 +1159,7 @@ static void addMergingFinal( }; pipe.addTransform(get_merging_processor()); - if (enable_vertical_final && 
!can_merge_final_indices_to_next_step_filter) + if (enable_vertical_final) pipe.addSimpleTransform([](const Block & header_) { return std::make_shared(header_); }); } @@ -1348,8 +1347,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( data.merging_params, partition_key_columns, block_size.max_block_size_rows, - enable_vertical_final, - query_info.has_filters_and_no_array_join_before_filter); + enable_vertical_final); merging_pipes.emplace_back(Pipe::unitePipes(std::move(pipes))); } diff --git a/src/Processors/QueryPlan/SourceStepWithFilter.h b/src/Processors/QueryPlan/SourceStepWithFilter.h index 0971b99d828..126d4824fff 100644 --- a/src/Processors/QueryPlan/SourceStepWithFilter.h +++ b/src/Processors/QueryPlan/SourceStepWithFilter.h @@ -49,11 +49,6 @@ public: filter_dags.push_back(std::move(filter_dag)); } - void addFilterFromParentStep(const ActionsDAG::Node * filter_node) - { - filter_nodes.nodes.push_back(filter_node); - } - /// Apply filters that can optimize reading from storage. void applyFilters() { diff --git a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index e8e7f99ce53..cd87019a8e0 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -14,7 +14,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER; - extern const int LOGICAL_ERROR; } static void replaceFilterToConstant(Block & block, const String & filter_column_name) @@ -37,147 +36,6 @@ static void replaceFilterToConstant(Block & block, const String & filter_column_ } } -static std::shared_ptr getSelectByFinalIndices(Chunk & chunk) -{ - if (auto select_final_indices_info = std::dynamic_pointer_cast(chunk.getChunkInfo())) - { - const auto & index_column = select_final_indices_info->select_final_indices; - chunk.setChunkInfo(nullptr); - if (index_column && index_column->size() != chunk.getNumRows()) - return select_final_indices_info; - } - return nullptr; -} - -static void -executeSelectByIndices(Columns & columns, std::shared_ptr & select_final_indices_info, size_t & num_rows) -{ - if (select_final_indices_info) - { - const auto & index_column = select_final_indices_info->select_final_indices; - - for (auto & column : columns) - column = column->index(*index_column, 0); - - num_rows = index_column->size(); - } -} - -static std::unique_ptr combineFilterAndIndices( - std::unique_ptr description, - std::shared_ptr & select_final_indices_info, - size_t num_rows) -{ - if (select_final_indices_info) - { - const auto * index_column = select_final_indices_info->select_final_indices; - - if (description->hasOne()) - { - const auto & selected_by_indices = index_column->getData(); - const auto * selected_by_filter = description->data->data(); - /// We will recompute new has_one - description->has_one = 0; - /// At this point we know that the filter is not constant, just create a new filter - auto mutable_holder = ColumnUInt8::create(num_rows, 0); - auto & data = mutable_holder->getData(); - for (auto idx : selected_by_indices) - { - if (idx >= num_rows) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Index {} out of range {}", idx, num_rows); - data[idx] = 1; - } - - /// AND two filters - auto * begin = data.data(); - const auto * end = begin + num_rows; -#if defined(__AVX2__) - while (end - begin >= 32) - { - _mm256_storeu_si256( - reinterpret_cast<__m256i *>(begin), - _mm256_and_si256( - _mm256_loadu_si256(reinterpret_cast(begin)), - 
_mm256_loadu_si256(reinterpret_cast(selected_by_filter)))); - description->has_one |= !memoryIsZero(begin, 0, 32); - begin += 32; - selected_by_filter += 32; - } -#elif defined(__SSE2__) - while (end - begin >= 16) - { - _mm_storeu_si128( - reinterpret_cast<__m128i *>(begin), - _mm_and_si128( - _mm_loadu_si128(reinterpret_cast(begin)), - _mm_loadu_si128(reinterpret_cast(selected_by_filter)))); - description->has_one |= !memoryIsZero(begin, 0, 16); - begin += 16; - selected_by_filter += 16; - } -#endif - - while (end - begin >= 8) - { - *reinterpret_cast(begin) &= *reinterpret_cast(selected_by_filter); - description->has_one |= *reinterpret_cast(begin); - begin += 8; - selected_by_filter += 8; - } - - while (end - begin > 0) - { - *begin &= *selected_by_filter; - description->has_one |= *begin; - begin++; - selected_by_filter++; - } - - description->data_holder = std::move(mutable_holder); - description->data = &data; - } - } - return std::move(description); -} - -static std::unique_ptr combineFilterAndIndices( - std::unique_ptr description, - std::shared_ptr & select_final_indices_info, - size_t num_rows) -{ - /// Iterator interface to decorate data from output of std::set_intersection - struct Iterator - { - UInt8 * data; - Int64 & pop_cnt; - explicit Iterator(UInt8 * data_, Int64 & pop_cnt_) : data(data_), pop_cnt(pop_cnt_) {} - Iterator & operator = (UInt64 index) { data[index] = 1; ++pop_cnt; return *this; } - Iterator & operator ++ () { return *this; } - Iterator & operator * () { return *this; } - }; - - if (select_final_indices_info) - { - const auto * index_column = select_final_indices_info->select_final_indices; - - if (description->hasOne()) - { - std::unique_ptr res; - res->has_one = 0; - const auto & selected_by_indices = index_column->getData(); - const auto & selected_by_filter = description->filter_indices->getData(); - auto mutable_holder = ColumnUInt8::create(num_rows, 0); - auto & data = mutable_holder->getData(); - Iterator decorator(data.data(), res->has_one); - std::set_intersection(selected_by_indices.begin(), selected_by_indices.end(), selected_by_filter.begin(), selected_by_filter.end(), decorator); - res->data_holder = std::move(mutable_holder); - res->data = &data; - return res; - } - } - return std::move(description); -} - Block FilterTransform::transformHeader( const Block & header, const ActionsDAG * expression, const String & filter_column_name, bool remove_filter_column) { @@ -267,7 +125,6 @@ void FilterTransform::doTransform(Chunk & chunk) size_t num_rows_before_filtration = chunk.getNumRows(); auto columns = chunk.detachColumns(); DataTypes types; - auto select_final_indices_info = getSelectByFinalIndices(chunk); { Block block = getInputPort().getHeader().cloneWithColumns(columns); @@ -282,7 +139,6 @@ void FilterTransform::doTransform(Chunk & chunk) if (constant_filter_description.always_true || on_totals) { - executeSelectByIndices(columns, select_final_indices_info, num_rows_before_filtration); chunk.setColumns(std::move(columns), num_rows_before_filtration); removeFilterIfNeed(chunk); return; @@ -303,7 +159,6 @@ void FilterTransform::doTransform(Chunk & chunk) if (constant_filter_description.always_true) { - executeSelectByIndices(columns, select_final_indices_info, num_rows_before_filtration); chunk.setColumns(std::move(columns), num_rows_before_filtration); removeFilterIfNeed(chunk); return; @@ -311,15 +166,9 @@ void FilterTransform::doTransform(Chunk & chunk) std::unique_ptr filter_description; if (filter_column->isSparse()) - filter_description = 
combineFilterAndIndices( - std::make_unique(*filter_column), select_final_indices_info, num_rows_before_filtration); + filter_description = std::make_unique(*filter_column); else - filter_description = combineFilterAndIndices( - std::make_unique(*filter_column), select_final_indices_info, num_rows_before_filtration); - - - if (!filter_description->has_one) - return; + filter_description = std::make_unique(*filter_column); /** Let's find out how many rows will be in result. * To do this, we filter out the first non-constant column diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index d4e737a7de1..c703c9ce999 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 14a310364dc..89f39c65517 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1981,6 +1981,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional runner(getUnexpectedPartsLoadingThreadPool().get(), "UnexpectedParts"); for (auto & load_state : unexpected_data_parts) @@ -2027,6 +2031,13 @@ void MergeTreeData::loadUnexpectedDataParts() unexpected_data_parts_cv.notify_all(); } } +catch (...) +{ + LOG_ERROR(log, "Loading of unexpected parts failed. " + "Will terminate to avoid undefined behaviour due to inconsistent set of parts. " + "Exception: {}", getCurrentExceptionMessage(true)); + std::terminate(); +} void MergeTreeData::loadOutdatedDataParts(bool is_async) try diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 9a368bd44f5..e30d63c343a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -2004,8 +2004,7 @@ MutationCommands ReplicatedMergeTreeQueue::getMutationCommands( MutationCommands commands; for (auto it = begin; it != end; ++it) { - /// FIXME uncomment this assertion after relesing 23.5 (currently it fails in Upgrade check) - /// chassert(mutation_pointer < it->second->entry->znode_name); + chassert(mutation_pointer < it->second->entry->znode_name); mutation_ids.push_back(it->second->entry->znode_name); const auto & commands_from_entry = it->second->entry->commands; commands.insert(commands.end(), commands_from_entry.begin(), commands_from_entry.end()); diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index ada3e2e9323..163f08be420 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -249,7 +249,7 @@ AzureClientPtr StorageAzureConfiguration::createClient(bool is_read_only, bool a return result; } -void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & collection) + void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index 35b19079ca9..bbaa82c51ba 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -51,7 +51,7 @@ public: ContextPtr context) override; 
protected: - void fromNamedCollection(const NamedCollection & collection) override; + void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index a8a9ab5b557..155f51adf61 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -119,7 +119,7 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit setURL(url_str); } -void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & collection) +void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr) { std::string url_str; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 01a8b9c5e3b..04884542908 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -46,7 +46,7 @@ public: ContextPtr context) override; private: - void fromNamedCollection(const NamedCollection &) override; + void fromNamedCollection(const NamedCollection &, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; void setURL(const std::string & url_); diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 4b217b94730..b33d55105e9 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -106,15 +106,18 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & config = context->getConfigRef(); const auto & settings = context->getSettingsRef(); - const std::string config_prefix = "s3."; - auto s3_settings = getSettings(config, config_prefix, context, settings.s3_validate_request_settings); + auto s3_settings = getSettings( + config, "s3"/* config_prefix */, context, url.uri_str, settings.s3_validate_request_settings); - request_settings.updateFromSettingsIfChanged(settings); - auth_settings.updateFrom(s3_settings->auth_settings); + if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) + { + s3_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); + s3_settings->request_settings.updateIfChanged(endpoint_settings->request_settings); + } - s3_settings->auth_settings = auth_settings; - s3_settings->request_settings = request_settings; + s3_settings->auth_settings.updateIfChanged(auth_settings); + s3_settings->request_settings.updateIfChanged(request_settings); if (!headers_from_ast.empty()) { @@ -123,10 +126,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, headers_from_ast.begin(), headers_from_ast.end()); } - if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) - s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); - - auto client = getClient(config, config_prefix, context, *s3_settings, false, &url); + auto client = getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); auto s3_capabilities = S3Capabilities { @@ -139,8 +139,9 @@ 
ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, key_generator, "StorageS3", false); } -void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection) +void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection, ContextPtr context) { + const auto settings = context->getSettingsRef(); validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); auto filename = collection.getOrDefault("filename", ""); @@ -159,9 +160,9 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); structure = collection.getOrDefault("structure", "auto"); - request_settings = S3Settings::RequestSettings(collection); + request_settings = S3::RequestSettings(collection, settings, /* validate_settings */true); - static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); + static_configuration = !auth_settings.access_key_id.value.empty() || auth_settings.no_sign_request.changed; keys = {url.key}; } @@ -357,7 +358,7 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (no_sign_request) auth_settings.no_sign_request = no_sign_request; - static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); + static_configuration = !auth_settings.access_key_id.value.empty() || auth_settings.no_sign_request.changed; auth_settings.no_sign_request = no_sign_request; keys = {url.key}; diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 906d10a1a9a..39a646c7df2 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -3,7 +3,7 @@ #include "config.h" #if USE_AWS_S3 -#include +#include #include namespace DB @@ -51,14 +51,14 @@ public: ContextPtr context) override; private: - void fromNamedCollection(const NamedCollection & collection) override; + void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; S3::URI url; std::vector keys; S3::AuthSettings auth_settings; - S3Settings::RequestSettings request_settings; + S3::RequestSettings request_settings; HTTPHeaderEntries headers_from_ast; /// Headers from ast is a part of static configuration. /// If s3 configuration was passed from ast, then it is static. /// If from config - it can be changed with config reload. 
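StorageS3Configuration::createObjectStorage resolves the final S3 configuration in layers: it starts from the server-wide "s3." section, merges in the per-endpoint entry returned by getStorageS3Settings().getSettings(url, user), and then applies the values captured from the table definition (AST or named collection), each step going through updateIfChanged. A rough sketch of that merge order, assuming updateIfChanged copies only fields the other side explicitly changed (the struct, field, and function names below are illustrative, not the real S3::AuthSettings / S3::RequestSettings members):

#include <cstdint>

struct ExampleS3Settings
{
    uint64_t max_connections = 1024;
    bool max_connections_changed = false;   /// set when a layer specified the value explicitly

    /// Mirrors the updateIfChanged() idea: take only explicitly changed fields.
    void updateIfChanged(const ExampleS3Settings & other)
    {
        if (other.max_connections_changed)
        {
            max_connections = other.max_connections;
            max_connections_changed = true;
        }
    }
};

ExampleS3Settings resolve(
    const ExampleS3Settings & server_config,       /// "s3." section of the server config
    const ExampleS3Settings * endpoint_overrides,  /// matching per-endpoint entry, if any
    const ExampleS3Settings & table_overrides)     /// values from the table definition / named collection
{
    ExampleS3Settings result = server_config;
    if (endpoint_overrides)
        result.updateIfChanged(*endpoint_overrides);
    result.updateIfChanged(table_overrides);
    return result;
}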
diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 2c8e60b49d0..90a97a9ea62 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -424,7 +424,7 @@ void StorageObjectStorage::Configuration::initialize( bool with_table_structure) { if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); + configuration.fromNamedCollection(*named_collection, local_context); else configuration.fromAST(engine_args, local_context, with_table_structure); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index f45d8c1f01a..cf8ec113653 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -193,7 +193,7 @@ public: String structure = "auto"; protected: - virtual void fromNamedCollection(const NamedCollection & collection) = 0; + virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; void assertInitialized() const; diff --git a/src/Storages/S3Queue/S3QueueMetadata.cpp b/src/Storages/S3Queue/S3QueueMetadata.cpp index 9c77bb2d24c..e828e9f0716 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueMetadata.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 11e2a2fc5e7..52b6674c93d 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -208,19 +208,9 @@ struct SelectQueryInfo bool need_aggregate = false; PrewhereInfoPtr prewhere_info; - /// Generated by pre-run optimization with StorageDummy. - /// Currently it's used to support StorageMerge PREWHERE optimization. - PrewhereInfoPtr optimized_prewhere_info; - /// If query has aggregate functions bool has_aggregates = false; - /// If query has any filter and no arrayJoin before filter. Used by skipping FINAL - /// Skipping FINAL algorithm will output the original chunk and a column indices of - /// selected rows. If query has filter and doesn't have array join before any filter, - /// we can merge the indices with the first filter in FilterTransform later. - bool has_filters_and_no_array_join_before_filter = false; - ClusterPtr getCluster() const { return !optimized_cluster ? 
cluster : optimized_cluster; } bool settings_limit_offset_done = false; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 4c678a1228b..ed3f43367dd 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -34,9 +34,10 @@ #include #include #include -#include #include #include +#include +#include #include #include #include @@ -402,10 +403,14 @@ ReadFromMerge::ReadFromMerge( { } -void ReadFromMerge::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) +void ReadFromMerge::addFilter(FilterDAGInfo filter) { - SourceStepWithFilter::updatePrewhereInfo(prewhere_info_value); - common_header = applyPrewhereActions(common_header, prewhere_info); + output_stream->header = FilterTransform::transformHeader( + output_stream->header, + filter.actions.get(), + filter.column_name, + filter.do_remove_column); + pushed_down_filters.push_back(std::move(filter)); } void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) @@ -435,21 +440,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu Names column_names_as_aliases; Aliases aliases; - Names real_column_names = column_names; - if (child_plan.row_policy_data_opt) - child_plan.row_policy_data_opt->extendNames(real_column_names); - - auto modified_query_info = getModifiedQueryInfo(modified_context, table, nested_storage_snaphsot, real_column_names, column_names_as_aliases, aliases); - - auto source_pipeline = createSources( - child_plan.plan, - nested_storage_snaphsot, - modified_query_info, - common_processed_stage, - common_header, - child_plan.table_aliases, - child_plan.row_policy_data_opt, - table); + auto source_pipeline = buildPipeline(child_plan, common_processed_stage); if (source_pipeline && source_pipeline->initialized()) { @@ -567,10 +558,8 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ if (sampling_requested && !storage->supportsSampling()) throw Exception(ErrorCodes::SAMPLING_NOT_SUPPORTED, "Illegal SAMPLE: table {} doesn't support sampling", storage->getStorageID().getNameForLogs()); - res.emplace_back(); - - auto & aliases = res.back().table_aliases; - auto & row_policy_data_opt = res.back().row_policy_data_opt; + Aliases aliases; + RowPolicyDataOpt row_policy_data_opt; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, modified_context); @@ -649,7 +638,7 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ } - res.back().plan = createPlanForTable( + auto child = createPlanForTable( nested_storage_snaphsot, modified_query_info, common_processed_stage, @@ -659,9 +648,32 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ row_policy_data_opt, modified_context, current_streams); - res.back().plan.addInterpreterContext(modified_context); - } + child.plan.addInterpreterContext(modified_context); + if (child.plan.isInitialized()) + { + addVirtualColumns(child, modified_query_info, common_processed_stage, table); + + /// Subordinary tables could have different but convertible types, like numeric types of different width. + /// We must return streams with structure equals to structure of Merge table. 
+ convertAndFilterSourceStream(common_header, modified_query_info, nested_storage_snaphsot, aliases, row_policy_data_opt, context, child); + + for (const auto & filter_info : pushed_down_filters) + { + auto filter_step = std::make_unique( + child.plan.getCurrentDataStream(), + filter_info.actions->clone(), + filter_info.column_name, + filter_info.do_remove_column); + + child.plan.addStep(std::move(filter_step)); + } + + child.plan.optimize(QueryPlanOptimizationSettings::fromContext(modified_context)); + } + + res.emplace_back(std::move(child)); + } return res; } @@ -876,8 +888,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextMutablePtr & mo const StorageID current_storage_id = storage->getStorageID(); SelectQueryInfo modified_query_info = query_info; - if (modified_query_info.optimized_prewhere_info && !modified_query_info.prewhere_info) - modified_query_info.prewhere_info = modified_query_info.optimized_prewhere_info; if (modified_query_info.planner_context) modified_query_info.planner_context = std::make_shared(modified_context, modified_query_info.planner_context); @@ -1019,31 +1029,101 @@ bool recursivelyApplyToReadingSteps(QueryPlan::Node * node, const std::function< return ok; } -QueryPipelineBuilderPtr ReadFromMerge::createSources( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot_, +void ReadFromMerge::addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams) const + const StorageWithLockAndName & storage_with_lock) const { - if (!plan.isInitialized()) - return std::make_unique(); - - QueryPipelineBuilderPtr builder; - - const auto & [database_name, storage, _, table_name] = storage_with_lock; + const auto & [database_name, _, storage, table_name] = storage_with_lock; bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; - auto storage_stage - = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); - builder = plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + /// Add virtual columns if we don't already have them. - if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) + Block plan_header = child.plan.getCurrentDataStream().header; + + if (allow_experimental_analyzer) + { + String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); + + String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; + String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? 
"_table" : table_alias + "._table"; + + if (has_database_virtual_column && common_header.has(database_column) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(database_column)) + { + ColumnWithTypeAndName column; + column.name = database_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has(table_column) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(table_column)) + { + ColumnWithTypeAndName column; + column.name = table_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + } + else + { + if (has_database_virtual_column && common_header.has("_database") && !plan_header.has("_database")) + { + ColumnWithTypeAndName column; + column.name = "_database"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has("_table") && !plan_header.has("_table")) + { + ColumnWithTypeAndName column; + column.name = "_table"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; + } + } +} + +QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const +{ + if (!child.plan.isInitialized()) + return nullptr; + + auto optimisation_settings = QueryPlanOptimizationSettings::fromContext(context); + /// All optimisations will be done at plans creation + optimisation_settings.optimize_plan = false; + auto builder = child.plan.buildQueryPipeline(optimisation_settings, BuildQueryPipelineSettings::fromContext(context)); + + if (!builder->initialized()) + return builder; + + bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; + if (processed_stage > child.stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) { /** Materialization is needed, since from distributed storage the constants come materialized. 
* If you do not do this, different types (Const and non-Const) columns will be produced in different threads, @@ -1052,99 +1132,10 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); } - if (builder->initialized()) - { - if (concat_streams && builder->getNumStreams() > 1) - { - // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. - // Using concat instead. - builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); - } - - /// Add virtual columns if we don't already have them. - - Block pipe_header = builder->getHeader(); - - if (allow_experimental_analyzer) - { - String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); - - String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; - String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_table" : table_alias + "._table"; - - if (has_database_virtual_column && common_header.has(database_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(database_column)) - { - ColumnWithTypeAndName column; - column.name = database_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has(table_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(table_column)) - { - ColumnWithTypeAndName column; - column.name = table_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - else - { - if (has_database_virtual_column && common_header.has("_database") && !pipe_header.has("_database")) - { - ColumnWithTypeAndName column; - column.name = "_database"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has("_table") && !pipe_header.has("_table")) - { - ColumnWithTypeAndName column; - column.name = "_table"; - column.type = std::make_shared(std::make_shared()); - column.column = 
column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - - /// Subordinary tables could have different but convertible types, like numeric types of different width. - /// We must return streams with structure equals to structure of Merge table. - convertAndFilterSourceStream( - header, modified_query_info, storage_snapshot_, aliases, row_policy_data_opt, context, *builder, storage_stage); - } - return builder; } -QueryPlan ReadFromMerge::createPlanForTable( +ReadFromMerge::ChildPlan ReadFromMerge::createPlanForTable( const StorageSnapshotPtr & storage_snapshot_, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, @@ -1181,35 +1172,14 @@ QueryPlan ReadFromMerge::createPlanForTable( if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot_->metadata->getColumns().getAllPhysical()).name); - StorageView * view = dynamic_cast(storage.get()); - if (!view || allow_experimental_analyzer) - { - storage->read(plan, - real_column_names, - storage_snapshot_, - modified_query_info, - modified_context, - processed_stage, - max_block_size, - UInt32(streams_num)); - } - else - { - /// For view storage, we need to rewrite the `modified_query_info.view_query` to optimize read. - /// The most intuitive way is to use InterpreterSelectQuery. - - /// Intercept the settings - modified_context->setSetting("max_threads", streams_num); - modified_context->setSetting("max_streams_to_max_threads_ratio", 1); - modified_context->setSetting("max_block_size", max_block_size); - - InterpreterSelectQuery interpreter(modified_query_info.query, - modified_context, - storage, - view->getInMemoryMetadataPtr(), - SelectQueryOptions(processed_stage)); - interpreter.buildQueryPlan(plan); - } + storage->read(plan, + real_column_names, + storage_snapshot_, + modified_query_info, + modified_context, + processed_stage, + max_block_size, + UInt32(streams_num)); if (!plan.isInitialized()) return {}; @@ -1248,7 +1218,7 @@ QueryPlan ReadFromMerge::createPlanForTable( } } - return plan; + return ChildPlan{std::move(plan), storage_stage}; } ReadFromMerge::RowPolicyData::RowPolicyData(RowPolicyFilterPtr row_policy_filter_ptr, @@ -1306,12 +1276,10 @@ void ReadFromMerge::RowPolicyData::addStorageFilter(SourceStepWithFilter * step) step->addFilter(actions_dag, filter_column_name); } -void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPipelineBuilder & builder) const +void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPlan & plan) const { - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, filter_actions, filter_column_name, true /* remove filter column */); - }); + auto filter_step = std::make_unique(plan.getCurrentDataStream(), actions_dag, filter_column_name, true /* remove filter column */); + plan.addStep(std::move(filter_step)); } StorageMerge::StorageListWithLocks ReadFromMerge::getSelectedTables( @@ -1490,13 +1458,12 @@ void ReadFromMerge::convertAndFilterSourceStream( const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr local_context, - QueryPipelineBuilder & builder, - 
QueryProcessingStage::Enum processed_stage) + ChildPlan & child) { - Block before_block_header = builder.getHeader(); + Block before_block_header = child.plan.getCurrentDataStream().header; auto storage_sample_block = snapshot->metadata->getSampleBlock(); - auto pipe_columns = builder.getHeader().getNamesAndTypesList(); + auto pipe_columns = before_block_header.getNamesAndTypesList(); if (local_context->getSettingsRef().allow_experimental_analyzer) { @@ -1519,13 +1486,8 @@ void ReadFromMerge::convertAndFilterSourceStream( throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); - - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } else @@ -1539,37 +1501,26 @@ void ReadFromMerge::convertAndFilterSourceStream( auto dag = std::make_shared(pipe_columns); auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; if (local_context->getSettingsRef().allow_experimental_analyzer - && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) + && (child.stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; if (row_policy_data_opt) - { - row_policy_data_opt->addFilterTransform(builder); - } + row_policy_data_opt->addFilterTransform(child.plan); - auto convert_actions_dag = ActionsDAG::makeConvertingActions(builder.getHeader().getColumnsWithTypeAndName(), + auto convert_actions_dag = ActionsDAG::makeConvertingActions(child.plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), convert_actions_match_columns_mode); - auto actions = std::make_shared( - std::move(convert_actions_dag), - ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), convert_actions_dag); + child.plan.addStep(std::move(expression_step)); } const ReadFromMerge::StorageListWithLocks & ReadFromMerge::getSelectedTables() @@ -1606,29 +1557,14 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) return true; } -void ReadFromMerge::applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const -{ - auto apply_filters = [&added_filter_nodes](ReadFromMergeTree & read_from_merge_tree) - { - for (const auto & node : added_filter_nodes.nodes) - read_from_merge_tree.addFilterFromParentStep(node); - - 
read_from_merge_tree.SourceStepWithFilter::applyFilters(); - return true; - }; - - recursivelyApplyToReadingSteps(plan.getRootNode(), apply_filters); -} - void ReadFromMerge::applyFilters(ActionDAGNodes added_filter_nodes) { + for (const auto & filter_info : pushed_down_filters) + added_filter_nodes.nodes.push_back(&filter_info.actions->findInOutputs(filter_info.column_name)); + SourceStepWithFilter::applyFilters(added_filter_nodes); filterTablesAndCreateChildrenPlans(); - - for (const auto & child_plan : *child_plans) - if (child_plan.plan.isInitialized()) - applyFilters(child_plan.plan, added_filter_nodes); } QueryPlanRawPtrs ReadFromMerge::getChildPlans() diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 735c8711a63..94b34256d02 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -165,7 +165,7 @@ public: QueryPlanRawPtrs getChildPlans() override; - void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) override; + void addFilter(FilterDAGInfo filter); private: const size_t required_max_block_size; @@ -221,7 +221,7 @@ private: /// Create explicit filter transform to exclude /// rows that are not conform to row level policy - void addFilterTransform(QueryPipelineBuilder &) const; + void addFilterTransform(QueryPlan &) const; private: std::string filter_column_name; // complex filter, may contain logic operations @@ -235,21 +235,21 @@ private: struct ChildPlan { QueryPlan plan; - Aliases table_aliases; - RowPolicyDataOpt row_policy_data_opt; + QueryProcessingStage::Enum stage; }; /// Store read plan for each child table. /// It's needed to guarantee lifetime for child steps to be the same as for this step (mainly for EXPLAIN PIPELINE). std::optional> child_plans; + /// Store filters pushed down from query plan optimization. Filters are added on top of child plans. 
+ std::vector pushed_down_filters; + std::vector createChildrenPlans(SelectQueryInfo & query_info_) const; void filterTablesAndCreateChildrenPlans(); - void applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const; - - QueryPlan createPlanForTable( + ChildPlan createPlanForTable( const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & query_info, QueryProcessingStage::Enum processed_stage, @@ -260,16 +260,15 @@ private: ContextMutablePtr modified_context, size_t streams_num) const; - QueryPipelineBuilderPtr createSources( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot, + void addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams = false) const; + const StorageWithLockAndName & storage_with_lock) const; + + QueryPipelineBuilderPtr buildPipeline( + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const; static void convertAndFilterSourceStream( const Block & header, @@ -278,15 +277,12 @@ private: const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr context, - QueryPipelineBuilder & builder, - QueryProcessingStage::Enum processed_stage); + ChildPlan & child); StorageMerge::StorageListWithLocks getSelectedTables( ContextPtr query_context, bool filter_by_database_virtual_column, bool filter_by_table_virtual_column) const; - - // static VirtualColumnsDescription createVirtuals(StoragePtr first_table); }; } diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp deleted file mode 100644 index b767805f637..00000000000 --- a/src/Storages/StorageS3Settings.cpp +++ /dev/null @@ -1,315 +0,0 @@ -#include - -#include - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INVALID_SETTING_VALUE; -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const Settings & settings, bool validate_settings) -{ - updateFromSettings(settings, false); - if (validate_settings) - validate(); -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix, - bool validate_settings) - : PartUploadSettings(settings, validate_settings) -{ - String key = config_prefix + "." 
+ setting_name_prefix; - strict_upload_part_size = config.getUInt64(key + "strict_upload_part_size", strict_upload_part_size); - min_upload_part_size = config.getUInt64(key + "min_upload_part_size", min_upload_part_size); - max_upload_part_size = config.getUInt64(key + "max_upload_part_size", max_upload_part_size); - upload_part_size_multiply_factor = config.getUInt64(key + "upload_part_size_multiply_factor", upload_part_size_multiply_factor); - upload_part_size_multiply_parts_count_threshold = config.getUInt64(key + "upload_part_size_multiply_parts_count_threshold", upload_part_size_multiply_parts_count_threshold); - max_inflight_parts_for_one_file = config.getUInt64(key + "max_inflight_parts_for_one_file", max_inflight_parts_for_one_file); - max_part_number = config.getUInt64(key + "max_part_number", max_part_number); - max_single_part_upload_size = config.getUInt64(key + "max_single_part_upload_size", max_single_part_upload_size); - max_single_operation_copy_size = config.getUInt64(key + "max_single_operation_copy_size", max_single_operation_copy_size); - - /// This configuration is only applicable to s3. Other types of object storage are not applicable or have different meanings. - storage_class_name = config.getString(config_prefix + ".s3_storage_class", storage_class_name); - storage_class_name = Poco::toUpperInPlace(storage_class_name); - - if (validate_settings) - validate(); -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const NamedCollection & collection) -{ - strict_upload_part_size = collection.getOrDefault("strict_upload_part_size", strict_upload_part_size); - min_upload_part_size = collection.getOrDefault("min_upload_part_size", min_upload_part_size); - max_single_part_upload_size = collection.getOrDefault("max_single_part_upload_size", max_single_part_upload_size); - upload_part_size_multiply_factor = collection.getOrDefault("upload_part_size_multiply_factor", upload_part_size_multiply_factor); - upload_part_size_multiply_parts_count_threshold = collection.getOrDefault("upload_part_size_multiply_parts_count_threshold", upload_part_size_multiply_parts_count_threshold); - max_inflight_parts_for_one_file = collection.getOrDefault("max_inflight_parts_for_one_file", max_inflight_parts_for_one_file); - - /// This configuration is only applicable to s3. Other types of object storage are not applicable or have different meanings. 
- storage_class_name = collection.getOrDefault("s3_storage_class", storage_class_name); - storage_class_name = Poco::toUpperInPlace(storage_class_name); - - validate(); -} - -void S3Settings::RequestSettings::PartUploadSettings::updateFromSettings(const Settings & settings, bool if_changed) -{ - if (!if_changed || settings.s3_strict_upload_part_size.changed) - strict_upload_part_size = settings.s3_strict_upload_part_size; - - if (!if_changed || settings.s3_min_upload_part_size.changed) - min_upload_part_size = settings.s3_min_upload_part_size; - - if (!if_changed || settings.s3_max_upload_part_size.changed) - max_upload_part_size = settings.s3_max_upload_part_size; - - if (!if_changed || settings.s3_upload_part_size_multiply_factor.changed) - upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor; - - if (!if_changed || settings.s3_upload_part_size_multiply_parts_count_threshold.changed) - upload_part_size_multiply_parts_count_threshold = settings.s3_upload_part_size_multiply_parts_count_threshold; - - if (!if_changed || settings.s3_max_inflight_parts_for_one_file.changed) - max_inflight_parts_for_one_file = settings.s3_max_inflight_parts_for_one_file; - - if (!if_changed || settings.s3_max_single_part_upload_size.changed) - max_single_part_upload_size = settings.s3_max_single_part_upload_size; -} - -void S3Settings::RequestSettings::PartUploadSettings::validate() -{ - static constexpr size_t min_upload_part_size_limit = 5 * 1024 * 1024; - if (strict_upload_part_size && strict_upload_part_size < min_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting strict_upload_part_size has invalid value {} which is less than the s3 API limit {}", - ReadableSize(strict_upload_part_size), ReadableSize(min_upload_part_size_limit)); - - if (min_upload_part_size < min_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting min_upload_part_size has invalid value {} which is less than the s3 API limit {}", - ReadableSize(min_upload_part_size), ReadableSize(min_upload_part_size_limit)); - - static constexpr size_t max_upload_part_size_limit = 5ull * 1024 * 1024 * 1024; - if (max_upload_part_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_upload_part_size has invalid value {} which is greater than the s3 API limit {}", - ReadableSize(max_upload_part_size), ReadableSize(max_upload_part_size_limit)); - - if (max_single_part_upload_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_single_part_upload_size has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_single_part_upload_size), ReadableSize(max_upload_part_size_limit)); - - if (max_single_operation_copy_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_single_operation_copy_size has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_single_operation_copy_size), ReadableSize(max_upload_part_size_limit)); - - if (max_upload_part_size < min_upload_part_size) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_upload_part_size ({}) can't be less than setting min_upload_part_size {}", - ReadableSize(max_upload_part_size), ReadableSize(min_upload_part_size)); - - if (!upload_part_size_multiply_factor) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_factor cannot be zero"); - - if 
(!upload_part_size_multiply_parts_count_threshold) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_parts_count_threshold cannot be zero"); - - if (!max_part_number) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_part_number cannot be zero"); - - static constexpr size_t max_part_number_limit = 10000; - if (max_part_number > max_part_number_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_part_number has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); - - size_t maybe_overflow; - if (common::mulOverflow(max_upload_part_size, upload_part_size_multiply_factor, maybe_overflow)) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_factor is too big ({}). " - "Multiplication to max_upload_part_size ({}) will cause integer overflow", - ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); - - std::unordered_set storage_class_names {"STANDARD", "INTELLIGENT_TIERING"}; - if (!storage_class_name.empty() && !storage_class_names.contains(storage_class_name)) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting storage_class has invalid value {} which only supports STANDARD and INTELLIGENT_TIERING", - storage_class_name); - - /// TODO: it's possible to set too small limits. We can check that max possible object size is not too small. -} - - -S3Settings::RequestSettings::RequestSettings(const Settings & settings, bool validate_settings) - : upload_settings(settings, validate_settings) -{ - updateFromSettingsImpl(settings, false); -} - -S3Settings::RequestSettings::RequestSettings(const NamedCollection & collection) - : upload_settings(collection) -{ - max_single_read_retries = collection.getOrDefault("max_single_read_retries", max_single_read_retries); - max_connections = collection.getOrDefault("max_connections", max_connections); - list_object_keys_size = collection.getOrDefault("list_object_keys_size", list_object_keys_size); - allow_native_copy = collection.getOrDefault("allow_native_copy", allow_native_copy); - throw_on_zero_files_match = collection.getOrDefault("throw_on_zero_files_match", throw_on_zero_files_match); -} - -S3Settings::RequestSettings::RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix, - bool validate_settings) - : upload_settings(config, config_prefix, settings, setting_name_prefix, validate_settings) -{ - String key = config_prefix + "." 
+ setting_name_prefix; - max_single_read_retries = config.getUInt64(key + "max_single_read_retries", settings.s3_max_single_read_retries); - max_connections = config.getUInt64(key + "max_connections", settings.s3_max_connections); - check_objects_after_upload = config.getBool(key + "check_objects_after_upload", settings.s3_check_objects_after_upload); - list_object_keys_size = config.getUInt64(key + "list_object_keys_size", settings.s3_list_object_keys_size); - allow_native_copy = config.getBool(key + "allow_native_copy", allow_native_copy); - throw_on_zero_files_match = config.getBool(key + "throw_on_zero_files_match", settings.s3_throw_on_zero_files_match); - retry_attempts = config.getUInt64(key + "retry_attempts", settings.s3_retry_attempts); - request_timeout_ms = config.getUInt64(key + "request_timeout_ms", settings.s3_request_timeout_ms); - - /// NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, - /// which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. - if (UInt64 max_get_rps = config.getUInt64(key + "max_get_rps", settings.s3_max_get_rps)) - { - size_t default_max_get_burst = settings.s3_max_get_burst - ? settings.s3_max_get_burst - : (Throttler::default_burst_seconds * max_get_rps); - - size_t max_get_burst = config.getUInt64(key + "max_get_burst", default_max_get_burst); - - get_request_throttler = std::make_shared(max_get_rps, max_get_burst); - } - if (UInt64 max_put_rps = config.getUInt64(key + "max_put_rps", settings.s3_max_put_rps)) - { - size_t default_max_put_burst = settings.s3_max_put_burst - ? settings.s3_max_put_burst - : (Throttler::default_burst_seconds * max_put_rps); - - size_t max_put_burst = config.getUInt64(key + "max_put_burst", default_max_put_burst); - - put_request_throttler = std::make_shared(max_put_rps, max_put_burst); - } -} - -void S3Settings::RequestSettings::updateFromSettingsImpl(const Settings & settings, bool if_changed) -{ - if (!if_changed || settings.s3_max_single_read_retries.changed) - max_single_read_retries = settings.s3_max_single_read_retries; - - if (!if_changed || settings.s3_max_connections.changed) - max_connections = settings.s3_max_connections; - - if (!if_changed || settings.s3_check_objects_after_upload.changed) - check_objects_after_upload = settings.s3_check_objects_after_upload; - - if (!if_changed || settings.s3_max_unexpected_write_error_retries.changed) - max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; - - if (!if_changed || settings.s3_list_object_keys_size.changed) - list_object_keys_size = settings.s3_list_object_keys_size; - - if ((!if_changed || settings.s3_max_get_rps.changed || settings.s3_max_get_burst.changed) && settings.s3_max_get_rps) - get_request_throttler = std::make_shared( - settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); - - if ((!if_changed || settings.s3_max_put_rps.changed || settings.s3_max_put_burst.changed) && settings.s3_max_put_rps) - put_request_throttler = std::make_shared( - settings.s3_max_put_rps, settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); - - if (!if_changed || settings.s3_throw_on_zero_files_match.changed) - throw_on_zero_files_match = settings.s3_throw_on_zero_files_match; - - if (!if_changed || settings.s3_retry_attempts.changed) - retry_attempts = settings.s3_retry_attempts; - - if (!if_changed || settings.s3_request_timeout_ms.changed) - request_timeout_ms = settings.s3_request_timeout_ms; -} - -void S3Settings::RequestSettings::updateFromSettingsIfChanged(const Settings & settings) -{ - updateFromSettingsImpl(settings, true); - upload_settings.updateFromSettings(settings, true); -} - -void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings) -{ - std::lock_guard lock(mutex); - s3_settings.clear(); - if (!config.has(config_elem)) - return; - - Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(config_elem, config_keys); - - for (const String & key : config_keys) - { - if (config.has(config_elem + "." + key + ".endpoint")) - { - auto endpoint = config.getString(config_elem + "." + key + ".endpoint"); - auto auth_settings = S3::AuthSettings::loadFromConfig(config_elem + "." + key, config); - S3Settings::RequestSettings request_settings(config, config_elem + "." + key, settings); - - s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); - } - } -} - -std::optional StorageS3Settings::getSettings(const String & endpoint, const String & user, bool ignore_user) const -{ - std::lock_guard lock(mutex); - auto next_prefix_setting = s3_settings.upper_bound(endpoint); - - /// Linear time algorithm may be replaced with logarithmic with prefix tree map. - for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) - { - std::advance(possible_prefix_setting, -1); - const auto & [endpoint_prefix, settings] = *possible_prefix_setting; - if (endpoint.starts_with(endpoint_prefix) && (ignore_user || settings.auth_settings.canBeUsedByUser(user))) - return possible_prefix_setting->second; - } - - return {}; -} - -} diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h deleted file mode 100644 index c3bc8aa6ed6..00000000000 --- a/src/Storages/StorageS3Settings.h +++ /dev/null @@ -1,122 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace Poco::Util -{ -class AbstractConfiguration; -} - -namespace DB -{ - -struct Settings; -class NamedCollection; - -struct S3Settings -{ - struct RequestSettings - { - struct PartUploadSettings - { - size_t strict_upload_part_size = 0; - size_t min_upload_part_size = 16 * 1024 * 1024; - size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; - size_t upload_part_size_multiply_factor = 2; - size_t upload_part_size_multiply_parts_count_threshold = 500; - size_t max_inflight_parts_for_one_file = 20; - size_t max_part_number = 10000; - size_t max_single_part_upload_size = 32 * 1024 * 1024; - size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024; - String storage_class_name; - - void updateFromSettings(const Settings & settings, bool if_changed); - void validate(); - - private: - PartUploadSettings() = default; - explicit PartUploadSettings(const Settings & settings, bool validate_settings = true); - explicit PartUploadSettings(const NamedCollection & collection); - PartUploadSettings( - const Poco::Util::AbstractConfiguration 
& config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix = {}, - bool validate_settings = true); - - friend struct RequestSettings; - }; - - private: - PartUploadSettings upload_settings = {}; - - public: - size_t max_single_read_retries = 4; - size_t max_connections = 1024; - bool check_objects_after_upload = false; - size_t max_unexpected_write_error_retries = 4; - size_t list_object_keys_size = 1000; - ThrottlerPtr get_request_throttler; - ThrottlerPtr put_request_throttler; - size_t retry_attempts = 10; - size_t request_timeout_ms = 30000; - bool allow_native_copy = true; - - bool throw_on_zero_files_match = false; - - const PartUploadSettings & getUploadSettings() const { return upload_settings; } - PartUploadSettings & getUploadSettings() { return upload_settings; } - - void setStorageClassName(const String & storage_class_name) { upload_settings.storage_class_name = storage_class_name; } - - RequestSettings() = default; - explicit RequestSettings(const Settings & settings, bool validate_settings = true); - explicit RequestSettings(const NamedCollection & collection); - - /// What's the setting_name_prefix, and why do we need it? - /// There are (at least) two config sections where s3 settings can be specified: - /// * settings for s3 disk (clickhouse/storage_configuration/disks) - /// * settings for s3 storage (clickhouse/s3), which are also used for backups - /// Even though settings are the same, in case of s3 disk they are prefixed with "s3_" - /// ("s3_max_single_part_upload_size"), but in case of s3 storage they are not - /// ( "max_single_part_upload_size"). Why this happened is a complete mystery to me. - RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix = {}, - bool validate_settings = true); - - void updateFromSettingsIfChanged(const Settings & settings); - - private: - void updateFromSettingsImpl(const Settings & settings, bool if_changed); - }; - - S3::AuthSettings auth_settings; - RequestSettings request_settings; -}; - -/// Settings for the StorageS3. 
-class StorageS3Settings -{ -public: - void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings); - - std::optional getSettings(const String & endpoint, const String & user, bool ignore_user = false) const; - -private: - mutable std::mutex mutex; - std::map s3_settings; -}; - -} diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py new file mode 100755 index 00000000000..fcb61d3f605 --- /dev/null +++ b/tests/ci/changelog.py @@ -0,0 +1,455 @@ +#!/usr/bin/env python3 +# In our CI this script runs in style-test containers + +import argparse +import logging +import re +from datetime import date, timedelta +from pathlib import Path +from subprocess import DEVNULL +from typing import Any, Dict, List, Optional, TextIO + +import tqdm # type: ignore +from github.GithubException import RateLimitExceededException, UnknownObjectException +from github.NamedUser import NamedUser +from thefuzz.fuzz import ratio # type: ignore + +from cache_utils import GitHubCache +from env_helper import TEMP_PATH +from git_helper import git_runner, is_shallow +from github_helper import GitHub, PullRequest, PullRequests, Repository +from s3_helper import S3Helper +from version_helper import ( + FILE_WITH_VERSION_PATH, + get_abs_path, + get_version_from_repo, + get_version_from_tag, +) + +# This array gives the preferred category order, and is also used to +# normalize category names. +# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there +# updated accordingly +categories_preferred_order = ( + "Backward Incompatible Change", + "New Feature", + "Performance Improvement", + "Improvement", + "Critical Bug Fix", + "Bug Fix", + "Build/Testing/Packaging Improvement", + "Other", +) + +FROM_REF = "" +TO_REF = "" +SHA_IN_CHANGELOG = [] # type: List[str] +gh = GitHub(create_cache_dir=False) +runner = git_runner + + +class Description: + def __init__( + self, number: int, user: NamedUser, html_url: str, entry: str, category: str + ): + self.number = number + self.html_url = html_url + self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore + self.entry = entry + self.category = category + + @property + def formatted_entry(self) -> str: + # Substitute issue links. + # 1) issue number w/o markdown link + entry = re.sub( + r"([^[])#([0-9]{4,})", + r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)", + self.entry, + ) + # 2) issue URL w/o markdown link + # including #issuecomment-1 or #event-12 + entry = re.sub( + r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)", + r"\1[#\3](\2)", + entry, + ) + # It's possible that we face a secondary rate limit. + # In this case we should sleep until we get it + while True: + try: + user_name = self.user.name if self.user.name else self.user.login + break + except UnknownObjectException: + user_name = self.user.login + break + except RateLimitExceededException: + gh.sleep_on_rate_limit() + return ( + f"* {entry} [#{self.number}]({self.html_url}) " + f"([{user_name}]({self.user.html_url}))." 
+ ) + + # Sort PR descriptions by numbers + def __eq__(self, other: Any) -> bool: + if not isinstance(self, type(other)): + raise NotImplementedError + return bool(self.number == other.number) + + def __lt__(self, other: "Description") -> bool: + return self.number < other.number + + +def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: + descriptions = {} # type: Dict[str, List[Description]] + repos = {} # type: Dict[str, Repository] + for pr in prs: + # See https://github.com/PyGithub/PyGithub/issues/2202, + # obj._rawData doesn't spend additional API requests + # We'll save some requests + # pylint: disable=protected-access + repo_name = pr._rawData["base"]["repo"]["full_name"] + # pylint: enable=protected-access + if repo_name not in repos: + repos[repo_name] = pr.base.repo + in_changelog = False + merge_commit = pr.merge_commit_sha + if merge_commit is None: + logging.warning("PR %s does not have merge-commit, skipping", pr.number) + continue + + in_changelog = merge_commit in SHA_IN_CHANGELOG + if in_changelog: + desc = generate_description(pr, repos[repo_name]) + if desc: + if desc.category not in descriptions: + descriptions[desc.category] = [] + descriptions[desc.category].append(desc) + + for descs in descriptions.values(): + descs.sort() + + return descriptions + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Generate a changelog in Markdown format between given tags. " + "It fetches all tags and unshallow the git repository automatically", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="set the script verbosity, could be used multiple", + ) + parser.add_argument( + "--debug-helpers", + action="store_true", + help="add debug logging for git_helper and github_helper", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="-", + help="output file for changelog", + ) + parser.add_argument( + "--repo", + default="ClickHouse/ClickHouse", + help="a repository to query for pull-requests from GitHub", + ) + parser.add_argument( + "--jobs", + type=int, + default=10, + help="number of jobs to get pull-requests info from GitHub API", + ) + parser.add_argument( + "--gh-user-or-token", + help="user name or GH token to authenticate", + ) + parser.add_argument( + "--gh-password", + help="a password that should be used when user is given", + ) + parser.add_argument( + "--with-testing-tags", + action="store_true", + help="by default '*-testing' tags are ignored, this argument enables them too", + ) + parser.add_argument( + "--from", + dest="from_ref", + help="git ref for a starting point of changelog, by default is calculated " + "automatically to match a previous tag in history", + ) + parser.add_argument( + "to_ref", + metavar="TO_REF", + help="git ref for the changelog end", + ) + args = parser.parse_args() + return args + + +# This function mirrors the PR description checks in ClickhousePullRequestTrigger. +# Returns None if the PR should not be mentioned in changelog. 
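A minimal standalone sketch of the two substitutions performed in `Description.formatted_entry` above; the patterns are copied from the diff, while the helper name `linkify_issues` and the issue numbers in the usage line are made up for illustration:

```python
import re


def linkify_issues(entry: str) -> str:
    # Hypothetical standalone helper mirroring Description.formatted_entry.
    # 1) bare "#12345" issue numbers that are not already inside a markdown link
    entry = re.sub(
        r"([^[])#([0-9]{4,})",
        r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
        entry,
    )
    # 2) bare issue URLs (possibly with #issuecomment-... fragments) that are
    #    not already the target of a markdown link
    entry = re.sub(
        r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)",
        r"\1[#\3](\2)",
        entry,
    )
    return entry


print(linkify_issues("Closes #62345, see https://github.com/ClickHouse/ClickHouse/issues/62346."))
```

Because the first pattern requires a non-`[` character before `#` and the second a non-`(` character before the URL, references that are already wrapped in a markdown link are left untouched.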
+def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: + backport_number = item.number + if item.head.ref.startswith("backport/"): + branch_parts = item.head.ref.split("/") + if len(branch_parts) == 3: + try: + item = gh.get_pull_cached(repo, int(branch_parts[-1])) + except Exception as e: + logging.warning("unable to get backpoted PR, exception: %s", e) + else: + logging.warning( + "The branch %s doesn't match backport template, using PR %s as is", + item.head.ref, + item.number, + ) + description = item.body + # Don't skip empty lines because they delimit parts of description + lines = [x.strip() for x in (description.split("\n") if description else [])] + lines = [re.sub(r"\s+", " ", ln) for ln in lines] + + category = "" + entry = "" + + if lines: + i = 0 + while i < len(lines): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): + i += 1 + if i >= len(lines): + break + # Can have one empty line between header and the category itself. + # Filter it out. + if not lines[i]: + i += 1 + if i >= len(lines): + break + category = re.sub(r"^[-*\s]*", "", lines[i]) + i += 1 + elif re.match( + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + ): + i += 1 + # Can have one empty line between header and the entry itself. + # Filter it out. + if i < len(lines) and not lines[i]: + i += 1 + # All following lines until empty one are the changelog entry. + entry_lines = [] + while i < len(lines) and lines[i]: + entry_lines.append(lines[i]) + i += 1 + entry = " ".join(entry_lines) + else: + i += 1 + + # Remove excessive bullets from the entry. + if re.match(r"^[\-\*] ", entry): + entry = entry[2:] + + # Better style. + if re.match(r"^[a-z]", entry): + entry = entry.capitalize() + + if not category: + # Shouldn't happen, because description check in CI should catch such PRs. + # Fall through, so that it shows up in output and the user can fix it. + category = "NO CL CATEGORY" + + # Filter out documentations changelog before not-for-changelog + if re.match( + r"(?i)doc", + category, + ): + return None + + # Filter out the PR categories that are not for changelog. + if re.search( + r"(?i)((non|in|not|un)[-\s]*significant)|" + r"(not[ ]*for[ ]*changelog)|" + r"(changelog[ ]*entry[ ]*is[ ]*not[ ]*required)", + category, + ): + category = "NOT FOR CHANGELOG / INSIGNIFICANT" + entry = item.title + + # Normalize bug fixes + if re.match( + r"(?i)bug\Wfix", + category, + ): + category = "Bug Fix (user-visible misbehavior in an official stable release)" + + if backport_number != item.number: + entry = f"Backported in #{backport_number}: {entry}" + + if not entry: + # Shouldn't happen, because description check in CI should catch such PRs. + category = "NO CL ENTRY" + entry = "NO CL ENTRY: '" + item.title + "'" + + entry = entry.strip() + if entry[-1] != ".": + entry += "." 
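The category string extracted above is then snapped to one of the preferred names by fuzzy matching, as the loop that follows shows. A self-contained sketch of that normalization step, assuming a hypothetical helper name and a trimmed category list:

```python
from thefuzz.fuzz import ratio  # type: ignore

# Trimmed copy of categories_preferred_order, enough for the illustration.
PREFERRED = ("Bug Fix", "New Feature", "Performance Improvement", "Improvement")


def normalize_category(raw: str) -> str:
    # Hypothetical helper: the first preferred name with >= 90 fuzzy similarity
    # wins; otherwise the raw category is kept as-is.
    for preferred in PREFERRED:
        if ratio(raw.lower(), preferred.lower()) >= 90:
            return preferred
    return raw


print(normalize_category("improvement"))  # prints "Improvement"
```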
+ + for c in categories_preferred_order: + if ratio(category.lower(), c.lower()) >= 90: + category = c + break + + return Description(item.number, item.user, item.html_url, entry, category) + + +def write_changelog( + fd: TextIO, descriptions: Dict[str, List[Description]], year: int +) -> None: + to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] + from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] + fd.write( + f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" + f"# {year} Changelog\n\n" + f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " + f"as compared to {FROM_REF} ({from_commit})\n\n" + ) + + seen_categories = [] # type: List[str] + for category in categories_preferred_order: + if category in descriptions: + seen_categories.append(category) + fd.write(f"#### {category}\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + for category in sorted(descriptions): + if category not in seen_categories: + fd.write(f"#### {category}\n\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + +def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool) -> None: + global FROM_REF, TO_REF + TO_REF = to_ref + + # Check TO_REF + runner.run(f"git rev-parse {TO_REF}") + + # Check from_ref + if from_ref is not None: + runner.run(f"git rev-parse {FROM_REF}") + FROM_REF = from_ref + return + + # Get the cmake/autogenerated_versions.txt from FROM_REF to read the version + # If the previous tag is greater than version in the FROM_REF, + # then we need to add it to tags_to_exclude + temp_cmake = "tests/ci/tmp/autogenerated_versions.txt" + cmake_version = get_abs_path(temp_cmake) + cmake_version.write_text(runner(f"git show {TO_REF}:{FILE_WITH_VERSION_PATH}")) + to_ref_version = get_version_from_repo(cmake_version) + # Get all tags pointing to TO_REF + excluded_tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") + logging.info("All tags pointing to %s:\n%s", TO_REF, excluded_tags) + if not with_testing_tags: + excluded_tags.append("*-testing") + while not from_ref: + exclude = " ".join([f"--exclude='{tag}'" for tag in excluded_tags]) + from_ref_tag = runner(f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'") + from_ref_version = get_version_from_tag(from_ref_tag) + if from_ref_version <= to_ref_version: + from_ref = from_ref_tag + break + excluded_tags.append(from_ref_tag) + + cmake_version.unlink() + FROM_REF = from_ref + + +def set_sha_in_changelog(): + global SHA_IN_CHANGELOG + SHA_IN_CHANGELOG = runner.run( + f"git log --format=format:%H {FROM_REF}..{TO_REF}" + ).split("\n") + + +def get_year(prs: PullRequests) -> int: + if not prs: + return date.today().year + return max(pr.created_at.year for pr in prs) + + +def main(): + log_levels = [logging.WARN, logging.INFO, logging.DEBUG] + args = parse_args() + logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", + level=log_levels[min(args.verbose, 2)], + ) + if args.debug_helpers: + logging.getLogger("github_helper").setLevel(logging.DEBUG) + logging.getLogger("git_helper").setLevel(logging.DEBUG) + + # Get the full repo + if is_shallow(): + logging.info("Unshallow repository") + runner.run("git fetch --unshallow", stderr=DEVNULL) + logging.info("Fetching all tags") + runner.run("git fetch --tags", stderr=DEVNULL) + + check_refs(args.from_ref, args.to_ref, args.with_testing_tags) + set_sha_in_changelog() + + logging.info("Using %s..%s as 
changelog interval", FROM_REF, TO_REF) + + # use merge-base commit as a starting point, if used ref in another branch + base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") + # Get starting and ending dates for gathering PRs + # Add one day after and before to mitigate TZ possible issues + # `tag^{}` format gives commit ref when we have annotated tags + # format %cs gives a committer date, works better for cherry-picked commits + from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") + to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") + merged = ( + date.fromisoformat(from_date) - timedelta(1), + date.fromisoformat(to_date) + timedelta(1), + ) + + # Get all PRs for the given time frame + global gh + gh = GitHub( + args.gh_user_or_token, + args.gh_password, + create_cache_dir=False, + per_page=100, + pool_size=args.jobs, + ) + temp_path = Path(TEMP_PATH) + gh_cache = GitHubCache(gh.cache_path, temp_path, S3Helper()) + gh_cache.download() + query = f"type:pr repo:{args.repo} is:merged" + prs = gh.get_pulls_from_search( + query=query, merged=merged, sort="created", progress_func=tqdm.tqdm + ) + + descriptions = get_descriptions(prs) + changelog_year = get_year(prs) + + write_changelog(args.output, descriptions, changelog_year) + gh_cache.upload() + + +if __name__ == "__main__": + main() diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 629464d0422..459be12ada0 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -532,9 +532,9 @@ class Backport: for br in branches: br.process(self.dry_run) - for br in branches: - if br.backported: - self.mark_pr_backported(pr) + if all(br.backported for br in branches): + # And check it after the running + self.mark_pr_backported(pr) def mark_pr_backported(self, pr: PullRequest) -> None: if self.dry_run: diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 12756599865..904b565ad86 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 +import argparse import logging -import sys from github import Github @@ -20,84 +20,98 @@ from report import FAILURE, PENDING, SUCCESS, StatusType from synchronizer_utils import SYNC_BRANCH_PREFIX +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Script to merge the given PR. 
Additional checks for approved " + "status and green commit statuses could be done", + ) + parser.add_argument( + "--wf-status", + type=str, + default="", + help="overall workflow status [success|failure]", + ) + return parser.parse_args() + + def main(): logging.basicConfig(level=logging.INFO) + args = parse_args() - has_failure = False - - # FIXME: temporary hack to fail Mergeable Check in MQ if pipeline has any failed jobs - if len(sys.argv) > 1 and sys.argv[1] == "--pipeline-failure": - has_failure = True + has_workflow_failures = args.wf_status == FAILURE pr_info = PRInfo(need_orgs=True) gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) - statuses = None if pr_info.is_merge_queue: - # in MQ Mergeable check status must never be green if any failures in workflow - if has_failure: - set_mergeable_check(commit, "workflow failed", "failure") + # in MQ Mergeable check status must never be green if any failures in the workflow + if has_workflow_failures: + set_mergeable_check(commit, "workflow failed", FAILURE) else: # This must be the only place where green MCheck is set in the MQ (in the end of CI) to avoid early merge - set_mergeable_check(commit, "workflow passed", "success") - else: - statuses = get_commit_filtered_statuses(commit) - state = trigger_mergeable_check(commit, statuses, set_if_green=True) + set_mergeable_check(commit, "workflow passed", SUCCESS) + return - # Process upstream StatusNames.SYNC - if ( - pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") - and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY - ): - upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) - update_upstream_sync_status( - upstream_pr_number, - pr_info.number, - gh, - state, - can_set_green_mergeable_status=True, - ) + statuses = get_commit_filtered_statuses(commit) + state = trigger_mergeable_check(commit, statuses, set_if_green=True) - ci_running_statuses = [s for s in statuses if s.context == StatusNames.CI] - if not ci_running_statuses: - return - # Take the latest status - ci_status = ci_running_statuses[-1] + # Process upstream StatusNames.SYNC + if ( + pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") + and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY + ): + upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) + update_upstream_sync_status( + upstream_pr_number, + pr_info.number, + gh, + state, + can_set_green_mergeable_status=True, + ) - has_failure = False - has_pending = False - for status in statuses: - if status.context in (StatusNames.MERGEABLE, StatusNames.CI): - # do not account these statuses - continue - if status.state == PENDING: - if status.context == StatusNames.SYNC: - # do not account sync status if pending - it's a different WF - continue - has_pending = True - elif status.state == SUCCESS: - continue - else: - has_failure = True + ci_running_statuses = [s for s in statuses if s.context == StatusNames.CI] + if not ci_running_statuses: + return + # Take the latest status + ci_status = ci_running_statuses[-1] - ci_state = SUCCESS # type: StatusType - if has_failure: - ci_state = FAILURE - elif has_pending: - print("ERROR: CI must not have pending jobs by the time of finish check") - ci_state = FAILURE + has_failure = False + has_pending = False + error_cnt = 0 + for status in statuses: + if status.context in (StatusNames.MERGEABLE, StatusNames.CI, StatusNames.SYNC): + # do not account these statuses + continue + if status.state == PENDING: + has_pending = True + elif status.state != SUCCESS: 
+ has_failure = True + error_cnt += 1 - if ci_status.state == PENDING: - post_commit_status( - commit, - ci_state, - ci_status.target_url, - "All checks finished", - StatusNames.CI, - pr_info, - dump_to_file=True, - ) + ci_state = SUCCESS # type: StatusType + description = "All checks finished" + if has_failure: + ci_state = FAILURE + description = f"All checks finished. {error_cnt} jobs failed" + elif has_workflow_failures: + ci_state = FAILURE + description = "All checks finished. Workflow has failures." + elif has_pending: + print("ERROR: CI must not have pending jobs by the time of finish check") + description = "ERROR: workflow has pending jobs" + ci_state = FAILURE + + post_commit_status( + commit, + ci_state, + ci_status.target_url, + description, + StatusNames.CI, + pr_info, + dump_to_file=True, + ) if __name__ == "__main__": diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index eb0f6c24527..431e6977091 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -6,7 +6,7 @@ from datetime import date, datetime, timedelta from os import path as p from pathlib import Path from time import sleep -from typing import List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import github import requests @@ -49,38 +49,43 @@ class GitHub(github.Github): """Wrapper around search method with throttling and splitting by date. We split only by the first""" - splittable = False + splittable_arg = "" + splittable_value = [] for arg, value in kwargs.items(): if arg in ["closed", "created", "merged", "updated"]: if hasattr(value, "__iter__") and not isinstance(value, str): - assert [True for v in value if isinstance(v, (date, datetime))] + assert all(True for v in value if isinstance(v, (date, datetime))) assert len(value) == 2 kwargs[arg] = f"{value[0].isoformat()}..{value[1].isoformat()}" - if not splittable: + if not splittable_arg: # We split only by the first met splittable argument - preserved_arg = arg - preserved_value = value middle_value = value[0] + (value[1] - value[0]) / 2 - splittable = middle_value not in value + if middle_value in value: + # When the middle value in itareble value, we can't use it + # to split by dates later + continue + splittable_arg = arg + splittable_value = value continue assert isinstance(value, (date, datetime, str)) inter_result = [] # type: Issues + exception = RateLimitExceededException(0) for i in range(self.retries): try: logger.debug("Search issues, args=%s, kwargs=%s", args, kwargs) result = super().search_issues(*args, **kwargs) - if result.totalCount == 1000 and splittable: + if result.totalCount == 1000 and splittable_arg: # The hard limit is 1000. If it's splittable, then we make # two subrequests requests with less time frames logger.debug( "The search result contain exactly 1000 results, " "splitting %s=%s by middle point %s", - preserved_arg, - kwargs[preserved_arg], + splittable_arg, + kwargs[splittable_arg], middle_value, ) - kwargs[preserved_arg] = [preserved_value[0], middle_value] + kwargs[splittable_arg] = [splittable_value[0], middle_value] inter_result.extend(self.search_issues(*args, **kwargs)) if isinstance(middle_value, date): # When middle_value is a date, 2022-01-01..2022-01-03 @@ -88,9 +93,10 @@ class GitHub(github.Github): # 2022-01-02..2022-01-03, so we have results for # 2022-01-02 twicely. We split it to # 2022-01-01..2022-01-02 and 2022-01-03..2022-01-03. 
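The date-range splitting that `search_issues` falls back to when GitHub caps a query at 1000 results boils down to one piece of arithmetic: halve the range and start the second half one day after the midpoint, so the middle day is not queried twice. A minimal sketch of just that arithmetic, assuming the hypothetical function name `split_date_range`:

```python
from datetime import date, timedelta


def split_date_range(start: date, end: date):
    # Hypothetical illustration of the middle-point split used by search_issues.
    middle = start + (end - start) / 2
    # The first half keeps the middle day; the second half starts the next day,
    # so results for the middle day are requested only once.
    return (start, middle), (middle + timedelta(days=1), end)


# 2022-01-01..2022-01-03 becomes 2022-01-01..2022-01-02 and 2022-01-03..2022-01-03
print(split_date_range(date(2022, 1, 1), date(2022, 1, 3)))
```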
- # 2022-01-01..2022-01-02 aren't split, see splittable + # 2022-01-01..2022-01-02 aren't split, see splittable_arg + # definition above for kwargs.items middle_value += timedelta(days=1) - kwargs[preserved_arg] = [middle_value, preserved_value[1]] + kwargs[splittable_arg] = [middle_value, splittable_value[1]] inter_result.extend(self.search_issues(*args, **kwargs)) return inter_result @@ -104,12 +110,15 @@ class GitHub(github.Github): raise exception # pylint: enable=signature-differs - def get_pulls_from_search(self, *args, **kwargs) -> PullRequests: # type: ignore + def get_pulls_from_search(self, *args: Any, **kwargs: Any) -> PullRequests: """The search api returns actually issues, so we need to fetch PullRequests""" + progress_func = kwargs.pop( + "progress_func", lambda x: x + ) # type: Callable[[Issues], Issues] issues = self.search_issues(*args, **kwargs) repos = {} prs = [] # type: PullRequests - for issue in issues: + for issue in progress_func(issues): # See https://github.com/PyGithub/PyGithub/issues/2202, # obj._rawData doesn't spend additional API requests # pylint: disable=protected-access diff --git a/tests/integration/helpers/s3_url_proxy_tests_util.py b/tests/integration/helpers/s3_url_proxy_tests_util.py index c67d00769c5..9a45855acb8 100644 --- a/tests/integration/helpers/s3_url_proxy_tests_util.py +++ b/tests/integration/helpers/s3_url_proxy_tests_util.py @@ -30,7 +30,7 @@ def check_proxy_logs( False ), f"{http_method} method not found in logs of {proxy_instance} for bucket {bucket}" - time.sleep(1) + time.sleep(1) def wait_resolver(cluster): @@ -124,3 +124,13 @@ def simple_storage_test(cluster, node, proxies, policy): # not checking for POST because it is in a different format check_proxy_logs(cluster, proxies, "http", policy, ["PUT", "GET"]) + + +def simple_test_assert_no_proxy(cluster, proxies, protocol, bucket): + minio_endpoint = build_s3_endpoint(protocol, bucket) + node = cluster.instances[bucket] + perform_simple_queries(node, minio_endpoint) + + # No HTTP method should be found in proxy logs if no proxy is active + empty_method_list = [] + check_proxy_logs(cluster, proxies, protocol, bucket, empty_method_list) diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 2afbae340be..1c8c5c33a13 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -139,12 +139,18 @@ def assert_logs_contain_with_retry(instance, substring, retry_count=20, sleep_ti def exec_query_with_retry( - instance, query, retry_count=40, sleep_time=0.5, silent=False, settings={} + instance, + query, + retry_count=40, + sleep_time=0.5, + silent=False, + settings={}, + timeout=30, ): exception = None for cnt in range(retry_count): try: - res = instance.query(query, timeout=30, settings=settings) + res = instance.query(query, timeout=timeout, settings=settings) if not silent: logging.debug(f"Result of {query} on {cnt} try is {res}") break diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index ea569939c1c..f23384b5c04 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -404,6 +404,8 @@ def test_alter_detach_part(started_cluster, engine): main_node.query(f"INSERT INTO {database}.alter_detach VALUES (123)") if engine == "MergeTree": dummy_node.query(f"INSERT INTO {database}.alter_detach VALUES (456)") + else: + main_node.query(f"SYSTEM SYNC REPLICA {database}.alter_detach 
PULL") main_node.query(f"ALTER TABLE {database}.alter_detach DETACH PART '{part_name}'") detached_parts_query = f"SELECT name FROM system.detached_parts WHERE database='{database}' AND table='alter_detach'" assert main_node.query(detached_parts_query) == f"{part_name}\n" diff --git a/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml new file mode 100644 index 00000000000..a1601153151 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml @@ -0,0 +1,9 @@ + + + not_important_host,, minio1 , + + http://proxy1 + http://proxy2 + + + diff --git a/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml new file mode 100644 index 00000000000..6c16a65b154 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml @@ -0,0 +1,18 @@ + + + not_important_host,, minio1 , + + + + http://resolver:8080/hostname + http + 80 + 10 + + + + diff --git a/tests/integration/test_s3_table_function_with_http_proxy/test.py b/tests/integration/test_s3_table_function_with_http_proxy/test.py index 76ad2109efc..2ec73ecbef6 100644 --- a/tests/integration/test_s3_table_function_with_http_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_http_proxy/test.py @@ -19,6 +19,14 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "remote_proxy_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_remote_no_proxy.xml", + ], + with_minio=True, + ) + cluster.add_instance( "proxy_list_node", main_configs=[ @@ -27,6 +35,14 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "proxy_list_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_list_no_proxy.xml", + ], + with_minio=True, + ) + cluster.add_instance( "env_node", with_minio=True, @@ -36,6 +52,16 @@ def cluster(): instance_env_variables=True, ) + cluster.add_instance( + "env_node_no_proxy", + with_minio=True, + env_variables={ + "http_proxy": "http://proxy1", + "no_proxy": "not_important_host,, minio1 ,", + }, + instance_env_variables=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -48,6 +74,24 @@ def cluster(): cluster.shutdown() +def test_s3_with_http_proxy_list_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1", "proxy2"], "http", "proxy_list_node_no_proxy" + ) + + +def test_s3_with_http_remote_proxy_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "http", "remote_proxy_node_no_proxy" + ) + + +def test_s3_with_http_env_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "http", "env_node_no_proxy" + ) + + def test_s3_with_http_proxy_list(cluster): proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "http", "proxy_list_node") diff --git a/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml new file mode 100644 index 00000000000..0a03986f839 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml @@ -0,0 +1,13 @@ + + + not_important_host,, minio1 , + + http://proxy1 
+ http://proxy2 + + + https://proxy1 + https://proxy2 + + + diff --git a/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml new file mode 100644 index 00000000000..943f2b36a34 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml @@ -0,0 +1,18 @@ + + + not_important_host,, minio1 , + + + + http://resolver:8080/hostname + https + 443 + 10 + + + + diff --git a/tests/integration/test_s3_table_function_with_https_proxy/test.py b/tests/integration/test_s3_table_function_with_https_proxy/test.py index 8b40b232742..54452dda401 100644 --- a/tests/integration/test_s3_table_function_with_https_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_https_proxy/test.py @@ -23,6 +23,15 @@ def cluster(): minio_certs_dir="minio_certs", ) + cluster.add_instance( + "remote_proxy_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_remote_no_proxy.xml", + "configs/config.d/ssl.xml", + ], + with_minio=True, + ) + cluster.add_instance( "proxy_list_node", main_configs=[ @@ -32,6 +41,15 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "proxy_list_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_list_no_proxy.xml", + "configs/config.d/ssl.xml", + ], + with_minio=True, + ) + cluster.add_instance( "env_node", main_configs=[ @@ -44,6 +62,19 @@ def cluster(): instance_env_variables=True, ) + cluster.add_instance( + "env_node_no_proxy", + main_configs=[ + "configs/config.d/ssl.xml", + ], + with_minio=True, + env_variables={ + "https_proxy": "https://proxy1", + "no_proxy": "not_important_host,, minio1 ,", + }, + instance_env_variables=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -56,6 +87,24 @@ def cluster(): cluster.shutdown() +def test_s3_with_https_proxy_list_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1", "proxy2"], "https", "proxy_list_node_no_proxy" + ) + + +def test_s3_with_https_env_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "https", "env_node_no_proxy" + ) + + +def test_s3_with_https_remote_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy" + ) + + def test_s3_with_https_proxy_list(cluster): proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node") diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9f5aef1489c..d986c1f9746 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -758,12 +758,12 @@ def test_read_subcolumns(cluster): ) res = node.query( - f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()), a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.tsv'," + f"select a.b.d, _path, a.b, _file, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.tsv'," f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "2\tcont/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t0\t3\n" + assert res == "2\tcont/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" res = 
node.query( f"select a.b.d, _path, a.b, _file, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.jsonl'," @@ -790,6 +790,25 @@ def test_read_subcolumns(cluster): assert res == "42\tcont/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv', " + f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a UInt32') select (42)", + ) + + res = node.query( + f"select a, dateDiff('minute', _time, now()) < 59 from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv'," + f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a UInt32')" + ) + + assert res == "42\t1\n" + + def test_read_from_not_existing_container(cluster): node = cluster.instances["node"] query = ( diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index cda2b8694c6..47d8f44c0b7 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -987,10 +987,10 @@ def test_read_subcolumns(started_cluster): assert res == "2\ttest_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" res = node.query( - f"select x.b.d, _path, x.b, _file, dateDiff('minute', _time, now()), x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "0\ttest_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\t0\n" + assert res == "0\ttest_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" @@ -999,6 +999,20 @@ def test_read_subcolumns(started_cluster): assert res == "42\ttest_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" + ) + + res = node.query( + f"select a, dateDiff('minute', _time, now()) < 59 from hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32')" + ) + + assert res == "42\t1\n" + + def test_union_schema_inference_mode(started_cluster): node = started_cluster.instances["node1"] diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 61c6d95f123..b2ebd12ce00 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1133,6 +1133,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1000000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1142,6 +1143,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert 
into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query( @@ -1169,6 +1171,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1178,6 +1181,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, b String')" @@ -2117,12 +2121,10 @@ def test_read_subcolumns(started_cluster): assert res == "0\troot/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = instance.query( - f"select x.b.d, _path, x.b, _file, dateDiff('minute', _time, now()), x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" + f"select x.b.d, _path, x.b, _file, x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" ) - assert ( - res == "42\troot/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t0\t42\n" - ) + assert res == "42\troot/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" res = instance.query( f"select a.b.d, _path, a.b, _file, a.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" @@ -2150,7 +2152,20 @@ def test_read_subcolumns(started_cluster): res == "42\t/root/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" ) - logging.info("Some custom logging") + +def test_read_subcolumn_time(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query( + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" + ) + + res = instance.query( + f"select a, dateDiff('minute', _time, now()) < 59 from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32')" + ) + + assert res == "42\t1\n" def test_filtering_by_file_or_path(started_cluster): diff --git a/tests/queries/0_stateless/00300_csv.reference b/tests/queries/0_stateless/00300_csv.reference index e7966a9e8d9..42cd22078c4 100644 --- a/tests/queries/0_stateless/00300_csv.reference +++ b/tests/queries/0_stateless/00300_csv.reference @@ -1,11 +1,11 @@ -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" "x","y","z","a","b" -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" "x","y","z","a","b" "String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String" -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" 
0,"0","[]","2000-01-01","2000-01-01 00:00:00" 1,"1","[0]","2000-01-02","2000-01-01 00:00:01" diff --git a/tests/queries/0_stateless/00309_formats.reference b/tests/queries/0_stateless/00309_formats.reference index 5c0ae4d2c3b..e637ee0363a 100644 Binary files a/tests/queries/0_stateless/00309_formats.reference and b/tests/queries/0_stateless/00309_formats.reference differ diff --git a/tests/queries/0_stateless/00732_base64_functions.sql b/tests/queries/0_stateless/00732_base64_functions.sql index 3c60bf939fe..b4be8db4ede 100644 --- a/tests/queries/0_stateless/00732_base64_functions.sql +++ b/tests/queries/0_stateless/00732_base64_functions.sql @@ -1,6 +1,5 @@ -- Tags: no-fasttest - -SET send_logs_level = 'fatal'; +-- no-fasttest because aklomp-base64 library is required SELECT base64Encode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64Decode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } diff --git a/tests/queries/0_stateless/01016_input_null_as_default.sh b/tests/queries/0_stateless/01016_input_null_as_default.sh index 8d6a9a07435..24d93b2703c 100755 --- a/tests/queries/0_stateless/01016_input_null_as_default.sh +++ b/tests/queries/0_stateless/01016_input_null_as_default.sh @@ -11,8 +11,8 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE default_by_other_column (a Float32 DEFA echo 'CSV' echo '\N, 1, \N, "2019-07-22", "[10, 20, 30]", \N -1, world, 3, "2019-07-23", \N, "('\''tuple'\'', 3.14)" -2, \N, 123, \N, "[]", "('\''test'\'', 2.71828)" +1, world, 3, "2019-07-23", \N, tuple, 3.14 +2, \N, 123, \N, "[]", test, 2.71828 3, \N, \N, \N, \N, \N' | $CLICKHOUSE_CLIENT --input_format_null_as_default=1 --query="INSERT INTO null_as_default FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM null_as_default ORDER BY i"; $CLICKHOUSE_CLIENT --query="TRUNCATE TABLE null_as_default"; diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference index 86a36a9392c..8a18c609ede 100644 --- a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference @@ -1,6 +1,3 @@ - Prewhere info - Prewhere filter - Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) @@ -8,8 +5,15 @@ Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference new file mode 100644 index 00000000000..20c58c33770 --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference @@ -0,0 +1 @@ +59900 1000 1396 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql new file mode 
100644 index 00000000000..fc18c97cb6e --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql @@ -0,0 +1,7 @@ +create table merge_kek_1 (x UInt32, y UInt32) engine = MergeTree order by x; +create table merge_kek_2 (x UInt32, y UInt32) engine = MergeTree order by x; + +insert into merge_kek_1 select number, number from numbers(100); +insert into merge_kek_2 select number + 500, number + 500 from numbers(1e6); + +select sum(x), min(x + x), max(x + x) from merge(currentDatabase(), '^merge_kek_.$') where x > 200 and y in (select 500 + number * 2 from numbers(100)) settings max_threads=2; diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference index 8ad0a566c62..1c60e40942c 100644 --- a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -97,8 +97,8 @@ c1 Array(Nullable(Bool)) [] [NULL] [false] -c1 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) -(1,2,3) +c1 Nullable(String) +(1, 2, 3) c1 Nullable(String) 123.123 c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql index cabcd230eb6..e9deb778075 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql @@ -3,7 +3,6 @@ SELECT name FROM system.functions WHERE NOT is_aggregate AND origin = 'System' AND alias_to = '' AND length(description) < 10 AND name NOT IN ( 'aes_decrypt_mysql', 'aes_encrypt_mysql', 'decrypt', 'encrypt', - 'base64Decode', 'base64Encode', 'tryBase64Decode', 'convertCharset', 'detectLanguage', 'detectLanguageMixed', 'geoToH3', diff --git a/tests/queries/0_stateless/02893_vertical_final_array_join.reference b/tests/queries/0_stateless/02893_vertical_final_bugs.reference similarity index 99% rename from tests/queries/0_stateless/02893_vertical_final_array_join.reference rename to tests/queries/0_stateless/02893_vertical_final_bugs.reference index 27b54a2e42e..ab23116aa5f 100644 --- a/tests/queries/0_stateless/02893_vertical_final_array_join.reference +++ b/tests/queries/0_stateless/02893_vertical_final_bugs.reference @@ -1,3 +1,4 @@ +1 2 b 1 -- { echo ON } SELECT arrayJoin([(k1, v), (k2, v)]) AS row, row.1 as k FROM t FINAL WHERE k1 != 3 AND k = 1 ORDER BY row SETTINGS enable_vertical_final = 0; (1,4) 1 diff --git a/tests/queries/0_stateless/02893_vertical_final_array_join.sql b/tests/queries/0_stateless/02893_vertical_final_bugs.sql similarity index 65% rename from tests/queries/0_stateless/02893_vertical_final_array_join.sql rename to tests/queries/0_stateless/02893_vertical_final_bugs.sql index cc2e37fdc6e..e82ab674c58 100644 --- a/tests/queries/0_stateless/02893_vertical_final_array_join.sql +++ b/tests/queries/0_stateless/02893_vertical_final_bugs.sql @@ -1,3 +1,15 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/64543 +DROP TABLE IF EXISTS foo; +DROP TABLE IF EXISTS bar; +CREATE TABLE foo (id UInt64, seq UInt64) ENGINE = Memory; +CREATE TABLE bar (id UInt64, seq UInt64, name String) ENGINE = ReplacingMergeTree ORDER BY id; +INSERT INTO foo VALUES (1, 1); +INSERT INTO bar VALUES (1, 1, 'a') (2, 2, 'b'); +INSERT INTO bar VALUES (1, 2, 'b') (2, 3, 'c'); +SELECT * FROM 
bar INNER JOIN foo USING id WHERE bar.seq > foo.seq SETTINGS final = 1; + +-- Same problem possible can happen with array join +DROP TABLE IF EXISTS t; CREATE TABLE t (k1 UInt64, k2 UInt64, v UInt64) ENGINE = ReplacingMergeTree() ORDER BY (k1, k2); SET optimize_on_insert = 0; INSERT INTO t VALUES (1, 2, 3) (1, 2, 4) (2, 3, 4), (2, 3, 5); diff --git a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference index 786a6b3bf25..7278018f1d6 100644 --- a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference +++ b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference @@ -7,6 +7,9 @@ Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) ReadFromMerge - ReadFromMergeTree (default.mt1) - ReadFromMergeTree (default.mt2) - ReadFromStorage (TinyLog) + Expression + ReadFromMergeTree (default.mt1) + Expression + ReadFromMergeTree (default.mt2) + Expression + ReadFromStorage (TinyLog) diff --git a/tests/queries/0_stateless/02922_server_exit_code.sh b/tests/queries/0_stateless/02922_server_exit_code.sh index 60049902410..ded0dc4763f 100755 --- a/tests/queries/0_stateless/02922_server_exit_code.sh +++ b/tests/queries/0_stateless/02922_server_exit_code.sh @@ -7,6 +7,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # We will check that the server's exit code corresponds to the exception code if it was terminated after exception. # In this example, we provide an invalid path to the server's config, ignore its logs and check the exit code. -# The exception code is 400 = CANNOT_STAT, so the exit code will be 400 % 256. +# The exception code is 76 = CANNOT_OPEN_FILE, so the exit code will be 76 % 256. -${CLICKHOUSE_SERVER_BINARY} -- --path /dev/null 2>/dev/null; [[ "$?" == "$((400 % 256))" ]] && echo 'Ok' || echo 'Fail' +${CLICKHOUSE_SERVER_BINARY} -- --path /dev/null 2>/dev/null; [[ "$?" == "$((76 % 256))" ]] && echo 'Ok' || echo 'Fail' diff --git a/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference new file mode 100644 index 00000000000..35ef86f5339 --- /dev/null +++ b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference @@ -0,0 +1 @@ +1 2 4 diff --git a/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh new file mode 100755 index 00000000000..d9e4a2c8f8b --- /dev/null +++ b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv +$CLICKHOUSE_LOCAL -nm -q " +create table test (x UInt64, y UInt32, size UInt64) engine=Memory; +insert into test select c1, c2, _size from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv') settings use_structure_from_insertion_table_in_table_functions=1; +select * from test; +" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv diff --git a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference b/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference deleted file mode 100644 index 93acdc34842..00000000000 --- a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference +++ /dev/null @@ -1 +0,0 @@ -1 2 4 1 1 diff --git a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh b/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh deleted file mode 100755 index ebdda0cc1d3..00000000000 --- a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. "$CURDIR"/../shell_config.sh - -echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv -sleep 1 -$CLICKHOUSE_LOCAL -nm -q " -create table test (x UInt64, y UInt32, size UInt64, d32 DateTime32, d64 DateTime64) engine=Memory; -insert into test select c1, c2, _size, _time, _time from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv') settings use_structure_from_insertion_table_in_table_functions=1; -select x, y, size, (dateDiff('millisecond', d32, now()) < 4000 AND dateDiff('millisecond', d32, now()) > 0), (dateDiff('second', d64, now()) < 4 AND dateDiff('second', d64, now()) > 0) from test; -" -rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference index 865db11defc..4b86be04996 100644 --- a/tests/queries/0_stateless/02969_auto_format_detection.reference +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -82,7 +82,8 @@ CSV c1 Nullable(UInt64) c2 Nullable(String) c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) +c4 Nullable(UInt64) +c5 Nullable(String) a Nullable(String) b Nullable(String) c Array(Nullable(String)) diff --git a/tests/queries/0_stateless/02977_csv_format_support_tuple.sql b/tests/queries/0_stateless/02977_csv_format_support_tuple.sql index d00cc00e097..f30e217ca0f 100644 --- a/tests/queries/0_stateless/02977_csv_format_support_tuple.sql +++ b/tests/queries/0_stateless/02977_csv_format_support_tuple.sql @@ -1,5 +1,9 @@ -- Tags: no-parallel +SET output_format_csv_serialize_tuple_into_separate_columns = false; +SET input_format_csv_deserialize_separate_columns_into_tuple = false; +SET input_format_csv_try_infer_strings_from_quoted_tuples = false; + insert into function file('02977_1.csv') select '20240305', 1, ['s', 'd'], map('a', 2), tuple('222', 33, map('abc', 5)) SETTINGS engine_file_truncate_on_insert=1; desc file('02977_1.csv'); select * from file('02977_1.csv') settings max_threads=1; diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.reference b/tests/queries/0_stateless/03155_analyzer_interpolate.reference index 791aaa5b2a2..eade3b45d26 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.reference +++ 
b/tests/queries/0_stateless/03155_analyzer_interpolate.reference @@ -11,3 +11,8 @@ 5 [5] 5.5 [5] 7 [7] +2 +100500 +18 +26 +34 diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.sql b/tests/queries/0_stateless/03155_analyzer_interpolate.sql index b3c1d233f47..30423cb86ff 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.sql +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.sql @@ -10,3 +10,6 @@ SELECT n, number+5 AS inter FROM ( -- { serverError NOT_AN_AGGREGATE } SELECT toFloat32(number % 10) AS n, number, number*2 AS mn FROM numbers(10) WHERE number % 3 = 1 ) GROUP BY n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS mn * 2); + +-- https://github.com/ClickHouse/ClickHouse/issues/64636 +select sum(number) as s from remote('127.0.0.{1,2}', numbers(10)) where (intDiv(number, 2) as key) != 1 group by key order by key with fill interpolate (s as 100500); diff --git a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference new file mode 100644 index 00000000000..4cd7f2cb141 --- /dev/null +++ b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference @@ -0,0 +1,8 @@ +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever diff --git a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql new file mode 100644 index 00000000000..97651d1b0fd --- /dev/null +++ b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql @@ -0,0 +1,41 @@ +-- Tags: distributed + +DROP TABLE IF EXISTS ids; +DROP TABLE IF EXISTS data; +DROP TABLE IF EXISTS data2; + +CREATE TABLE ids (id UUID, whatever String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO ids VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', 'whatever'); + +CREATE TABLE data (id UUID, event_time DateTime, status String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO data VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', '2000-01-01', 'CREATED'); + +CREATE TABLE data2 (id UUID, event_time DateTime, status String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO data2 VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', '2000-01-02', 'CREATED'); + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN merge(currentDatabase(), 'data*') AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN clusterAllReplicas(test_cluster_two_shards, merge(currentDatabase(), 'data*')) AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN view(SELECT * FROM merge(currentDatabase(), 'data*')) AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; diff --git a/tests/queries/0_stateless/03167_base64_url_functions.reference b/tests/queries/0_stateless/03167_base64_url_functions.reference new file mode 100644 index 00000000000..2a0d0013609 --- /dev/null +++ b/tests/queries/0_stateless/03167_base64_url_functions.reference @@ -0,0 +1,10 @@ +https://clickhouse.com aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ https://clickhouse.com 
https://clickhouse.com +12? MTI_ 12? 12? +https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGU https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode +aHR0cHM6Ly9jbGlj https://clic https://clic +aHR0cHM6Ly9jbGlja2g https://clickh https://clickh +aHR0cHM6Ly9jbGljaw https://click https://click + + + +https://clickhouse.com aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ https://clickhouse.com https://clickhouse.com diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql new file mode 100644 index 00000000000..674f1ae498b --- /dev/null +++ b/tests/queries/0_stateless/03167_base64_url_functions.sql @@ -0,0 +1,36 @@ +-- Tags: no-fasttest +-- no-fasttest because aklomp-base64 library is required + +-- incorrect number of arguments +SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT tryBase64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlEncode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT tryBase64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +-- test with valid inputs + +SELECT 'https://clickhouse.com' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +SELECT '12?' 
AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); + +-- encoded value has no padding +SELECT 'aHR0cHM6Ly9jbGlj' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +-- encoded value has one-byte padding +SELECT 'aHR0cHM6Ly9jbGlja2g' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +-- encoded value has two-bytes padding +SELECT 'aHR0cHM6Ly9jbGljaw' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); + +-- test with invalid inputs + +SELECT base64UrlDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('https://clickhouse.com'); +SELECT base64UrlDecode('12?'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('12?'); +SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja'); + +-- test FixedString argument + +SELECT toFixedString('https://clickhouse.com', 22) AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); diff --git a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql new file mode 100644 index 00000000000..8463d13d251 --- /dev/null +++ b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; + +CREATE TABLE complex_key_simple_attributes_source_short_circuit_table +( + id UInt64, + id_key String, + value_first String, + value_second String +) + ENGINE = TinyLog; + +INSERT INTO complex_key_simple_attributes_source_short_circuit_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); + +CREATE DICTIONARY cache_dictionary_complex_key_simple_attributes_short_circuit +( + `id` UInt64, + `id_key` String, + `value_first` String DEFAULT 'value_first_default', + `value_second` String DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(TABLE 'complex_key_simple_attributes_source_short_circuit_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 10)); + +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; + +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; diff --git a/tests/queries/0_stateless/03169_modify_column_data_loss.reference 
b/tests/queries/0_stateless/03169_modify_column_data_loss.reference new file mode 100644 index 00000000000..2126a658c16 --- /dev/null +++ b/tests/queries/0_stateless/03169_modify_column_data_loss.reference @@ -0,0 +1,4 @@ +1 one 0 +2 two 0 +3 \N 0 +1 one 1 0 diff --git a/tests/queries/0_stateless/03169_modify_column_data_loss.sql b/tests/queries/0_stateless/03169_modify_column_data_loss.sql new file mode 100644 index 00000000000..def0a25a1b4 --- /dev/null +++ b/tests/queries/0_stateless/03169_modify_column_data_loss.sql @@ -0,0 +1,19 @@ +DROP TABLE IF EXISTS column_modify_test; + +CREATE TABLE column_modify_test (id UInt64, val String, other_col UInt64) engine=MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part=0; +INSERT INTO column_modify_test VALUES (1,'one',0); +INSERT INTO column_modify_test VALUES (2,'two',0); + +-- on 21.9 that was done via mutations mechanism +ALTER TABLE column_modify_test MODIFY COLUMN val Nullable(String); + +INSERT INTO column_modify_test VALUES (3,Null,0); + +-- till now everythings looks ok +SELECT * FROM column_modify_test order by id, val, other_col; + +-- Now we do mutation. It will affect one of the parts, and will update columns.txt to the latest / correct state w/o updating the column file! +alter table column_modify_test update other_col=1 where id = 1 SETTINGS mutations_sync=1; + +-- row 1 is damaged now the column file & columns.txt is out of sync! +SELECT *, throwIf(val <> 'one') as issue FROM column_modify_test WHERE id = 1; diff --git a/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference new file mode 100644 index 00000000000..e58e9764b39 --- /dev/null +++ b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference @@ -0,0 +1,2 @@ +100 +100 diff --git a/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql new file mode 100644 index 00000000000..50d99b851a6 --- /dev/null +++ b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql @@ -0,0 +1,21 @@ +SELECT sum(u) +FROM +( + SELECT + intDiv(number, 4096) AS k, + uniqCombined(tuple(materialize(toLowCardinality(toNullable(16))))) AS u + FROM numbers(4096 * 100) + GROUP BY k +) +SETTINGS allow_experimental_analyzer = 1, optimize_injective_functions_inside_uniq=0; + +SELECT sum(u) +FROM +( + SELECT + intDiv(number, 4096) AS k, + uniqCombined(tuple(materialize(toLowCardinality(toNullable(16))))) AS u + FROM numbers(4096 * 100) + GROUP BY k +) +SETTINGS allow_experimental_analyzer = 1, optimize_injective_functions_inside_uniq=1; diff --git a/tests/queries/0_stateless/03169_time_virtual_column.reference b/tests/queries/0_stateless/03169_time_virtual_column.reference new file mode 100644 index 00000000000..4482956b706 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.reference @@ -0,0 +1 @@ +4 1 diff --git a/tests/queries/0_stateless/03169_time_virtual_column.sh b/tests/queries/0_stateless/03169_time_virtual_column.sh new file mode 100755 index 00000000000..fef1de8c6f2 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv +sleep 1 +$CLICKHOUSE_LOCAL -nm -q " +select _size, (dateDiff('millisecond', _time, now()) < 600000 AND dateDiff('millisecond', _time, now()) > 0) from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv'); +" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference new file mode 100644 index 00000000000..9ee16da8728 --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference @@ -0,0 +1,15 @@ +Int64 +x Nullable(Int64) +x Nullable(Int64) +x Nullable(Int64) +Float64 +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +Float64.explicit File +x Nullable(Float64) +Float64.pipe +x Nullable(Float64) +Float64.default max_read_buffer_size +x Nullable(Float64) diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh new file mode 100755 index 00000000000..88f9bfad7ed --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# do not fallback to float always +echo "Int64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}' + +echo "Float64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}' + +# this is requried due to previously clickhouse-local does not interprets +# --max_read_buffer_size for fds [1] +# +# [1]: https://github.com/ClickHouse/ClickHouse/pull/64532 +echo "Float64.explicit File" +tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX") +trap 'rm -f $tmp_path' EXIT +cat > "$tmp_path" <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path" + +echo "Float64.pipe" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' +echo "Float64.default max_read_buffer_size" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc "table"' diff --git a/tests/queries/0_stateless/03170_part_offset_as_table_column.reference b/tests/queries/0_stateless/03170_part_offset_as_table_column.reference new file mode 100644 index 00000000000..435187cb39b --- /dev/null +++ b/tests/queries/0_stateless/03170_part_offset_as_table_column.reference @@ -0,0 +1,30 @@ 
+0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 diff --git a/tests/queries/0_stateless/03170_part_offset_as_table_column.sql b/tests/queries/0_stateless/03170_part_offset_as_table_column.sql new file mode 100644 index 00000000000..36cbc156744 --- /dev/null +++ b/tests/queries/0_stateless/03170_part_offset_as_table_column.sql @@ -0,0 +1,25 @@ +CREATE TABLE test_table +( + `key` UInt32, + `_part_offset` DEFAULT 0 +) +ENGINE = MergeTree +ORDER BY key; + +INSERT INTO test_table (key) SELECT number +FROM numbers(10); + +set allow_experimental_analyzer=0; + +SELECT * +FROM test_table; + +set allow_experimental_analyzer=1; + +SELECT * +FROM test_table; + +SELECT + key, + _part_offset +FROM test_table; diff --git a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference new file mode 100644 index 00000000000..a2ac115060f --- /dev/null +++ b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference @@ -0,0 +1,6 @@ +100 1 1 +300 3 0 +200 2 2 +100 1 1 +300 3 0 +200 2 2 diff --git a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql new file mode 100644 index 00000000000..e1b5531a442 --- /dev/null +++ b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql @@ -0,0 +1,30 @@ +-- Tags: no-parallel + +CREATE TABLE x ( hash_id UInt64, user_result Decimal(3, 2) ) ENGINE = Memory(); + +CREATE TABLE y ( hash_id UInt64, user_result DECIMAL(18, 6) ) ENGINE = Memory(); + +INSERT INTO x values (100, 1), (200, 2); +INSERT INTO y values (100, 1), (300, 3), (200, 2); + +CREATE DICTIONARY d1 (hash_id UInt64, user_result Decimal(3, 2) ) +PRIMARY KEY hash_id +SOURCE(CLICKHOUSE(TABLE 'x')) +LIFETIME(0) +LAYOUT(HASHED()); + +SELECT hash_id, + dictGetOrDefault(d1, 'user_result', toUInt64(hash_id), toFloat64(user_result)), + dictGet(d1, 'user_result', toUInt64(hash_id)) +FROM y; + +CREATE DICTIONARY d2 (hash_id UInt64, user_result Decimal(3, 2) ) +PRIMARY KEY hash_id +SOURCE(CLICKHOUSE(TABLE 'x')) +LIFETIME(0) +LAYOUT(HASHED_ARRAY()); + +SELECT hash_id, + dictGetOrDefault(d2, 'user_result', toUInt64(hash_id), toFloat64(user_result)), + dictGet(d2, 'user_result', toUInt64(hash_id)) +FROM y; diff --git a/utils/changelog/changelog.py b/utils/changelog/changelog.py index 314461a6b3a..b79e4139bcc 100755 --- a/utils/changelog/changelog.py +++ b/utils/changelog/changelog.py @@ -1,427 +1,15 @@ #!/usr/bin/env python3 # In our CI this script runs in style-test containers -import argparse -import logging -import os -import os.path as p -import re -from datetime import date, timedelta -from subprocess import DEVNULL, CalledProcessError -from typing import Dict, List, Optional, TextIO +# The main script is moved to tests/ci/changelog.py +# It depends on the ci scripts too hard to keep it here +# Here's only a wrapper around it for the people who used to it -from github.GithubException import RateLimitExceededException, UnknownObjectException -from github.NamedUser import NamedUser -from thefuzz.fuzz import ratio # type: ignore - -from git_helper import git_runner as runner -from git_helper import is_shallow -from github_helper import GitHub, PullRequest, PullRequests, Repository - -# This array gives the preferred category order, and is also used to -# normalize category names. 
-# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there -# updated accordingly -categories_preferred_order = ( - "Backward Incompatible Change", - "New Feature", - "Performance Improvement", - "Improvement", - "Critical Bug Fix", - "Bug Fix", - "Build/Testing/Packaging Improvement", - "Other", -) - -FROM_REF = "" -TO_REF = "" -SHA_IN_CHANGELOG = [] # type: List[str] -gh = GitHub(create_cache_dir=False) -CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") - - -class Description: - def __init__( - self, number: int, user: NamedUser, html_url: str, entry: str, category: str - ): - self.number = number - self.html_url = html_url - self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore - self.entry = entry - self.category = category - - @property - def formatted_entry(self) -> str: - # Substitute issue links. - # 1) issue number w/o markdown link - entry = re.sub( - r"([^[])#([0-9]{4,})", - r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)", - self.entry, - ) - # 2) issue URL w/o markdown link - # including #issuecomment-1 or #event-12 - entry = re.sub( - r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)", - r"\1[#\3](\2)", - entry, - ) - # It's possible that we face a secondary rate limit. - # In this case we should sleep until we get it - while True: - try: - user_name = self.user.name if self.user.name else self.user.login - break - except UnknownObjectException: - user_name = self.user.login - break - except RateLimitExceededException: - gh.sleep_on_rate_limit() - return ( - f"* {entry} [#{self.number}]({self.html_url}) " - f"([{user_name}]({self.user.html_url}))." - ) - - # Sort PR descriptions by numbers - def __eq__(self, other) -> bool: - if not isinstance(self, type(other)): - return NotImplemented - return self.number == other.number - - def __lt__(self, other: "Description") -> bool: - return self.number < other.number - - -def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: - descriptions = {} # type: Dict[str, List[Description]] - repos = {} # type: Dict[str, Repository] - for pr in prs: - # See https://github.com/PyGithub/PyGithub/issues/2202, - # obj._rawData doesn't spend additional API requests - # We'll save some requests - # pylint: disable=protected-access - repo_name = pr._rawData["base"]["repo"]["full_name"] - # pylint: enable=protected-access - if repo_name not in repos: - repos[repo_name] = pr.base.repo - in_changelog = False - merge_commit = pr.merge_commit_sha - if merge_commit is None: - logging.warning("PR %s does not have merge-commit, skipping", pr.number) - continue - - in_changelog = merge_commit in SHA_IN_CHANGELOG - if in_changelog: - desc = generate_description(pr, repos[repo_name]) - if desc: - if desc.category not in descriptions: - descriptions[desc.category] = [] - descriptions[desc.category].append(desc) - - for descs in descriptions.values(): - descs.sort() - - return descriptions - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Generate a changelog in Markdown format between given tags. 
" - "It fetches all tags and unshallow the git repository automatically", - ) - parser.add_argument( - "-v", - "--verbose", - action="count", - default=0, - help="set the script verbosity, could be used multiple", - ) - parser.add_argument( - "--debug-helpers", - action="store_true", - help="add debug logging for git_helper and github_helper", - ) - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="-", - help="output file for changelog", - ) - parser.add_argument( - "--repo", - default="ClickHouse/ClickHouse", - help="a repository to query for pull-requests from GitHub", - ) - parser.add_argument( - "--jobs", - type=int, - default=10, - help="number of jobs to get pull-requests info from GitHub API", - ) - parser.add_argument( - "--gh-user-or-token", - help="user name or GH token to authenticate", - ) - parser.add_argument( - "--gh-password", - help="a password that should be used when user is given", - ) - parser.add_argument( - "--with-testing-tags", - action="store_true", - help="by default '*-testing' tags are ignored, this argument enables them too", - ) - parser.add_argument( - "--from", - dest="from_ref", - help="git ref for a starting point of changelog, by default is calculated " - "automatically to match a previous tag in history", - ) - parser.add_argument( - "to_ref", - metavar="TO_REF", - help="git ref for the changelog end", - ) - args = parser.parse_args() - return args - - -# This function mirrors the PR description checks in ClickhousePullRequestTrigger. -# Returns None if the PR should not be mentioned in changelog. -def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: - backport_number = item.number - if item.head.ref.startswith("backport/"): - branch_parts = item.head.ref.split("/") - if len(branch_parts) == 3: - try: - item = gh.get_pull_cached(repo, int(branch_parts[-1])) - except Exception as e: - logging.warning("unable to get backpoted PR, exception: %s", e) - else: - logging.warning( - "The branch %s doesn't match backport template, using PR %s as is", - item.head.ref, - item.number, - ) - description = item.body - # Don't skip empty lines because they delimit parts of description - lines = [x.strip() for x in (description.split("\n") if description else [])] - lines = [re.sub(r"\s+", " ", ln) for ln in lines] - - category = "" - entry = "" - - if lines: - i = 0 - while i < len(lines): - if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): - i += 1 - if i >= len(lines): - break - # Can have one empty line between header and the category itself. - # Filter it out. - if not lines[i]: - i += 1 - if i >= len(lines): - break - category = re.sub(r"^[-*\s]*", "", lines[i]) - i += 1 - elif re.match( - r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] - ): - i += 1 - # Can have one empty line between header and the entry itself. - # Filter it out. - if i < len(lines) and not lines[i]: - i += 1 - # All following lines until empty one are the changelog entry. - entry_lines = [] - while i < len(lines) and lines[i]: - entry_lines.append(lines[i]) - i += 1 - entry = " ".join(entry_lines) - else: - i += 1 - - # Remove excessive bullets from the entry. - if re.match(r"^[\-\*] ", entry): - entry = entry[2:] - - # Better style. - if re.match(r"^[a-z]", entry): - entry = entry.capitalize() - - if not category: - # Shouldn't happen, because description check in CI should catch such PRs. - # Fall through, so that it shows up in output and the user can fix it. 
- category = "NO CL CATEGORY" - - # Filter out the PR categories that are not for changelog. - if re.match( - r"(?i)((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", - category, - ): - category = "NOT FOR CHANGELOG / INSIGNIFICANT" - return Description(item.number, item.user, item.html_url, item.title, category) - - # Normalize bug fixes - if re.match( - r"(?i)bug\Wfix", - category, - ): - category = "Bug Fix (user-visible misbehavior in an official stable release)" - - # Filter out documentations changelog - if re.match( - r"(?i)doc", - category, - ): - return None - - if backport_number != item.number: - entry = f"Backported in #{backport_number}: {entry}" - - if not entry: - # Shouldn't happen, because description check in CI should catch such PRs. - category = "NO CL ENTRY" - entry = "NO CL ENTRY: '" + item.title + "'" - - entry = entry.strip() - if entry[-1] != ".": - entry += "." - - for c in categories_preferred_order: - if ratio(category.lower(), c.lower()) >= 90: - category = c - break - - return Description(item.number, item.user, item.html_url, entry, category) - - -def write_changelog( - fd: TextIO, descriptions: Dict[str, List[Description]], year: int -) -> None: - to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] - from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] - fd.write( - f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" - f"# {year} Changelog\n\n" - f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " - f"as compared to {FROM_REF} ({from_commit})\n\n" - ) - - seen_categories = [] # type: List[str] - for category in categories_preferred_order: - if category in descriptions: - seen_categories.append(category) - fd.write(f"#### {category}\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - for category in sorted(descriptions): - if category not in seen_categories: - fd.write(f"#### {category}\n\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - -def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool): - global FROM_REF, TO_REF - TO_REF = to_ref - - # Check TO_REF - runner.run(f"git rev-parse {TO_REF}") - - # Check from_ref - if from_ref is None: - # Get all tags pointing to TO_REF - tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") - logging.info("All tags pointing to %s:\n%s", TO_REF, tags) - if not with_testing_tags: - tags.append("*-testing") - exclude = " ".join([f"--exclude='{tag}'" for tag in tags]) - cmd = f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'" - FROM_REF = runner.run(cmd) - else: - runner.run(f"git rev-parse {FROM_REF}") - FROM_REF = from_ref - - -def set_sha_in_changelog(): - global SHA_IN_CHANGELOG - SHA_IN_CHANGELOG = runner.run( - f"git log --format=format:%H {FROM_REF}..{TO_REF}" - ).split("\n") - - -def get_year(prs: PullRequests) -> int: - if not prs: - return date.today().year - return max(pr.created_at.year for pr in prs) - - -def main(): - log_levels = [logging.WARN, logging.INFO, logging.DEBUG] - args = parse_args() - logging.basicConfig( - format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", - level=log_levels[min(args.verbose, 2)], - ) - if args.debug_helpers: - logging.getLogger("github_helper").setLevel(logging.DEBUG) - logging.getLogger("git_helper").setLevel(logging.DEBUG) - # Create a cache directory - if not p.isdir(CACHE_PATH): - os.mkdir(CACHE_PATH, 0o700) - - # Get the full repo - if is_shallow(): - 
logging.info("Unshallow repository") - runner.run("git fetch --unshallow", stderr=DEVNULL) - logging.info("Fetching all tags") - runner.run("git fetch --tags", stderr=DEVNULL) - - check_refs(args.from_ref, args.to_ref, args.with_testing_tags) - set_sha_in_changelog() - - logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF) - - # use merge-base commit as a starting point, if used ref in another branch - base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") - # Get starting and ending dates for gathering PRs - # Add one day after and before to mitigate TZ possible issues - # `tag^{}` format gives commit ref when we have annotated tags - # format %cs gives a committer date, works better for cherry-picked commits - from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") - to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") - merged = ( - date.fromisoformat(from_date) - timedelta(1), - date.fromisoformat(to_date) + timedelta(1), - ) - - # Get all PRs for the given time frame - global gh - gh = GitHub( - args.gh_user_or_token, - args.gh_password, - create_cache_dir=False, - per_page=100, - pool_size=args.jobs, - ) - gh.cache_path = CACHE_PATH - query = f"type:pr repo:{args.repo} is:merged" - prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created") - - descriptions = get_descriptions(prs) - changelog_year = get_year(prs) - - write_changelog(args.output, descriptions, changelog_year) +import subprocess +import sys +from pathlib import Path +SCRIPT_PATH = (Path(__file__).parents[2] / "tests/ci/changelog.py").absolute() if __name__ == "__main__": - main() + subprocess.check_call(["python3", SCRIPT_PATH, *sys.argv[1:]]) diff --git a/utils/changelog/git_helper.py b/utils/changelog/git_helper.py deleted file mode 120000 index 03b05a7eddd..00000000000 --- a/utils/changelog/git_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/git_helper.py \ No newline at end of file diff --git a/utils/changelog/github_helper.py b/utils/changelog/github_helper.py deleted file mode 120000 index 2d44dfe8000..00000000000 --- a/utils/changelog/github_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/github_helper.py \ No newline at end of file diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 84682689934..64ff3e8e2cb 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1011,6 +1011,8 @@ Updatable Uppercased Uptime Uptrace +UrlDecode +UrlEncode UserID Util VARCHAR diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 5c05907e9dd..7f25ca4607c 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -65,6 +65,7 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::increment ProfileEvents::incrementForLogMessage ProfileEvents::getName + ProfileEvents::Timer ProfileEvents::Type ProfileEvents::TypeEnum ProfileEvents::dumpToMapColumn @@ -240,8 +241,22 @@ for test_case in "${tests_with_replicated_merge_tree[@]}"; do esac done -# All the submodules should be from https://github.com/ -find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | grep -v -F 'https://github.com/' && echo 'All the submodules should be from https://github.com/'; done +# All submodules should be from https://github.com/ +git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.url' | \ +while read -r line; do + name=${line#submodule.}; 
+    url=${line#* }
+    [[ "$url" != 'https://github.com/'* ]] && echo "All submodules should be from https://github.com/, submodule '$name' has '$url'"
+done
+
+# All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name should match its path)
+# - restrict the check to the top-level .gitmodules file
+git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.path' | \
+while read -r line; do
+    name=${line#submodule.}; name=${name%.path*}
+    path=${line#* }
+    [ "$name" != "$path" ] && echo "Submodule name '$name' is not equal to its path '$path'"
+done
 
 # There shouldn't be any code snippets under GPL or LGPL
 find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL"
@@ -309,6 +324,7 @@ std_cerr_cout_excludes=(
   src/Bridge/IBridge.cpp
   src/Daemon/BaseDaemon.cpp
   src/Loggers/Loggers.cpp
+  src/Common/GWPAsan.cpp
 )
 sources_with_std_cerr_cout=( $(
   find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \
diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 2f96daf4887..719c25bdc95 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -1,6 +1,9 @@
+v24.5.3.5-stable 2024-06-13
+v24.5.2.34-stable 2024-06-13
 v24.5.1.1763-stable 2024-06-01
 v24.4.2.141-stable 2024-06-07
 v24.4.1.2088-stable 2024-05-01
+v24.3.4.147-lts 2024-06-13
 v24.3.3.102-lts 2024-05-01
 v24.3.2.23-lts 2024-04-03
 v24.3.1.2672-lts 2024-03-27
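
As a quick illustration of the new check-style submodule checks added above (a minimal sketch, not part of the patch; the 'contrib/libxyz' name is the placeholder already used in the comment, and the URL is invented), each 'git config --get-regexp' line has the form 'submodule.<name>.path <path>', and the parameter expansions strip the 'submodule.' prefix and the '.path' suffix before comparing name and path:

# Hypothetical, well-formed entry used only for illustration:
#   [submodule "contrib/libxyz"]
#       path = contrib/libxyz
#       url = https://github.com/example/libxyz
line='submodule.contrib/libxyz.path contrib/libxyz'
name=${line#submodule.}; name=${name%.path*}   # -> contrib/libxyz
path=${line#* }                                # -> contrib/libxyz
[ "$name" != "$path" ] && echo "mismatch"      # prints nothing: name matches path, so the check passes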